In [46]:
import pandas as pd
import time
import nltk

In [114]:
class nlp:
    #need to download from nltk - 'averaged_perceptron_tagger','wordnet','stopwords'
    re = __import__('re')
    pd = __import__('pandas')
    nltk = __import__('nltk')
    
    def __init__(self):
        print('nlp V.0.1 \nImported pandas,re packages')
        
    def clean_text_for_topic_modelling(self,text_column):
        text_column=text_column.str.lower()
        text_column = [self.re.sub(r'https?:\/\/*[^ ]*', '', x) for x in text_column]
        text_column = [self.re.sub(r'[.,;/]',' ', x) for x in text_column]
        text_column = [self.re.sub(r'\((cont)\)','', x) for x in text_column]
        text_column = [self.re.sub(r'[^A-Za-z0-9$% ]','', x) for x in text_column]
        text_column = [x.split() for x in text_column]
        temp_corpus=[]
        for tweet in text_column:
            new_list=[w for w in tweet if len(w)>2]
            temp_corpus.append(new_list)
        text_column=self.pd.Series(temp_corpus)
        return text_column
    
    def remove_stopwords_from_corpus(self,text_column,extra_stopwords_list=[]):
        from nltk.corpus import stopwords
        stops = set(stopwords.words('english')).union(extra_stopwords_list)
        text_column=text_column.apply(lambda x: [w for w in x if w not in stops])
        return text_column
    
    def stemm_corpus(self,text_column,stemmer='porter'):
        if stemmer=='porter':
            from nltk.stem import PorterStemmer
            porter = PorterStemmer()
            text_column=text_column.apply(lambda x:[porter.stem(w) for w in x])
        else:
            from nltk.stem import LancasterStemmer
            lancaster=LancasterStemmer()
            text_column=text_column.apply(lambda x:[lancaster.stem(w) for w in x])
        return text_column
    
    def lemmatize_corpus(self,text_column):
        from nltk.stem import WordNetLemmatizer
        from nltk.corpus import wordnet
        
        def get_word_pos(word):
            tag = self.nltk.pos_tag(word)[0][1][0].upper()
            tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
            return tag_dict.get(tag, wordnet.NOUN)
        
        wordnet_lemmatizer = WordNetLemmatizer()
        text_column=text_column.apply(lambda x:[wordnet_lemmatizer.lemmatize(w, pos=get_word_pos(w)) for w in x])
        return text_column
    

In [5]:
df=pd.read_csv('trump_tweets_reducted.csv',parse_dates=['created_at_utc'])

In [115]:
nlpob=nlp()

nlp V.0.1 
Imported pandas,re packages


In [116]:
corpus=nlpob.clean_text_for_topic_modelling(df['text'])

In [117]:
corpus=nlpob.remove_stopwords_from_corpus(corpus,['realdonaldtrump','amp'])

In [43]:
corpus=nlpob.stemm_corpus(corpus,stemmer='lancaster')

In [118]:
corpus=nlpob.lemmatize_corpus(corpus)

In [121]:
from nltk.util import ngrams

In [123]:
for ngram in ngrams(corpus[17000], 2):
    print(ngram)

('message', 'great')
('great', 'people')
('people', 'new')
('new', 'hampshire')
('hampshire', 'important')
('important', 'day')
('day', 'votetrumpnh')
('votetrumpnh', 'video')


In [119]:
compare=pd.DataFrame({'corpus':corpus,'real':df['text']})

In [120]:
for i,row in compare[17000:].iterrows():
    print(row['corpus'])
    print('\n')
    print(row['real'])
    time.sleep(2)

['message', 'great', 'people', 'new', 'hampshire', 'important', 'day', 'votetrumpnh', 'video']


A message to the great people of New Hampshire on this important day! #VoteTrumpNH Video: https://t.co/7S2f2GSMB8 https://t.co/9WwRWnvqiv
['new', 'hampshire', 'vote', 'today', 'make', 'america', 'great']


New Hampshire vote today -  MAKE AMERICA GREAT AGAIN!
['worthless', 'nydailynews', 'dopey', 'mort', 'zuckerman', 'desperately', 'trying', 'sell', 'buyer', 'liability', 'massive']


Worthless @NYDailyNews which dopey Mort Zuckerman is desperately trying to sell has no buyer! Liabilities are massive!
['like', 'worthless', 'nydailynews', 'look', 'like', 'politico', 'going', 'business', 'bad', 'reporting', 'money', 'cred']


Like the worthless @NYDailyNews looks like @politico will be going out of business. Bad reporting- no money no cred!
['great', 'experience', 'new', 'hampshire', 'amazing', 'people', 'leaving', 'big', 'event', 'south', 'carolina', 'today']


Such a great experience in New 

['wino911', 'trump2016', 'know', 'better', 'trust', 'rnc', 'cute']


@wino911:  #Trump2016 We know better than to trust the RNC https://t.co/EMcgtK25fE   So cute!
['thtswhtkelsaid', 'fed', 'playing', 'card', 'monty', 'tax', 'time', 'people', 'play', 'trump', 'card', 'take', 'back', 'country', '02132016', '192239', '1972', '5867', 'false', '698588112994353152twitter', 'android', 'brandonsawyer84', 'rule', 'southcarolinaprimary']


@ThtsWhtKelSaid: I'm fed up w/ DC playing 3 card Monty w/ our tax $'s. It's time we the people play the Trump card &amp; take back our country,02-13-2016 19:22:39,1972,5867,false,698588112994353152
Twitter for Android,@BrandonSawyer84: @realDonaldTrump will rule #SouthCarolinaPrimary! https://t.co/y0YNO1eyft"
['jeffpaine', 'crushed', 'debate', 'thank', '02142016', '115254', '1416', '5479', 'false', '698837314894766080twitter', 'iphone', 'gopdebate', 'googletrends', '063946', '4164', '10739', 'false', '698758514559340544twitter', 'iphone', 'hey', 'glennbeck', '

['new', 'south', 'carolina', 'poll', 'ppp', 'thank', 'votetrumpsc']


New South Carolina poll from PPP. Thank you! #VoteTrumpSC https://t.co/6618Oi6vle
['spent', 'full', 'day', 'meeting', 'major', 'rally', 'yesterday', 'south', 'carolina', 'great', 'people', 'spirit', 'today']


Spent the full day at meetings and a major rally yesterday in South Carolina. Great people and spirit. Today will be more of the same.
['new', 'ppp', 'poll', 'trump', 'big', 'cruz', 'rubio', 'bush', 'debate', 'result', 'even', 'stacked', 'rnc', 'audience', 'wonderful']


New PPP Poll just out - Trump up big Cruz Rubio and Bush down. The debate results even with a stacked RNC audience were wonderful!
['thank', 'makeamericagreatagain']


Thank you! #MakeAmericaGreatAgain https://t.co/tIBB3BMUvp
['sit', 'great', 'interview', 'phussionwyff', 'greenville', 'today', 'watch', '5pm', 'amazing', 'day', 'south', 'carolina', 'votetrumpsc']


Just sat down for a great interview with @PHussionWYFF in Greenville today. Watch

KeyboardInterrupt: 