In [1]:
import pandas as pd
import time
import nltk

In [2]:
class nlp:
    #need to download from nltk - 'averaged_perceptron_tagger','wordnet','stopwords'
    re = __import__('re')
    pd = __import__('pandas')
    nltk = __import__('nltk')
    
    def __init__(self):
        print('nlp V.0.1 \nImported pandas,re,nltk packages')
        
    def clean_text_for_topic_modelling(self,text_column):
        text_column=text_column.str.lower()
        text_column = [self.re.sub(r'https?:\/\/*[^ ]*', '', x) for x in text_column]
        text_column = [self.re.sub(r'[.,;/]',' ', x) for x in text_column]
        text_column = [self.re.sub(r'\((cont)\)','', x) for x in text_column]
        text_column = [self.re.sub(r'[^A-Za-z0-9$% ]','', x) for x in text_column]
        text_column = [x.split() for x in text_column]
        temp_corpus=[]
        for tweet in text_column:
            new_list=[w for w in tweet if len(w)>2]
            temp_corpus.append(new_list)
        text_column=self.pd.Series(temp_corpus)
        return text_column
    
    def remove_stopwords_from_corpus(self,text_column,extra_stopwords_list=[]):
        from nltk.corpus import stopwords
        stops = set(stopwords.words('english')).union(extra_stopwords_list)
        text_column=text_column.apply(lambda x: [w for w in x if w not in stops])
        return text_column
    
    def stemm_corpus(self,text_column,stemmer='porter'):
        if stemmer=='porter':
            from nltk.stem import PorterStemmer
            porter = PorterStemmer()
            text_column=text_column.apply(lambda x:[porter.stem(w) for w in x])
        elif stemmer=='lancaster':
            from nltk.stem import LancasterStemmer
            lancaster=LancasterStemmer()
            text_column=text_column.apply(lambda x:[lancaster.stem(w) for w in x])
        else:
            from nltk.stem import SnowballStemmer
            snowball=SnowballStemmer("english")
            text_column=text_column.apply(lambda x:[snowball.stem(w) for w in x])
        return text_column
    
    def lemmatize_corpus(self,text_column):
        from nltk.stem import WordNetLemmatizer
        from nltk.corpus import wordnet
        
        def get_word_pos(word):
            tag = self.nltk.pos_tag(word)[0][1][0].upper()
            tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
            return tag_dict.get(tag, wordnet.NOUN)
        
        wordnet_lemmatizer = WordNetLemmatizer()
        text_column=text_column.apply(lambda x:[wordnet_lemmatizer.lemmatize(w, pos=get_word_pos(w)) for w in x])
        return text_column
    
    def create_ngrams(self,text_column,replace=False,num_grams=2):
        from nltk import ngrams
        if replace:
            grams_list=[]
            for tweet in text_column:
                grams_list.append([' '.join(ngram) for ngram in ngrams(tweet,num_grams)])
            return self.pd.Series(grams_list)
        else:
            for i,tweet in text_column.iteritems():
                copy_tweet=tweet.copy()
                for ngram in ngrams(tweet,2):
                    copy_tweet.append(' '.join(ngram))
                text_column.at[i]=copy_tweet
            return text_column
        
    def gensim_dic(self,text_column,filter_extremes=True,no_below=3,no_above=0.99):
        from gensim.corpora import Dictionary
        dictionary=Dictionary(corpus)
        if filter_extremes:
            dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        return dictionary
    
    def bow_corpus(self,dictionary,text_column):
        return [dictionary.doc2bow(doc) for doc in text_column]
    
    def vec_to_tfidf(self,bow_corpus):
        from gensim import models
        tfidf = models.TfidfModel(bow_corpus)
        return tfidf[bow_corpus]
    
    def lda_model(self,tfidf_corp,dic,num_topics=20,passes=30,alpha=0.001,eta='auto'):
        from gensim import models
        return models.ldamodel.LdaModel(corpus=tfidf_corp, num_topics=num_topics, id2word=dic, passes=passes, alpha=alpha, 
                                             eta=eta,random_state=13)
    
    def get_coherence(self,lda_model,corpus,dic):
        from gensim.models import CoherenceModel
        coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dic, coherence='c_v')
        return coherence_model_lda.get_coherence()
    
    def is_tweet_intopic(self,text_column,words_list):
        
        def is_tweets_words_inlist(tweet,words_list):
            for word in tweet:
                if word in words_list:
                    return True
            return False
        
        results=[]
        for tweet in text_column:
            results.append(is_tweets_words_inlist(tweet,words_list))
        return self.pd.Series(results)
    
    def assign_lda_topic(self,lda_model,bow_corpus):
        topics=[]
        data=lda_model.get_document_topics(bow_corpus)
        for tweet in data:  
            df = self.pd.DataFrame(tweet, columns=['topic_num', 'probability'])
            topics.append(df.loc[df['probability'].idxmax(), 'topic_num'])
        return (self.pd.Series(topics))
    
    def get_sentiment(self,text_column):
        from textblob import TextBlob
        return text_column.apply(lambda x: TextBlob(' '.join(x)).sentiment.polarity)

In [3]:
df=pd.read_csv('trump_tweets_reducted.csv',parse_dates=['created_at_utc'])
df.reset_index(inplace=True,drop=True)

In [4]:
nlpob=nlp()

nlp V.0.1 
Imported pandas,re,nltk packages


In [5]:
corpus=nlpob.clean_text_for_topic_modelling(df['text'])

In [6]:
corpus=nlpob.remove_stopwords_from_corpus(corpus,['realdonaldtrump','amp','president','android','iphone'])

In [7]:
corpus_without_stemorlem=corpus.copy()

In [8]:
corpus=nlpob.stemm_corpus(corpus)

In [9]:
corpus=nlpob.lemmatize_corpus(corpus)

In [10]:
corpus=nlpob.create_ngrams(corpus)

In [11]:
topic_words=['deal','deals','china','chinese','nafta','market','markets','stock','stocks','trade','trades','trading','tax',
             'taxes','taxation','rate','rates','unemployment','jobs','manafacture','manafacturers','dollar','dollars','$',
             'billion','billions','consumer','consumers','reserve','gdp','growth','bank','banks','debt','factories',
             'revenue','spending','deficit','economy','economies','economist','economic','economists','money',
             'tariff','tariffs','companies','inflation','refinance','finance','financial','currency','depreciating',
             'depreceate','fed','business','businesses','bankruptcy','price','prices','products','opec','investment',
             'savings','cents','cent','corporate','corporates','deficits','dow','nasdaq','labor','purchase','500',
             'industry','industries']

In [12]:
df['financial_topic']=nlpob.is_tweet_intopic(corpus_without_stemorlem,topic_words)

In [13]:
dic=nlpob.gensim_dic(corpus)
bow_corp=nlpob.bow_corpus(dic,corpus)
tfidf_corp=nlpob.vec_to_tfidf(bow_corp)
lda_model = nlpob.lda_model(tfidf_corp,dic)
nlpob.get_coherence(lda_model,corpus,dic)

0.43284546532459034

In [14]:
df['lda_topic'] = nlpob.assign_lda_topic(lda_model,bow_corp)

In [15]:
df['sentiment'] = nlpob.get_sentiment(corpus_without_stemorlem)

In [16]:
df.to_csv('trump_tweets_reducted_after_nlp.csv')

In [20]:
df[df['financial_topic']==True]['text']

2        Donald Trump reads Top Ten Financial Tips on L...
22       “If you don't have problems you're pretending ...
31       Watch a powerful and frank interview with Dona...
37       Browse Donald Trump's Summer Reading List for ...
48       Watch video of Ivanka Trump sharing business a...
                               ...                        
25663    ....absolutely no pressure. I don’t know of an...
25668    Chuck Schumer sat for years during the Obama A...
25695    ....energy independence manufacturing resilien...
25696    ....of the most successful presidencies in his...
25698    ....energy independence manufacturing resilien...
Name: text, Length: 3688, dtype: object

In [17]:
# for idx, topic in lda_model.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))

import pyLDAvis
import pyLDAvis.gensim 
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corp, dic)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
## hdpmodel
# from gensim.corpora import Dictionary
# dct = Dictionary(corpus)
# vector_corpus=[dct.doc2bow(tweet) for tweet in corpus]
# from gensim.models import HdpModel
# hdp = HdpModel(vector_corpus, dct,alpha=0.001)
# topic_info = hdp.print_topics(num_topics=10, num_words=20)
# topic_info

In [None]:
# compare=pd.DataFrame({'corpus':corpus,'real':df['text']})
# for i,row in compare[17000:].iterrows():
#     print(row['corpus'])
#     print('\n')
#     print(row['real'])
#     time.sleep(2)