In [1]:
import pandas as pd
import time
import nltk
import gensim
import numpy as np

In [2]:
class nlp:
    """Text-preprocessing, topic-modelling and text-feature helpers for pandas Series.

    Most methods take a pandas Series (``text_column``) and return a new Series.
    Token-level methods expect each element to be a list of string tokens, as
    produced by ``clean_text_for_topic_modelling``; the feature helpers at the
    bottom of the class expect raw strings.
    """
    #need to download from nltk - 'averaged_perceptron_tagger','wordnet','stopwords'
    re = __import__('re')
    pd = __import__('pandas')
    np = __import__('numpy')
    # Every method that needs nltk imports what it uses locally, so a missing nltk
    # should only break those methods, not the pure-string helpers of this class.
    try:
        nltk = __import__('nltk')
    except ImportError:
        nltk = None

    def __init__(self):
        """Print the version banner."""
        print('nlp V.0.1 \nImported pandas,re,nltk,numpy packages')

    def clean_text_for_topic_modelling(self,text_column):
        """Lower-case, strip URLs and punctuation, and tokenise a Series of texts.

        Missing values are treated as empty text. Tokens of two characters or
        fewer are dropped.

        Returns a Series of token lists with a fresh 0..n-1 index (the index of
        ``text_column`` is not carried over).
        """
        import string
        url_re = self.re.compile(r'https?:\/\/*[^ ]*')
        cont_re = self.re.compile(r'\((cont)\)')
        junk_re = self.re.compile(r'[^A-Za-z0-9$% ]')
        strip_punctuation = str.maketrans('', '', string.punctuation)

        def tokenize(text):
            text = url_re.sub('', text)
            text = cont_re.sub('', text)
            text = junk_re.sub('', text)
            # string.punctuation includes '$' and '%', so those are removed here too.
            text = text.translate(strip_punctuation)
            return [w for w in text.split() if len(w)>2]

        # fillna/astype guard against NaN or non-string cells, which would
        # otherwise make the regex substitutions raise TypeError.
        lowered = text_column.fillna('').astype(str).str.lower()
        return self.pd.Series([tokenize(text) for text in lowered], dtype=object)

    def remove_stopwords_from_corpus(self,text_column,extra_stopwords_list=None):
        """Remove NLTK English stop words (plus ``extra_stopwords_list``) from each token list."""
        from nltk.corpus import stopwords
        stops = set(stopwords.words('english')).union(extra_stopwords_list or ())
        return text_column.apply(lambda tokens: [w for w in tokens if w not in stops])

    def stemm_corpus(self,text_column,stemmer='porter'):
        """Stem every token.

        ``stemmer`` is 'porter' or 'lancaster'; any other value selects the
        English Snowball stemmer.
        """
        if stemmer=='porter':
            from nltk.stem import PorterStemmer
            stem = PorterStemmer().stem
        elif stemmer=='lancaster':
            from nltk.stem import LancasterStemmer
            stem = LancasterStemmer().stem
        else:
            from nltk.stem import SnowballStemmer
            stem = SnowballStemmer("english").stem
        return text_column.apply(lambda tokens: [stem(w) for w in tokens])

    def lemmatize_corpus(self,text_column):
        """Lemmatise every token with WordNet, using a per-word POS tag (noun by default)."""
        from nltk import pos_tag
        from nltk.stem import WordNetLemmatizer
        from nltk.corpus import wordnet

        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        def get_word_pos(word):
            # pos_tag expects a list of tokens; passing the bare string would tag
            # its individual characters instead of the word.
            tag = pos_tag([word])[0][1][0].upper()
            return tag_dict.get(tag, wordnet.NOUN)

        wordnet_lemmatizer = WordNetLemmatizer()
        return text_column.apply(
            lambda tokens: [wordnet_lemmatizer.lemmatize(w, pos=get_word_pos(w)) for w in tokens])

    def create_ngrams(self,text_column,replace=False,num_grams=2):
        """Build space-joined n-grams of size ``num_grams`` for each token list.

        With ``replace=True`` the result contains only the n-grams (fresh index).
        Otherwise the n-grams are appended to a copy of each token list and the
        original index is kept. The input Series is not modified.

        Raises TypeError if an element is a plain string rather than a token list.
        """
        from nltk import ngrams

        def grams(tokens):
            return [' '.join(gram) for gram in ngrams(tokens,num_grams)]

        def tokens_with_grams(tokens):
            # A joined string would silently yield character n-grams; fail loudly instead.
            if isinstance(tokens, str):
                raise TypeError('create_ngrams expects lists of tokens, got a plain string')
            return list(tokens) + grams(tokens)

        if replace:
            return self.pd.Series([grams(tokens) for tokens in text_column])
        return text_column.apply(tokens_with_grams)

    def gensim_dic(self,text_column,filter_extremes=True,no_below=5,no_above=0.85):
        """Build a gensim Dictionary from token lists, optionally calling filter_extremes on it."""
        from gensim.corpora import Dictionary
        dictionary=Dictionary(text_column)
        if filter_extremes:
            dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        return dictionary

    def bow_corpus(self,dictionary,text_column):
        """Return the ``doc2bow`` representation of every document as a list."""
        return [dictionary.doc2bow(doc) for doc in text_column]

    def one_hot_encoding_todf(self,dictionary,bow_corp):
        """Return the transposed dense matrix of a bag-of-words corpus as a DataFrame."""
        import gensim
        dense = gensim.matutils.corpus2dense(bow_corp,num_terms=len(dictionary))
        return self.pd.DataFrame(dense).T

    def tf_idf_tomatrix(self,dictionary,tf_idf_object):
        """Return the transposed dense matrix of a TF-IDF corpus as a DataFrame."""
        from gensim.matutils import corpus2dense
        num_terms = len(dictionary)
        num_docs = dictionary.num_docs
        corpus_tfidf_dense = corpus2dense(tf_idf_object, num_terms, num_docs)
        return self.pd.DataFrame(self.np.transpose(corpus_tfidf_dense))

    def vec_to_tfidf(self,bow_corpus):
        """Fit a gensim TfidfModel on ``bow_corpus`` and return the transformed corpus."""
        from gensim import models
        tfidf = models.TfidfModel(bow_corpus)
        return tfidf[bow_corpus]

    def lda_model(self,tfidf_corp,dic,num_topics=20,passes=30,alpha=0.001,eta='auto'):
        """Train a gensim LdaModel on ``tfidf_corp`` with a fixed random_state of 13."""
        from gensim import models
        return models.ldamodel.LdaModel(corpus=tfidf_corp, num_topics=num_topics, id2word=dic,
                                        passes=passes, alpha=alpha, eta=eta, random_state=13)

    def get_coherence(self,lda_model,corpus,dic):
        """Return the 'c_v' coherence of ``lda_model`` computed on the tokenised ``corpus``."""
        from gensim.models import CoherenceModel
        coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dic, coherence='c_v')
        return coherence_model_lda.get_coherence()

    def assign_lda_topic(self,lda_model,bow_corpus):
        """Return, per document, the topic number with the highest probability.

        Each item of ``lda_model.get_document_topics(bow_corpus)`` is read as a
        sequence of (topic_num, probability) pairs; ties keep the first pair.
        """
        doc_topics = lda_model.get_document_topics(bow_corpus)
        # max() over the pairs avoids building a throw-away DataFrame per document.
        topics = [max(pairs, key=lambda pair: pair[1])[0] for pairs in doc_topics]
        return self.pd.Series(topics)

    def get_sentiment(self,text_column):
        """Return TextBlob's sentiment polarity for each text."""
        from textblob import TextBlob
        return text_column.apply(lambda x: TextBlob(x).sentiment.polarity)

    def get_subjectivity(self,text_column):
        """Return TextBlob's sentiment subjectivity for each text."""
        from textblob import TextBlob
        return text_column.apply(lambda x: TextBlob(x).sentiment.subjectivity)

    def get_length_of_comment(self,text_column):
        """Return ``len`` of each element."""
        return text_column.apply(len)

    def amount_of_upper(self,text_column):
        """Count the upper-case characters in each text."""
        return text_column.apply(lambda x: sum(1 for c in x if c.isupper()))

    def create_uppercase_max_sequence_column(self,text_column):
        """Return the length of the longest run of A-Z characters in each text (0 if none)."""
        upper_run = self.re.compile(r"[A-Z]+")
        return text_column.apply(
            lambda s: max((len(run) for run in upper_run.findall(s)), default=0))

    def amount_of_sign(self,text_column,sign):
        """Count the characters equal to ``sign`` in each text."""
        return text_column.apply(lambda x: sum(1 for c in x if c==sign))

    def max_sequence(self,text_column,symbol):
        """Return the length of the longest run of ``symbol`` characters in each text (0 if none).

        ``symbol`` is taken literally: regex metacharacters such as '^', ']' or
        '\\' are escaped before the pattern is built. If ``symbol`` holds several
        characters, a run may mix any of them.
        """
        # Escaping keeps symbols like '^' or '\\' from corrupting the character class.
        symbol_run = self.re.compile("[" + self.re.escape(symbol) + "]+")
        return text_column.apply(
            lambda s: max((len(run) for run in symbol_run.findall(s)), default=0))

    def bad_comments_column_bin(self,text_column,bad_words_list):
        """Return 1 for each token list containing at least one word of ``bad_words_list``, else 0."""
        bad_words = frozenset(bad_words_list)  # O(1) membership tests
        return text_column.apply(lambda comment: int(any(word in bad_words for word in comment)))

    def bad_comments_column_agg(self,text_column,bad_words_list):
        """Count, per token list, how many tokens appear in ``bad_words_list`` (repeats included)."""
        bad_words = frozenset(bad_words_list)  # O(1) membership tests
        return text_column.apply(lambda comment: sum(1 for word in comment if word in bad_words))

In [3]:
# Load the tweet export, parsing `created_at` as datetimes, and make sure the
# frame carries a clean 0..n-1 index.
df = pd.read_csv(
    './vp_tweets/vp.csv',
    parse_dates=['created_at'],
    low_memory=False,
).reset_index(drop=True)

In [4]:
# Instantiate the helper class defined above; its constructor prints a version banner.
nlpob=nlp()

nlp V.0.1 
Imported pandas,re,nltk,numpy packages


In [5]:
# Turn the raw tweet text into token lists: lower-cased, URLs and punctuation
# removed, tokens of two characters or fewer dropped.
corpus=nlpob.clean_text_for_topic_modelling(df['text'])

In [6]:
# Drop NLTK's English stop words from every token list. The commented-out call
# below is the variant that also removes corpus-specific stop words.
# corpus=nlpob.remove_stopwords_from_corpus(corpus,['realdonaldtrump','amp','president','android','iphone'])
corpus=nlpob.remove_stopwords_from_corpus(corpus)

## Another try for LDA

In [42]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
corpus = corpus.apply(lambda x: ' '.join(x))

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df = 20)
X = vectorizer.fit_transform(corpus)

In [None]:
# Number of LDA topics; also used below to name the pyLDAvis output file.
number_topics = 50

In [None]:
# random_state is fixed so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components=number_topics,random_state=0)

In [None]:
# NOTE(review): X comes from a TfidfVectorizer, while CountVectorizer is imported
# but unused — confirm that fitting LDA on TF-IDF weights rather than raw counts
# is intended.
lda.fit(X)
# Transformed documents; wrapped into a DataFrame with one column per topic below.
lda_results = lda.transform(X)

In [None]:
# Model score on the training matrix (displayed as the cell output).
lda.score(X)

In [None]:
lda_results = pd.DataFrame(lda_results, columns = [f'topic_{i}' for i in range(0,50)])

In [None]:
df = pd.concat([df,lda_results], axis = 1)

In [None]:
df_toinspect = pd.DataFrame(lda.components_,columns = vectorizer.get_feature_names()).T

In [None]:
df_toinspect = pd.DataFrame(df_toinspect.idxmax(axis=1), columns=['topic']).reset_index()

## Sklearn LDA visualization

In [None]:
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
import pickle
import os

In [None]:
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
LDAvis_prepared = sklearn_lda.prepare(lda, X, vectorizer)

In [None]:
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

## Dictionary method

In [7]:
# Append bigrams to every token list so that two-word keywords such as
# 'wall street' can be matched by the keyword counts below.
# NOTE(review): this needs `corpus` to still hold token lists; the cell in the
# LDA section that joins tokens into strings must not have overwritten it.
corpus=nlpob.create_ngrams(corpus)

In [8]:
# Each *_topic column below counts, per tweet, how many tokens/bigrams of the
# corpus appear in the corresponding keyword list.
trade_words =  ['nafta', 'trade', 'trades', 'trading', 'tariff', 'tariffs', 'opec', 'usmca']

In [9]:
df['trade_topic'] = nlpob.bad_comments_column_agg(corpus,trade_words)

In [10]:
# NOTE(review): 'xi' has two characters; clean_text_for_topic_modelling only
# keeps tokens longer than two characters, so this keyword cannot match.
china_words = ['china', 'chinese', 'xi', 'jinping']

In [11]:
df['china_topic'] = nlpob.bad_comments_column_agg(corpus,china_words)

In [12]:
# NOTE(review): 'wall st' cannot match — the token 'st' is dropped during
# cleaning (two characters), so the bigram is never produced.
financial_words = ['market', 'markets', 'stock', 'stocks', 'financial', 'investment', 'dow', 'nasdaq', '500', 'wall street',
                   'wall st']

In [13]:
df['financial_topic'] = nlpob.bad_comments_column_agg(corpus,financial_words)

In [14]:
labor_words = ['unemployment','jobs', 'labor','employment', 'work', 'workers', 'payroll']

In [15]:
df['labor_topic'] = nlpob.bad_comments_column_agg(corpus,labor_words)

In [16]:
industry_words= ['manufacture','manufacturers', 'consumer', 'consumers', 'bank', 'banks', 'factories', 'business',
                 'businesses' ,'corporate', 'corporates', 'industry', 'industries', 'product', 'agriculture', 
                 'agricultural', 'products', 'companies', 'production', 'competitive']

In [17]:
df['industry_topic'] = nlpob.bad_comments_column_agg(corpus,industry_words)

In [18]:
# NOTE(review): 'depreceate' looks like a misspelling of 'depreciate' — confirm
# whether the misspelled form is intentional.
currency_rates_topics = ['rate', 'rates', 'reserve', 'inflation', 'currency', 'depreciating', 'depreceate', 'fed', 
                         'federal reserve', 'powell', 'stimulate']

In [19]:
df['currency_rates_topic'] = nlpob.bad_comments_column_agg(corpus,currency_rates_topics)

In [20]:
# NOTE(review): '$' and '401(k)' cannot match — the cleaning step strips all
# punctuation (including '$', '(' and ')') before tokenising.
topic_words=['deal', 'deals', 'dollar', 'dollars', '$', 'billion', 'billions', 'gdp', 'growth', 'revenue', 'economy',
             'economies', 'economist', 'economic', 'economists', 'money', 'price', 'prices', 'cents', 'cent', 'purchase',
             'depletion', 'regulation', '401(k)', 'trillions', 'recession', 'bill', 'military']

In [21]:
df['general_related_topic'] = nlpob.bad_comments_column_agg(corpus,topic_words)

In [22]:
budget_words=['taxes', 'taxation', 'tax', 'debt', 'deficit', 'spending', 'refinance', 'finance', 'savings', 'deficits',
              'bankruptcy', 'spend', 'cost', 'costs', 'subsidizing', 'subsidize']

In [23]:
df['budget_topic'] = nlpob.bad_comments_column_agg(corpus,budget_words)

In [24]:
# NOTE(review): 'un' has two characters and is dropped by the cleaning step, so
# it cannot match.
foreign_policy_words=['sanctions', 'iran', 'nuclear', 'wall', 'mexico', 'daca', 'conflict', 'rockets', 'russia',
                      'middle east', 'ukrainian', 'ukraine', 'isis', 'syria', 'border', 'russian', 'iraq', 'kim', 
                      'jong', 'un', 'caravan']

In [25]:
df['foreign_policy_topic'] = nlpob.bad_comments_column_agg(corpus,foreign_policy_words)

In [26]:
covid_words=['corona', 'virus', 'covid', 'chinese virus', 'containment', 'ventilators', 'h1n1',
             'swine', 'flu', 'pandemic', 'coronavirus', 'killthevirus', 'masks', 'quarantined', 'quarantine']

In [27]:
df['covid_topic']=nlpob.bad_comments_column_agg(corpus,covid_words)

In [28]:
sensitive_words=['danger', 'investigation', 'shutdown', 'crisis']

In [29]:
df['sensitive_topic']=nlpob.bad_comments_column_agg(corpus,sensitive_words)

In [30]:
environment_words=['coal', 'global warming', 'oil', 'wind', 'greta']

In [31]:
df['environment_topic']=nlpob.bad_comments_column_agg(corpus,environment_words)

## Sentiment

In [32]:
df['sentiment'] = nlpob.get_sentiment(df['text'])

In [33]:
df.loc[df['sentiment'] > 0, 'sentiment'] = 1

In [34]:
df.loc[df['sentiment'] < 0, 'sentiment'] = -1

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5657 entries, 0 to 5656
Data columns (total 37 columns):
str_id                   5657 non-null int64
screen_name              5657 non-null object
utc_date                 5657 non-null object
created_at               5657 non-null datetime64[ns]
source                   5657 non-null object
favorite_count           5657 non-null int64
retweet_count            5657 non-null int64
replies                  5657 non-null int64
quotes                   5657 non-null int64
lang                     5657 non-null object
tweet_type               5657 non-null object
text                     5657 non-null object
quote                    615 non-null object
country_code             2 non-null object
place                    2 non-null object
latitude                 2 non-null float64
longitude                2 non-null float64
is_specific_geo          5657 non-null object
urls                     1292 non-null object
media_type               34

In [36]:
df.drop('str_id', axis = 1, inplace = True)

In [37]:
df.to_csv('vp_tweets_reducted_after_nlp.csv')