####     Author : Mohlatlego  nakeng

#### Topic Modelling

* Latent Dirichlet Allocation represents documents as mixtures of topics that spit out words with certain probabilities. So now suppose you have a set of documents. You’ve chosen some fixed number of K topics to discover, and want to use LDA to learn the topic representation of each document and the words associated to each topic.
* Latent Dirichlet allocation (LDA) is a technique that automatically discovers topics that these documents contain.
* Dirichlet is a distribution specified by a vector parameter α containing some αi corresponding to each topic i, which we write as Dir(α)

#### Non-negative Matrix Factorization

* LDA is based on probabilistic graphical modeling while NMF relies on linear algebra.
* Both algorithms take as input a bag-of-words matrix (i.e., each document is represented as a row, with each column containing the count of a word in the corpus).
* The aim of each algorithm is then to produce 2 smaller matrices; a document to topic matrix and a word to topic matrix that when multiplied together reproduce the bag of words matrix with the lowest error.

#### Importing Libraries  

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import sys
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import preprocessor as p

In [3]:
from nltk.corpus import stopwords

#### Import datasets

In [6]:
# Load the tab-separated JHB vaccine tweet export into a DataFrame.
data = pd.read_csv("data/vaccine_jhb.csv", sep='\t' )

In [7]:
# List the available columns of the loaded dataset.
data.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [None]:
# Preview the tweet text. The loaded dataset has no 'asset_text_extracts'
# column (see the output of data.columns); the text lives in 'tweet'.
data.tweet

#### data cleaning 

* Removing mentions or tags

In [None]:
import re

In [None]:
from nltk.stem import PorterStemmer
# English stop-word list from NLTK, used by data_clean to drop common words.
stop_words=stopwords.words('english')
# Porter stemmer instance, used by data_clean to stem the remaining tokens.
stemmer=PorterStemmer()

In [None]:
def data_clean(text):
    """Normalise a single piece of tweet text for topic modelling.

    Strips @-mentions, replaces every non-letter character with a space,
    lower-cases the text, drops English stop words and stems what is left.
    Relies on the module-level ``stop_words`` and ``stemmer`` objects.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. a NaN cell) yields ''.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # Mentions must go first: once punctuation is stripped the '@' marker is
    # lost and user names can no longer be told apart from ordinary words.
    tweet = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet)
    tokens = tweet.lower().split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a "Topic: <index>" line is printed,
    followed by one line holding the ``no_top_words`` terms with the largest
    weights, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic:", topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest]
        print(" ".join(top_terms))


def tfidf_vectorizer(documents,total_features):
    """Fit a TF-IDF vectorizer on ``documents``.

    Parameters
    ----------
    documents : iterable of str
        The raw texts to vectorize.
    total_features : int
        Passed to the vectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(fitted vectorizer, TF-IDF matrix, list of feature names)``.
    """
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=total_features, stop_words='english')
    tfidf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        tfidf_feature_names = vectorizer.get_feature_names()
    return vectorizer,tfidf,tfidf_feature_names

def count_vectorizer(documents,total_features):
    """Fit a term-count (bag-of-words) vectorizer on ``documents``.

    Parameters
    ----------
    documents : iterable of str
        The raw texts to vectorize.
    total_features : int
        Passed to the vectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(fitted vectorizer, term-count matrix, list of feature names)``.
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=total_features, stop_words='english')
    tf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tf_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        tf_feature_names = vectorizer.get_feature_names()
    return vectorizer,tf,tf_feature_names

In [None]:
total_features = 15000
num_topic = 20
# The loaded dataset has no 'asset_text_extracts' column (see data.columns);
# the tweet text is in 'tweet'. Cast to unicode once so both vectorizers get
# the same string input and missing values do not raise.
documents = data['tweet'].values.astype('U')
# NOTE: the first assignment rebinds the name `tfidf_vectorizer` from the
# helper function to the fitted vectorizer (later cells read it under that
# name), so the helper's defining cell must be re-run before re-running this one.
tfidf_vectorizer, tfidf, tfidf_feature_names = tfidf_vectorizer(documents,total_features)
tf_vectorizer, tf, tf_feature_names = count_vectorizer(documents,total_features)

In [None]:
# Fit an LDA model with `num_topic` topics on the TF-IDF matrix.
# NOTE(review): LDA is usually fit on raw term counts (`tf`) rather than
# TF-IDF weights — confirm this choice is intended.
model_lda = LatentDirichletAllocation(n_components=num_topic, max_iter=30, learning_method='online', learning_offset=50.,random_state=0).fit(tfidf)
# Number of top terms printed per topic by display_topics.
no_top_words = 20

In [None]:
# Print the top `no_top_words` terms for each LDA topic.
display_topics(model_lda, tfidf_feature_names, no_top_words)

##### Testing on JHB data

In [None]:
# Prepare the pyLDAvis data from the fitted LDA model, its TF-IDF matrix and
# the fitted vectorizer, then show it in the notebook.
data_op = pyLDAvis.sklearn.prepare(model_lda,tfidf,tfidf_vectorizer)
pyLDAvis.enable_notebook()
pyLDAvis.display(data_op)

#### Testing on MMA data

In [None]:
# Load the complaints spreadsheet into a DataFrame.
data_comp =pd.read_excel("Complaints_Reviewed_As_Disinformation_twitter.xlsx")

In [None]:
# Preview the 'title' column, which is the text vectorized below.
data_comp.title

In [None]:
def display_topics_comp(model, feature_names, no_top_words):
    """Print, per topic of ``model``, its ``no_top_words`` strongest terms.

    Each topic produces a "Topic: <index>" line followed by a line with the
    selected terms separated by spaces, highest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic:", topic_idx)
        # Ascending argsort reversed gives the indices by descending weight.
        descending = weights.argsort()[::-1]
        chosen = descending[:no_top_words]
        print(" ".join(feature_names[idx] for idx in chosen))


def tfidf_vectorizer_comp(documents,total_features):
    """Fit a TF-IDF vectorizer on ``documents``.

    Parameters
    ----------
    documents : iterable of str
        The raw texts to vectorize.
    total_features : int
        Passed to the vectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(fitted vectorizer, TF-IDF matrix, list of feature names)``.
    """
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=total_features, stop_words='english')
    tfidf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        tfidf_feature_names = vectorizer.get_feature_names()
    return vectorizer,tfidf,tfidf_feature_names

def count_vectorizer_comp(documents,total_features):
    """Fit a term-count (bag-of-words) vectorizer on ``documents``.

    Parameters
    ----------
    documents : iterable of str
        The raw texts to vectorize.
    total_features : int
        Passed to the vectorizer as ``max_features``.

    Returns
    -------
    tuple
        ``(fitted vectorizer, term-count matrix, list of feature names)``.
    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=total_features, stop_words='english')
    tf = vectorizer.fit_transform(documents)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tf_feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        tf_feature_names = vectorizer.get_feature_names()
    return vectorizer,tf,tf_feature_names

In [None]:
# Vocabulary cap and topic count used for the complaints data.
total_features = 15000 
num_topic = 20
# Vectorize the 'title' column with both the TF-IDF and the count vectorizer.
# NOTE: the first assignment rebinds the name `tfidf_vectorizer_comp` from the
# helper function to the fitted vectorizer it returns.
tfidf_vectorizer_comp, tfidf_comp, tfidf_feature_names_comp = tfidf_vectorizer_comp(data_comp['title'],total_features)
tf_vectorizer_comp, tf_comp, tf_feature_names_comp = count_vectorizer_comp(data_comp['title'],total_features)

In [None]:
# Fit an LDA model with `num_topic` topics on the complaints TF-IDF matrix.
# NOTE(review): LDA is usually fit on raw term counts (`tf_comp`) rather than
# TF-IDF weights — confirm this choice is intended.
model_lda_comp = LatentDirichletAllocation(n_components=num_topic, max_iter=30, learning_method='online', learning_offset=50.,random_state=0).fit(tfidf_comp)
# Number of top terms printed per topic by display_topics.
no_top_words = 20

In [None]:
# Print the top `no_top_words` terms for each LDA topic of the complaints data.
display_topics(model_lda_comp, tfidf_feature_names_comp, no_top_words)

In [None]:
# Prepare the pyLDAvis data for the complaints LDA model, show it in the
# notebook, and also write it to an HTML file.
data_op = pyLDAvis.sklearn.prepare(model_lda_comp,tfidf_comp,tfidf_vectorizer_comp)
pyLDAvis.enable_notebook()
pyLDAvis.display(data_op)
pyLDAvis.save_html(data_op, 'topics_MMA_LDA.html')

#### Non-negative Matrix Factorization

In [None]:
# Number of NMF components (topics) passed to fit_NMF.
n_components = 20

In [None]:
def fit_NMF(X, n_components):
    """Fit an NMF model with ``n_components`` components on ``X``.

    The model is created with ``random_state=0``; the value returned is
    whatever ``NMF.fit`` returns for ``X``.
    """
    return NMF(n_components=n_components, random_state=0).fit(X)

In [None]:
# Fit NMF on the complaints TF-IDF matrix and print each topic's top terms.
nmf_tfidf_comp = fit_NMF(tfidf_comp, n_components)
# Use the feature names of the vectorizer that produced `tfidf_comp`, so the
# column indices of the model and the vocabulary come from the same object.
display_topics(nmf_tfidf_comp, tfidf_feature_names_comp, no_top_words)