In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb
import scipy.stats as stats
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()


In [2]:
# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# NOTE(review): absolute, machine-specific Windows path - this cell only runs on a
# machine where that exact file exists. 'utf-8-sig' drops a leading BOM if present.
df = pd.read_csv(r'C:\Users\giorg\clean_text.csv', encoding = 'utf-8-sig')
# The 'text' column is the corpus used by the cells that follow.
text =df['text']

In [3]:
# Defining the helper functions
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    '''
    Return the most frequent words in a text sample together with their counts.

    NOTE: a later cell in this notebook defines another function with the same
    name and a different signature; once that cell has run, it shadows this one.

    Parameters
    ----------
    n_top_words : int
        How many words to return. Values larger than the fitted vocabulary are
        clipped to the vocabulary size; values <= 0 yield empty lists.
    count_vectorizer : CountVectorizer-like
        Object exposing ``fit_transform`` and, after fitting, ``vocabulary_``
        (a term -> column-index mapping). It is (re)fitted on ``text_data``.
    text_data : pandas.Series
        The documents; ``text_data.values`` is passed to the vectorizer.

    Returns
    -------
    tuple(list of str, list of int)
        The words in descending order of total count, and the matching counts.
    '''
    doc_term = count_vectorizer.fit_transform(text_data.values)

    # Column sums of a sparse matrix come back as a 1 x V np.matrix; flatten to
    # a plain 1-D array so ordinary indexing applies.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()

    n_top = max(0, min(n_top_words, totals.shape[0]))
    order = np.argsort(totals)[::-1][:n_top]

    # Invert the fitted vocabulary to look words up by column index. This avoids
    # building a dense (n_top_words x V) indicator matrix for inverse_transform,
    # and keeps non-ASCII tokens intact (the previous ASCII round-trip raised
    # UnicodeEncodeError on them).
    index_to_word = {index: word
                     for word, index in count_vectorizer.vocabulary_.items()}
    words = [index_to_word[int(index)] for index in order]

    return (words, totals[order].tolist())

# Rank the corpus vocabulary (English stop words removed) and keep the
# 15 most frequent terms together with their counts.
count_vectorizer = CountVectorizer(stop_words='english')
words, word_values = get_top_n_words(15, count_vectorizer, text)

In [4]:
# Vectorize the whole corpus, keeping at most 40,000 vocabulary terms.
# (The previous version first stored a 10,000-row sample under the name
# document_term_matrix; it was overwritten below before any use, so it is gone.)
count_vectorizer = CountVectorizer(stop_words='english', max_features=40000)

# .iloc[0] is the first row by position; text[0] was a label lookup that only
# works while the Series still carries the default 0..n-1 index.
print('Headline before vectorization: {}'.format(text.iloc[0]))

document_term_matrix = count_vectorizer.fit_transform(text)

print('Headline after vectorization: \n{}'.format(document_term_matrix[0]))

Headline before vectorization: folks paste cytokine storm pfizerbiontech
Headline after vectorization: 
  (0, 13516)	1
  (0, 25434)	1
  (0, 9330)	1
  (0, 32664)	1
  (0, 25844)	1


In [5]:
# Fit a 20-topic LDA model (online learning, fixed seed) on the document-term
# matrix; fit_transform returns one row of topic weights per document.
lda_model = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    random_state=0,
    verbose=0,
)
lda_topic_matrix = lda_model.fit_transform(document_term_matrix)

In [6]:
def get_keys(topic_matrix):
    '''
    Pick, for every row of the topic matrix, the column index holding the
    largest value, and return those indices as a plain Python list.
    '''
    dominant_topics = topic_matrix.argmax(axis=1)
    return dominant_topics.tolist()

def keys_to_counts(keys):
    '''
    Tally how often each key occurs.

    Returns a (categories, counts) tuple of two parallel lists, ordered by the
    first appearance of each category in `keys`.
    '''
    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)

In [7]:
# Assign every document to its highest-weighted LDA topic, then tally how many
# documents landed in each topic.
lda_keys = get_keys(lda_topic_matrix)
lda_categories, lda_counts = keys_to_counts(lda_keys)

In [8]:
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer, n_topics=20):
    '''
    Return one string per topic containing the n most common words among the
    documents assigned to that topic, most frequent first, space-separated.

    Parameters
    ----------
    n : int
        Number of words per topic. n <= 0 yields empty strings.
    keys : sequence of int
        Predicted topic index for each document (one per matrix row).
    document_term_matrix : sparse matrix or 2-D array
        Document-term counts, one row per document.
    count_vectorizer : CountVectorizer-like
        Fitted object exposing ``vocabulary_`` (term -> column-index mapping).
    n_topics : int, default 20
        Number of topics to report; the default matches the previously
        hard-coded value.

    Returns
    -------
    list of str
        ``n_topics`` strings. A topic with no assigned documents gets an empty
        string (previously this case raised AttributeError).
    '''
    keys_arr = np.asarray(keys)

    # Invert the vocabulary once instead of calling inverse_transform per word;
    # this also keeps non-ASCII tokens intact (the previous ASCII round-trip
    # raised UnicodeEncodeError on them).
    index_to_word = {index: word
                     for word, index in count_vectorizer.vocabulary_.items()}

    top_words = []
    for topic in range(n_topics):
        member_rows = np.flatnonzero(keys_arr == topic)
        if member_rows.size == 0 or n <= 0:
            top_words.append("")
            continue
        # Sum all rows of this topic in one operation rather than adding
        # them one by one in a Python loop.
        term_totals = np.asarray(
            document_term_matrix[member_rows].sum(axis=0)).ravel()
        top_indices = np.argsort(term_totals)[::-1][:n]
        top_words.append(" ".join(index_to_word[int(index)]
                                  for index in top_indices))
    return top_words

In [9]:
# Fifteen most frequent words for each LDA topic, printed with 1-based numbering.
top_n_words_lda = get_top_n_words(15, lda_keys, document_term_matrix, count_vectorizer)

for i, topic_words in enumerate(top_n_words_lda):
    print("Topic {}: ".format(i+1), topic_words)
    print('-----------------------------\n\n')

Topic 1:  covaxin covid vaccine modi dose hospital moderna pm ji work safe dr sputnikv single fortis
-----------------------------


Topic 2:  vaccine sputnikv covid russian astrazeneca moderna st india vaccines russia great covidvaccination russias covidvaccine covaxin
-----------------------------


Topic 3:  india vaccine covaxin sputnikv covid th vaccines moderna _india phase emergency company national life vaccination
-----------------------------


Topic 4:  vaccine covid doses sinovac sinopharm coronavirus vaccines moderna pfizerbiontech thank chinese china health chinas sputnikv
-----------------------------


Topic 5:  vaccine covid pfizer moderna emergency vaccines sputnikv mrna johnsonandjohnson data covaxin president available yesterday drug
-----------------------------


Topic 6:  effects vaccine moderna china covid vaccines govt covaxin sputnikv shot study update headache dose nifty
-----------------------------


Topic 7:  vaccine jab covid moderna covaxin canada oxford

# Result Visualization

In [10]:
# Project the per-document LDA topic weights down to 2-D with t-SNE for plotting.
# random_state=0 fixes the seed used by the embedding.
# NOTE(review): the recorded output below shows the neighbor search alone took
# ~398 s on 64,661 rows, so this cell is slow to re-run.
tsne_lda_model = TSNE(n_components=2, perplexity=50, learning_rate=100, 
                        n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsne_lda_vectors = tsne_lda_model.fit_transform(lda_topic_matrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 64661 samples in 1.189s...
[t-SNE] Computed neighbors for 64661 samples in 398.327s...
[t-SNE] Computed conditional probabilities for sample 1000 / 64661
[t-SNE] Computed conditional probabilities for sample 2000 / 64661
[t-SNE] Computed conditional probabilities for sample 3000 / 64661
[t-SNE] Computed conditional probabilities for sample 4000 / 64661
[t-SNE] Computed conditional probabilities for sample 5000 / 64661
[t-SNE] Computed conditional probabilities for sample 6000 / 64661
[t-SNE] Computed conditional probabilities for sample 7000 / 64661
[t-SNE] Computed conditional probabilities for sample 8000 / 64661
[t-SNE] Computed conditional probabilities for sample 9000 / 64661
[t-SNE] Computed conditional probabilities for sample 10000 / 64661
[t-SNE] Computed conditional probabilities for sample 11000 / 64661
[t-SNE] Computed conditional probabilities for sample 12000 / 64661
[t-SNE] Computed conditional probabilities for 

In [11]:
def get_mean_topic_vectors(keys, two_dim_vectors, n_topics=20):
    '''
    Return the centroid of the embedded points for each predicted topic.

    Parameters
    ----------
    keys : sequence of int
        Predicted topic index for each point (one per row of two_dim_vectors).
    two_dim_vectors : array-like
        Embedded coordinates, one row per point.
    n_topics : int, default 20
        Number of topics to report; the default matches the previously
        hard-coded value.

    Returns
    -------
    list of numpy.ndarray
        ``n_topics`` centroid vectors. A topic with no points gets a vector of
        NaN (previously np.vstack raised ValueError on the empty list), so
        callers should check for NaN before using a centroid as a coordinate.
    '''
    keys_arr = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for topic in range(n_topics):
        # A boolean mask selects all rows of this topic at once instead of
        # scanning every point in a Python loop per topic.
        members = vectors[keys_arr == topic]
        if members.shape[0] == 0:
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
        else:
            mean_topic_vectors.append(np.mean(members, axis=0))
    return mean_topic_vectors

In [12]:
# Twenty hex colors, one per LDA topic; indexed by topic number when plotting.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])[:20]

In [13]:
# Label text: the three most frequent words per topic, anchored at each
# topic's centroid in the t-SNE plane.
top_3_words_lda = get_top_n_words(3, lda_keys, document_term_matrix, count_vectorizer)
lda_mean_topic_vectors = get_mean_topic_vectors(lda_keys, tsne_lda_vectors)

plot = figure(
    title="t-SNE CLUSTERING OF {} LDA TOPICS".format(20),
    plot_width=900,
    plot_height=900,
)

# One point per document, colored by its dominant topic.
plot.scatter(
    x=tsne_lda_vectors[:, 0],
    y=tsne_lda_vectors[:, 1],
    color=colormap[lda_keys],
)

for t in range(20):
    centroid_x, centroid_y = lda_mean_topic_vectors[t]
    label = Label(x=centroid_x, y=centroid_y,
                  text=top_3_words_lda[t], text_color=colormap[t])
    plot.add_layout(label)

show(plot)