### Imports

In [1]:
import re
import spacy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

### Data

In [2]:
news = pd.read_excel('/content/drive/MyDrive/InShorts/Inshorts.xlsx')
nlp = spacy.load('en_core_web_sm')

### Functions

In [3]:
def get_contractions(x):
    try:
        x = C_Contractions[x]
    except KeyError:
        x = x
    return x

def preprocess(x):
    x = x.lower()
    x = x.split()
    x = ' '.join([get_contractions(word) for word in x])
    x = re.sub(r'[^a-z]', ' ', x)
    x = re.sub(r'\s{1,}', ' ', x)
    x = x.split()
    x = ' '.join([word for word in x if word not in nlp.Defaults.stop_words])
    x = nlp(x)
    x = [word.lemma_ for word in x]
    x = [word for word in x if len(word) > 2]
    return ' '.join(x)

def plot_tsne(topic_matrix = None):
    keys = topic_matrix.argmax(axis = 1).tolist()
    tsne_model = TSNE(n_components = 2, perplexity = 50, learning_rate=100, n_iter=2000, verbose=0, random_state=0, angle=0.75)
    tsne_vectors = tsne_model.fit_transform(topic_matrix)
    plt.figure(figsize = (10, 10))
    sns.scatterplot(x = tsne_vectors[:,0], y = tsne_vectors[:,1], hue = keys, palette = sns.color_palette(n_colors = C_Topics))

### Initialization and Preprocessing

In [4]:
C_TfidfVectorizer = TfidfVectorizer(max_df = 0.95, min_df = 0.05)
C_Contractions = {"don't": 'do not', "can't": 'cannot', "won't": 'will not', "hasn't": 'has not', "haven't": 'have not'}
C_Topics = 12
news['Preprocessed_Headline'] = news['Headline'].apply(preprocess)

In [5]:
for i in range(20):
    print(news['Preprocessed_Headline'][i])

social medium platform disclose originator mischievous message govt
people trust twitter ceo centre frame rule social medium
fast hire mistake kavin hike messenger shutdown
australia pass law google facebook pay news
digital news portal follow press council ethics code govt
new guideline social medium platform announce govt
myanmar military ban facebook instagram coup
tier grievance redressal mechanism ott platforms decide govt
govt unveils rule social medium platform month implement rule
social medium platform provision voluntary verification user govt
double standard social medium firm riot red fort unacceptable prasad
twitter announce super follow feature let user charge content
coo share open love letter fianc insta talk husband death
twitter begin label tweet contain hack leak document
need study detail facebook new social medium rule
google pay newspaper content indian newspaper society
social medium platform abide indian law amit shah
india big target cyber criminals asia pacifi

### Plot LDA Model using TSNE

In [6]:
lda_model = LatentDirichletAllocation(n_components = C_Topics, learning_method='online', random_state=0, verbose=0)
data_doc_term_matrix = C_TfidfVectorizer.fit_transform(news['Preprocessed_Headline'].values)
topic_matrix = lda_model.fit_transform(data_doc_term_matrix)
plot_tsne(topic_matrix = topic_matrix)

ValueError: ignored

<Figure size 720x720 with 0 Axes>

### Validation

In [None]:
i = 0
for topic in lda_model.components_:
    i += 1
    words = []
    topic = topic.argsort()
    top_n_words = topic[-15:]
    for index in top_n_words:
        words.append(C_TfidfVectorizer.get_feature_names()[index])
    print(f"Topic_{i:02d} : {words}")