In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import math
import json
from func import *

import nltk
from nameparser.parser import HumanName
from nltk.corpus import wordnet

import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

pd.options.display.max_colwidth = 100
sns.set_style('whitegrid')
%matplotlib inline

## We performed extra experiments to see how an unsupervised learning model would analyze the data. In particular, we chose the LDA (Latent Dirichlet Allocation) model.

In [None]:
# Punctuation that is replaced by a space before the title is split into words.
# The hyphen is escaped so it is matched literally instead of forming the
# accidental `,-.` range (the matched character set is unchanged).
_TITLE_PUNCTUATION = re.compile(r"[()\[\],\-.?!:;#&]")
# Words removed from every cleaned title (compared in lower case).
_EXCLUDED_WORDS = frozenset({'trump', 'biden'})


def _normalize_title(raw_title):
    """Return the cleaned, lower-cased form of a single title.

    Punctuation is replaced by spaces, the text is split on single spaces,
    and only purely alphabetic words that are not in ``_EXCLUDED_WORDS`` are
    kept. The surviving words are lower-cased and joined with one space.

    Parameters
    ----------
    raw_title : str
        The original title. Any non-string value (e.g. a missing value) is
        treated as an empty title so that the output stays aligned row by row
        with the input.

    Returns
    -------
    str
        The cleaned title; may be the empty string.
    """
    if not isinstance(raw_title, str):
        return ''
    spaced = _TITLE_PUNCTUATION.sub(" ", raw_title)
    kept = []
    # Splitting on a single space yields '' between consecutive spaces;
    # ''.isalpha() is False, so those fragments are dropped by the same test
    # that rejects tokens containing digits or other symbols.
    for word in spaced.split(" "):
        lowered = word.lower()
        if word.isalpha() and lowered not in _EXCLUDED_WORDS:
            kept.append(lowered)
    return ' '.join(kept)


# One cleaned string per row of clean_data, in row order. Iterating over the
# column directly avoids building a Series for every row via iterrows().
titles = [_normalize_title(raw_title) for raw_title in clean_data['title']]

In [None]:
N = 30  # number of most frequent words shown in the bar chart


# Helper function
def plot_N_most_common_words(count_data, count_vectorizer, N):
    """Draw a bar chart of the N words with the highest total count.

    Parameters
    ----------
    count_data : sparse matrix
        Document-term count matrix (rows are documents, columns are words),
        such as the result of ``count_vectorizer.fit_transform``. It must
        support ``.sum(axis=0)``.
    count_vectorizer : fitted vectorizer
        Supplies the word for each column through ``get_feature_names_out()``
        (or ``get_feature_names()`` on older scikit-learn releases).
    N : int
        Number of words to plot. Fewer bars are drawn when the vocabulary is
        smaller than N.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only for older releases.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # A single column sum gives the per-word totals without converting every
    # document row to a dense array in a Python loop.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # sorted() is stable, so words with equal counts keep vocabulary order.
    top = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:N]
    top_words = [word for word, _ in top]
    top_counts = [count for _, count in top]
    x_pos = np.arange(len(top_words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title=f'{N} most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # x and y must be keyword arguments: seaborn >= 0.12 accepts only `data`
    # positionally.
    sns.barplot(x=x_pos, y=top_counts, palette='husl')
    plt.xticks(x_pos, top_words, rotation=40)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
    
# Bag-of-words vectoriser that ignores scikit-learn's built-in English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the cleaned titles and encode them as a
# document-term count matrix
count_data = count_vectorizer.fit_transform(raw_documents=titles)
# Plot the N most frequent words
plot_N_most_common_words(count_data=count_data, count_vectorizer=count_vectorizer, N=N)

In [None]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_``, iterated as one weight vector per topic
        with one weight per vocabulary word.
    count_vectorizer : fitted vectorizer
        Supplies the word for each column through ``get_feature_names_out()``
        (or ``get_feature_names()`` on older scikit-learn releases).
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None
        For each topic, a blank line, a ``Topic #<index>:`` header and one
        line of space-separated words are written to standard output.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only for older releases.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending; the reversed slice walks back from the
        # largest weight and stops after n_top_words entries.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
        
# Tweak the two parameters below
number_topics = 10  # number of topics the LDA model is asked to find
number_words = 10   # number of top words printed per topic
# Create and fit the LDA model. LDA fitting is stochastic, so random_state is
# fixed to make the printed topics identical on every Restart & Run All.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=0)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)