# Headline Hindsight: Data Pre-Processing and NMF Model
Irti Haq Irtihaq@uw.edu BC | Andrew Simon simona6@uw.edu BC | Jeleen Limawan je0110@uw.edu AB | Joeph Rafael joephr@uw.edu AD



In [None]:
# Mount Google Drive at /content/drive (uses the Colab-only google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Imports
!pip install contractions

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import string 
import nltk
from nltk.corpus import stopwords
import contractions


In [None]:
# Read the raw headlines CSV, discard the 'section' and 'Unnamed: 0' columns,
# and renumber the rows from zero
news_df = (
    pd.read_csv('all-the-news-headlines.csv')
    .drop(columns=['section', 'Unnamed: 0'])
    .reset_index(drop=True)
)
# Bare expression so the notebook displays the loaded frame
news_df

In [None]:
#@title Pre-Proccessing The Dataset

# Publications that are removed because they are more lifestyle-news oriented
pubs_to_remove = ['Gizmodo', 'Hyperallergic', 'Mashable', 'New Republic',
                  'New Yorker', 'People', 'Refinery 29', 'TMZ', 'TechCrunch',
                  'The Verge', 'Vice', 'Wired']

# Treat empty strings as missing values. The cleaned column is assigned back
# rather than calling replace(..., inplace=True) on a column selection: that
# chained in-place form is deprecated by pandas and does not update the parent
# frame when copy-on-write is enabled.
news_df['publication'] = news_df['publication'].replace('', np.nan)
news_df['title'] = news_df['title'].replace('', np.nan)

# Remove articles without a publication name or without a headline
news_df = news_df.dropna(subset=['publication', 'title'])

# Keep only headlines with more than two whitespace-separated words
news_df = news_df[news_df['title'].str.split().str.len() > 2]

# Drop lifestyle publications and reset the index
news_df = news_df[~news_df['publication'].isin(pubs_to_remove)].reset_index(drop=True)

In [None]:
#@title Pre-Proccessing Headlines

# Replace missing values in every column with empty strings
news_df = news_df.fillna('')

# Lower-case the headlines and strip ASCII punctuation. The translation table
# is loop-invariant, so it is built once instead of once per headline.
punctuation_table = str.maketrans('', '', string.punctuation)
news_df['title'] = news_df['title'].str.lower().apply(
    lambda headline: headline.translate(punctuation_table)
)

# Extra stop words: every lower-cased word that appears in a publication name,
# plus a hand-picked list
stop_words_add = news_df['publication'].str.split().explode().str.lower().unique().tolist()
# NOTE(review): '(cnn)' and '(reuters)' still carry parentheses, but punctuation
# has already been stripped from the titles above, so these two entries look
# like they can never match a title token - confirm whether the bare words
# were intended.
stop_words_add.extend(['thehill', '(cnn)', '(reuters)',
                       'playerwatch', 'new', 'york', 'times',
                       'opinion', 'stockswall', 'snapshotwall',
                       'north', 'south'])

# Download the NLTK stop word corpus
nltk.download('stopwords')

# Start from the English stop words only
all_stop_words = set(stopwords.words('english'))

# Merge in the additional stop words
all_stop_words.update(stop_words_add)


# Stop Word Remover Function
def stopword_remover(text):
    """Return *text* with every token found in ``all_stop_words`` removed.

    The text is split on whitespace, tokens are compared exactly against the
    module-level ``all_stop_words`` collection, and the surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in all_stop_words)
    return ' '.join(kept_tokens)

# Expand contractions in each headline, then strip the stop words.
# NOTE(review): punctuation (apostrophes included) has already been removed
# from the titles earlier in this cell, so contractions.fix only ever sees
# apostrophe-less forms - confirm this ordering is intended.
news_df['title'] = (
    news_df['title']
    .apply(contractions.fix)
    .apply(stopword_remover)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Cleaned headlines that are treated as anomalous; rows whose title equals one
# of these strings exactly are discarded
plus_remove = [
    'watch',
    'trump',
    'trump trump trump',
    'coronavirus us',
    'coronavirus',
    'coronavirus 1',
    'stocks stocks',
    'winning numbers drawn ‘pick 3 day’ game',
    'winning numbers drawn ‘pick 4 day’ game',
    'week',
    'hills 1230 report inside mueller report',
    'today state state',
    'preview state',
    'us america',
]

# Keep every row whose title is not in the list, then renumber the rows
news_df = news_df.loc[~news_df['title'].isin(plus_remove)].reset_index(drop=True)

In [None]:
# Save the pre-processed dataset as a pickle file under the mounted
# Google Drive path (/content/drive/MyDrive/...)
news_df.to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/cleaned-all-the-news_smpl_270k.pkl")

In [None]:
#@title Hyperparameter Tunning
# Tuning the Number of Components (Topics)
# Source: https://www.kaggle.com/code/rockystats/topic-modelling-using-nmf

# For All Years
'''
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from collections import Counter
from operator import itemgetter
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from nltk.tokenize import TweetTokenizer

def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

# Use Gensim's NMF to get the best num of topics via coherence score
texts = news_df['title'].apply(casual_tokenizer)

# Create a dictionary
# In gensim a dictionary is a mapping between words and their integer id
dictionary = Dictionary(texts)

# Filter out extremes to limit the number of features
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)

# Create the bag-of-words format (list of (token_id, token_count))
corpus = [dictionary.doc2bow(text) for text in texts]

# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 75 + 1, 5))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

for num in topic_nums:
    nmf = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=dictionary,
        chunksize=2000,
        passes=5,
        kappa=.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    
    # Run the coherence model to get the score
    cm = CoherenceModel(
        model=nmf,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    
    coherence_scores.append(round(cm.get_coherence(), 5))

# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

# Plot the results
fig = plt.figure(figsize=(15, 7))

plt.plot(
    topic_nums,
    coherence_scores,
    linewidth=3,
    color='#4287f5'
)

plt.xlabel("Topic Num", fontsize=14)
plt.ylabel("Coherence Score", fontsize=14)
plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
plt.yticks(fontsize=12)

plt.show()

'''

# By Year
'''
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from collections import Counter
from operator import itemgetter
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from nltk.tokenize import TweetTokenizer

def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


years = [2016, 2017, 2018, 2019, 2020]

for year in years:

    # Use Gensim's NMF to get the best num of topics via coherence score
    texts = news_df.loc[news_df['year'] == year, 'title'].apply(casual_tokenizer)

    # Create a dictionary
    # In gensim a dictionary is a mapping between words and their integer id
    dictionary = Dictionary(texts)

    # Filter out extremes to limit the number of features
    dictionary.filter_extremes(
        no_below=3,
        no_above=0.85,
        keep_n=5000
    )

    # Create the bag-of-words format (list of (token_id, token_count))
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create a list of the topic numbers we want to try
    topic_nums = list(np.arange(5, 75 + 1, 5))

    # Run the nmf model and calculate the coherence score
    # for each number of topics
    coherence_scores = []

    for num in topic_nums:
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=2000,
            passes=5,
            kappa=.1,
            minimum_probability=0.01,
            w_max_iter=300,
            w_stop_condition=0.0001,
            h_max_iter=100,
            h_stop_condition=0.001,
            eval_every=10,
            normalize=True,
            random_state=42
        )
        
        # Run the coherence model to get the score
        cm = CoherenceModel(
            model=nmf,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        
        coherence_scores.append(round(cm.get_coherence(), 5))

    # Get the number of topics with the highest coherence score
    scores = list(zip(topic_nums, coherence_scores))
    best_num_topics = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

    # Plot the results
    fig = plt.figure(figsize=(15, 7))

    plt.plot(
        topic_nums,
        coherence_scores,
        linewidth=3,
        color='#4287f5'
    )

    plt.xlabel("Topic Num", fontsize=14)
    plt.ylabel("Coherence Score", fontsize=14)
    plt.title('Coherence Score by Topic Number - Best Number of Topics: {}'.format(best_num_topics), fontsize=18)
    plt.xticks(np.arange(5, max(topic_nums) + 1, 5), fontsize=12)
    plt.yticks(fontsize=12)

    plt.savefig('score' + str(year) + ".png")
    plt.close()
'''
# Saved plots can be found using this link:
#https://drive.google.com/drive/folders/1hfrEH970GetD0hsIQiAq-4uLzyfsO6si?usp=share_link

In [None]:
#@title NMF Modeling Function

def nmf_by_filter(filt_news_df=None, publication_name=None,
                  year=None, month=None, day=None, nun_comp=10):
  '''
  Runs NMF topic modelling on a subset of the headlines selected by the filters.
  Important: make sure to run the pre-processing and cleaning cells first.

  Args:
  - filt_news_df (DataFrame): headlines to model. Needs a 'title' column, plus
    'publication' / 'year' / 'month' / 'day' for whichever filters are used.
    When omitted, the notebook-level ``news_df`` is used as it exists at call
    time.
  - publication_name (string or list): publication name(s) to keep
  - year (int): keep only rows whose 'year' equals this value
  - month (int): keep only rows whose 'month' equals this value
  - day (int): keep only rows whose 'day' equals this value
  - nun_comp (int): number of topics (NMF components)

  Returns:
  - A list of dataframes [word_score_by_topic_df, article_score_df, word_score_df]
  - word_score_by_topic_df: for every topic, the words ranked by their score
    in that topic ('Topic i Word' / 'Topic i Score' columns plus 'Rank')
  - article_score_df: the filtered rows plus one score column per topic,
    'Max_score' and 'Topic_Asoc' (the topic with the highest score)
  - word_score_df: one row per word with one score column per topic,
    'Max_score' and 'Topic_Asoc', sorted by 'Max_score' descending

  Raises:
  - ValueError: if no rows are left after applying the filters
  '''

  # Resolve the default at call time so a re-run of the pre-processing cells
  # is picked up instead of a frame captured when this function was defined
  if filt_news_df is None:
    filt_news_df = news_df

  # Apply whichever filters were specified
  if isinstance(publication_name, str):
    publication_name = [publication_name]

  if publication_name is not None:
    filt_news_df = filt_news_df[filt_news_df['publication'].isin(publication_name)]

  if year is not None:
    filt_news_df = filt_news_df[filt_news_df['year'] == year]

  if month is not None:
    filt_news_df = filt_news_df[filt_news_df['month'] == month]

  if day is not None:
    filt_news_df = filt_news_df[filt_news_df['day'] == day]

  # A fresh 0..n-1 index is required so the rows line up with the NMF output
  filt_news_df = filt_news_df.reset_index(drop=True)

  # Fail early with a clear message instead of an opaque vectorizer error
  if filt_news_df.empty:
    raise ValueError('No headlines left after filtering; nothing to model')

  # TF-IDF matrix
  filt_vectorizer = TfidfVectorizer(max_df=0.80) # Might want to tweak max_df <- Hyperparam
  filt_tf_idf = filt_vectorizer.fit_transform(filt_news_df['title'])

  # Extract feature names
  filt_feature_names = filt_vectorizer.get_feature_names_out()

  # Model
  filt_model = NMF(init='nndsvd', n_components=nun_comp) # <- n_components Hyperparam
  article_modeled = filt_model.fit_transform(filt_tf_idf)

  # Word scores: one row per word, one column per topic (labelled 0..n-1)
  word_topic_scores = pd.DataFrame(filt_model.components_, columns=filt_feature_names).T
  n_topics = word_topic_scores.shape[1]

  # Max score and best topic are computed from the topic columns only, so that
  # every topic (including topic 0) can be chosen as the associated topic
  filt_word_score_df = word_topic_scores.copy()
  filt_word_score_df['Max_score'] = word_topic_scores.max(axis=1)
  filt_word_score_df['Topic_Asoc'] = word_topic_scores.idxmax(axis=1)
  filt_word_score_df = (
      filt_word_score_df
      .sort_values('Max_score', ascending=False)
      .reset_index()
      .rename(columns={'index': 'Word'})
  )

  # For each topic, rank the words by their score in that topic
  filt_word_score_by_topic = []
  for i in range(n_topics):
    cols = filt_word_score_df[['Word', i]].sort_values(i, ascending=False).reset_index(drop=True)
    cols.columns = ['Topic ' + str(i) + ' Word', 'Topic ' + str(i) + ' Score']
    filt_word_score_by_topic.append(cols)

  # Top words by topic as a single frame; the row position becomes 'Rank'
  filt_word_score_by_topic_df = (
      pd.concat(filt_word_score_by_topic, axis=1)
      .reset_index()
      .rename(columns={'index': 'Rank'})
  )

  # Article/headline scores: the filtered rows followed by one column per topic
  article_topic_scores = pd.DataFrame(article_modeled)
  filt_article_score_df = pd.concat([filt_news_df, article_topic_scores], axis=1)

  # Computed from the topic score frame directly, so the result does not
  # depend on how many metadata columns the input frame has
  filt_article_score_df['Max_score'] = article_topic_scores.max(axis=1)
  filt_article_score_df['Topic_Asoc'] = article_topic_scores.idxmax(axis=1)

  # Returns word-by-topic, article score and word score dataframes
  return [filt_word_score_by_topic_df, filt_article_score_df, filt_word_score_df]

In [None]:
#@title Runing The NMF Model

# Each nmf_by_filter call returns a list of three dataframes:
# [word_score_by_topic_df, article_score_df, word_score_df].

# All years together (currently disabled)
# dfs_all_year = nmf_by_filter(nun_comp=60)

# By year: only 2016 is run here; the other years are left commented out,
# each with its own number of topics (nun_comp)
dfs_2016 = nmf_by_filter(year=2016, nun_comp=50)
# dfs_2017 = nmf_by_filter(year=2017, nun_comp=50)
# dfs_2018 = nmf_by_filter(year=2018, nun_comp=60)
# dfs_2019 = nmf_by_filter(year=2019, nun_comp=40)
# dfs_2020 = nmf_by_filter(year=2020, nun_comp=10)

In [None]:
# First rows of the word-score-by-topic frame (element 0 of the returned list)
dfs_2016[0].head()

Unnamed: 0,Rank,Topic 0 Word,Topic 0 Score,Topic 1 Word,Topic 1 Score,Topic 2 Word,Topic 2 Score,Topic 3 Word,Topic 3 Score,Topic 4 Word,...,Topic 45 Word,Topic 45 Score,Topic 46 Word,Topic 46 Score,Topic 47 Word,Topic 47 Score,Topic 48 Word,Topic 48 Score,Topic 49 Word,Topic 49 Score
0,0,trump,5.28921,share,3.646196,us,4.503317,clinton,2.900146,profit,...,world,2.422052,market,2.398414,bln,1.655838,bill,2.074253,update,2.481521
1,1,poll,0.287683,per,3.481434,st,0.250524,hillary,1.323375,net,...,going,0.912393,could,1.442341,group,1.099958,senate,1.067592,growth,0.311746
2,2,would,0.168997,earnings,2.365947,data,0.189681,poll,0.483764,q1,...,around,0.80977,stock,0.585244,unit,0.698124,puerto,0.490989,provides,0.184691
3,3,rally,0.149775,loss,1.459404,military,0.1752,email,0.152101,h1,...,today,0.807451,cramer,0.400677,yuan,0.647842,rico,0.470166,stocksfactors,0.153869
4,4,republicans,0.143753,q2,1.095093,syria,0.159704,fbi,0.147285,zlotys,...,lobbying,0.24435,rally,0.199455,plans,0.633129,energy,0.37491,1uk,0.13534


In [None]:
# First rows of the article/headline score frame (element 1 of the returned list)
dfs_2016[1].head()

Unnamed: 0,date,year,month,day,title,publication,0,1,2,3,...,42,43,44,45,46,47,48,49,Max_score,Topic_Asoc
0,2016-09-15 16:46:51,2016,9.0,15,albariño wine forget context,The New York Times,0.0,9.9e-05,0.00021,0.0,...,0.0,2.8e-05,0.0,0.000158,8e-06,0.000118,0.0,0.000349,0.000491,27
1,2016-03-14,2016,3.0,14,cramer gw pharma finds oxycodone replacement,CNBC,0.0,0.000255,0.000211,0.000138,...,0.0,0.0,0.0,0.001274,0.014798,0.002757,2.8e-05,7.7e-05,0.014798,46
2,2016-12-11,2016,12.0,11,briefblue sky alternative investments says us ...,Reuters,0.0,0.0,0.032761,0.0,...,0.0,0.003631,2e-06,0.0,0.0,0.002526,0.0,0.000354,0.052237,5
3,2016-01-22 04:46:03,2016,1.0,22,‘liberty coercion’ gary gerstle,The New York Times,4.5e-05,0.0,0.0,2e-05,...,0.0,0.000142,0.000388,0.000107,0.000245,0.0,0.001684,0.0,0.001684,48
4,2016-08-20 00:00:00,2016,8.0,20,white may saved florida faceeating suspect sho...,Vice News,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00018,0.0,0.0,0.0,0.0,0.0,0.10733,36


In [None]:
# First rows of the word score frame (element 2 of the returned list)
dfs_2016[2].head()

Unnamed: 0,Word,0,1,2,3,4,5,6,7,8,...,42,43,44,45,46,47,48,49,Max_score,Topic_Asoc
0,win,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.864115,11
1,big,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.003177,0.0,0.0,0.0,0.0,0.0,6.255719,11
2,gop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.843846,17
3,debate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.416019,18
4,trump,5.28921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001589,0.0,0.001213,0.0,0.0,0.0,0.0,0.0,5.28921,15


In [None]:
#@title Saving Model Dataframes as Pickle Files

"""
dfs_2016[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2016_df.pkl")
dfs_2017[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2017_df.pkl")
dfs_2018[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2018_df.pkl")
dfs_2019[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2019_df.pkl")
dfs_2020[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2020_df.pkl")

dfs_2016[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_2016_df.pkl")
dfs_2017[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2017.pkl")
dfs_2018[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2018.pkl")
dfs_2019[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2019.pkl")
dfs_2020[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2020.pkl")

dfs_all_year[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_all_year_df.pkl")
dfs_all_year[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_all_year_df.pkl")
"""

'\ndfs_2016[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2016_df.pkl")\ndfs_2017[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2017_df.pkl")\ndfs_2018[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2018_df.pkl")\ndfs_2019[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2019_df.pkl")\ndfs_2020[0].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/word_score_2020_df.pkl")\n\ndfs_2016[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_2016_df.pkl")\ndfs_2017[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2017.pkl")\ndfs_2018[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2018.pkl")\ndfs_2019[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2019.pkl")\ndfs_2020[1].to_pickle("/content/drive/MyDrive/CSE 412 Group Project/Data/aricle_score_df_2020.pk