# News Headlines TF-IDF/Keyword Extraction

In [1]:
# import libraries
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import pytrends
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [2]:
# exclude stop words, punctuation, one letter word 
stopwords = {'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                       'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                       'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                       'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                       'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an',
                       'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                       'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before',
                       'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                       'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
                       'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                       'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
                       'just', 'should', 'now', '', 'a', '£m'}

# clean up headlines and return set of words in headline
def cleanHeadline(hl):
    hl = hl.lower()
    hl = re.sub(r"[-,\"@\'?\.$%_\d\+\:]", "", hl, flags=re.I) # rid of punctuation
    hl = re.sub(r"\s+", " ", hl, flags=re.I) # rid of multiple spaces
#     hl = re.split(r"\s+", hl)
    return hl

# TF-IDF

In [15]:
# helper functions for tf-idf
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [16]:
# get tf-idf scores for each headline
def tf_idf(data, col):
    #get the text column 
    # docs = df_sub[df_sub['category'] == "MONEY"]['headline'].tolist()
    docs = data[col].tolist()

    cv=CountVectorizer(max_df=1.0,stop_words=stopwords)
    word_count_vector=cv.fit_transform(docs)

    # compute IDF
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    #generate tf-idf for all headlines in your list. 
    feature_names=cv.get_feature_names()
    tf_idf_vector=tfidf_transformer.transform(cv.transform(docs))

    results=[]
    for i in range(tf_idf_vector.shape[0]):
        # get vector for a single headline
        curr_vector=tf_idf_vector[i]

        #sort the tf-idf vector by descending order of scores
        sorted_items=sort_coo(curr_vector.tocoo())

        #extract only the top n; n here is 20
        keywords=extract_topn_from_vector(feature_names,sorted_items,20)

        results.append(keywords)

    df=pd.DataFrame(zip(docs,results),columns=['doc','keywords'])
    df.head()
    return df

In [17]:
# get all keywords and their tf-idf scores into a dict 
def getAllWords(df):
    allWords = {}
    for index, row in df.iterrows():
        tmpDict = row['keywords']
        for word in tmpDict:
            if word not in allWords:
                allWords[word] = [tmpDict[word]]
            else:
                allWords[word] += [tmpDict[word]]
    return allWords

# average tf-idf scores and get top n words
def getTopWords(df, n):
    word_df = pd.DataFrame.from_dict(getAllWords(df), orient='index')
    word_df.transpose()
    avg_df = word_df.mean(axis=1) # average tf-idf scores for each word
    avg_df = pd.DataFrame(avg_df)
    avg_df = avg_df.reset_index()
    avg_df.columns = ['word', 'avg_score']
    avg_df = avg_df.sort_values('avg_score', ascending=False)
    return avg_df['word'].head(n).tolist()
    
# print(getTopWords(df, 200))

# The Guardian TF-IDF

In [6]:
guard_df = pd.read_csv('the_guardian_headlines_dataset.csv')
guard_df['Article Title'] = guard_df['Article Title'].apply(lambda x:cleanHeadline(x)) # clean headlines

# Bag of Words Model

In [9]:
# Bag of words model
def BOW(data, col):
    data[col] = data[col].apply(lambda x:cleanHeadline(x)) # clean headlines
    docLst = data[col].tolist()
    doc = "".join(docLst)
    count_vec = CountVectorizer(max_df=1.0,stop_words=stopwords)
    count_occurs = count_vec.fit_transform([doc])
    count_occur_df = pd.DataFrame(
        (count, word) for word, count in
         zip(count_occurs.toarray().tolist()[0], 
        count_vec.get_feature_names()))
    count_occur_df.columns = ['Word', 'Count']
    count_occur_df.sort_values('Count', ascending=False, inplace=True)
    return count_occur_df

# Other News Sources

In [16]:
# Reuters headlines
reuters_bus = pd.read_csv('reutersbusiness_headlines_dataset.csv')
r_bus_count = BOW(reuters_bus, 'Article Title')
print(r_bus_count.head(200))

reuters_bv = pd.read_csv('reutersbreakingviews_headlines_dataset.csv')
r_bv_count = BOW(reuters_bv, 'Article Title')
print(r_bv_count.head(200))

reuters_markets = pd.read_csv('reutersmarkets_headlines_dataset.csv')
r_markets_count = BOW(reuters_markets, 'Article Title')
print(r_markets_count.head(200))

reuters_tech = pd.read_csv('reuterstech_headlines_dataset.csv')
r_tech_count = BOW(reuters_tech, 'Article Title')
print(r_tech_count.head(200))

reuters_wealth = pd.read_csv('reuterswealth_headlines_dataset.csv')
r_wealth_count = BOW(reuters_wealth, 'Article Title')
print(r_wealth_count.head(200))

           Word  Count
41967        us   4131
33817      says   2743
40592     trade   2186
3412    billion   1568
25351       new   1453
...         ...    ...
21215  launches    158
25999     offer    158
5526       case    157
19798  investor    156
43627      wins    156

[200 rows x 2 columns]


44706

In [27]:
# get rid of duplicate words and return df of top words
def aggWords(datas):
    df = pd.DataFrame(columns=['Word', 'Count'])
    for data in datas:
        df = pd.concat([df, data[0:500]]) # gets top 500 words for each category
    df = df.drop_duplicates(subset='Word')
    return df

# get top 1000 words from reuters headlines 
df = aggWords([r_bus_count, r_bv_count, r_markets_count, r_tech_count, r_wealth_count])
print(df)
# convert top words to csv
df['Word'].head(1000).to_csv('top1000words.csv')

           Word Count
41967        us  4131
33817      says  2743
40592     trade  2186
3412    billion  1568
25351       new  1453
...         ...   ...
4586    develop    99
4635       didi    99
18022     using    98
10732    misses    98
4062   customer    97

[1013 rows x 2 columns]
