# FOX Word Clouds #

In this notebook, I build one word cloud per month from 12 months of FOX News sentence data.

In [1]:
import pandas as pd
import numpy as np
import re

from datetime import datetime
from dateutil.relativedelta import relativedelta

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [5]:
# Load the combined sentence-level FOX data and preview the first rows.
fox_csv_path = '../data/interim/fox-last-year-sent-comb.csv'
fox_df = pd.read_csv(fox_csv_path)
fox_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sentence,start_snip,end_snip,contributor,runtime,start_time,stop_time,identifier,subjects
0,0,0,week.,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
1,1,1,tucker: tune in every night to the sworn enem...,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
2,2,2,have a great memorial day evening. see you tom...,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
3,3,4,"sean: looking to the special edition of ""ha...",0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."
4,4,5,let's go to a flashback.,0,60,FOXNEWSW,01:00:58,2019-05-28 01:00:00,2019-05-28 02:00:59,FOXNEWSW_20190528_010000_Hannity_Memorial_Day_...,"['biden', 'russia', 'alec baldwin', 'donald tr..."


In [6]:
# Drop the 'Unnamed: *' helper columns (presumably index columns left over
# from earlier CSV saves -- confirm against the upstream notebook).
# errors='ignore' keeps this cell idempotent: because it reassigns `fox_df`,
# re-running it without the flag raises KeyError once the columns are gone.
fox_df = fox_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [7]:
# Number of rows (one row per sentence) currently loaded.
fox_df.shape[0]

2474856

In [8]:
# Parse both broadcast timestamp columns into datetime64 so they can be
# compared against month boundaries, then show the resulting schema.
for time_col in ('start_time', 'stop_time'):
    fox_df[time_col] = pd.to_datetime(fox_df[time_col])
fox_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474856 entries, 0 to 2474855
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   sentence     object        
 1   start_snip   int64         
 2   end_snip     int64         
 3   contributor  object        
 4   runtime      object        
 5   start_time   datetime64[ns]
 6   stop_time    datetime64[ns]
 7   identifier   object        
 8   subjects     object        
dtypes: datetime64[ns](2), int64(2), object(5)
memory usage: 169.9+ MB


In [15]:
# Stop words = NLTK's English list + apostrophe-free contraction variants
# + hand-picked filler words for this corpus.
nltk_stop_words = stopwords.words('english')

# The sentence cleaner (clean_sent) strips apostrophes before vectorising, so
# "don't" reaches the vectorizer as "dont". NLTK only lists the apostrophe
# form, which would let those contractions slip through the filter; add the
# stripped variants as well.
# NOTE(review): depending on the NLTK version this can also add forms such as
# "well"/"hell" (from "we'll"/"he'll"), which are indistinguishable from the
# contraction once apostrophes are removed.
contraction_variants = [word.replace("'", "") for word in nltk_stop_words if "'" in word]

extra_stop_words = ['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
                    '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
                    'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
                    'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
                    'line', 'also', 'may', 'take', 'come', 'hi', 'ha', 'le', 'u', 'wa', 'thi',
                    'to', 'one']

# dict.fromkeys de-duplicates while preserving order. The result stays a plain
# list because recent scikit-learn versions validate `stop_words` as a list.
stop_words = list(dict.fromkeys(nltk_stop_words + contraction_variants + extra_stop_words))

In [16]:
def clean_sent(sentences):
    """Lazily normalise raw sentences before they are vectorised.

    Each sentence goes through three passes, in this order:

    1. every run of whitespace (spaces, tabs, newlines) becomes one space;
    2. digits and the punctuation characters ``, . / ! # $ % & ' " : ; > ? @
       [ ] ` ) ( +`` are deleted;
    3. every run of hyphens becomes one space.

    Because deletion happens after whitespace collapsing, the result may
    still contain doubled or trailing spaces.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences. Non-string items (e.g. NaN) raise ``TypeError``.

    Yields
    ------
    str
        The cleaned sentence, one per input item, in input order.
    """
    # Raw-string patterns: the previous non-raw literals contained invalid
    # escape sequences such as "\s" and "\d", which trigger a SyntaxWarning on
    # recent Python versions. Compiled once per call instead of per sentence.
    whitespace_run = re.compile(r'\s+')
    # The character class already contains the apostrophe, so no separate
    # quote-removal pass is needed.
    digits_and_punct = re.compile(r"""[\d,./!#$%&'":;>?@\[\]`)(+]+""")
    hyphen_run = re.compile(r'-+')

    for sent in sentences:
        sent = whitespace_run.sub(' ', sent)
        sent = digits_and_punct.sub('', sent)
        sent = hyphen_run.sub(' ', sent)
        yield sent

In [17]:
# Bigram counter shared by every month.
vectorizer_options = dict(
    strip_accents='unicode',   # fold accented characters to their base form
    stop_words=stop_words,     # custom list built in the stop-word cell
    min_df=2,                  # keep terms found in at least 2 sentences
    max_df=0.3,                # drop terms found in more than 30% of sentences
    ngram_range=(2, 2),        # two-word phrases only
)
vect = CountVectorizer(**vectorizer_options)

In [18]:
# Slice the data into 12 consecutive one-month windows, anchored at the
# earliest broadcast start time. Each window is half-open: [open, close).
start_date = fox_df['start_time'].min()
month_df_list = []
for offset in range(12):
    window_open = start_date + relativedelta(months=offset)
    window_close = window_open + relativedelta(months=1)
    in_window = fox_df['start_time'].ge(window_open) & fox_df['start_time'].lt(window_close)
    month_df_list.append(fox_df[in_window])
    # Row count of the window just added.
    print(len(month_df_list[-1]))

print(len(month_df_list))

222589
202614
223310
236944
168128
249392
214486
234296
234291
193337
153342
142127
12


In [19]:
# Build one {bigram: total count} dict per monthly frame.
count_dicts = []
for month_frame in month_df_list:
    corpus = list(clean_sent(month_frame['sentence'].tolist()))
    # The vectorizer is re-fitted for each month, so every month gets its own
    # vocabulary and its own min_df / max_df filtering.
    count_data = vect.fit_transform(corpus)

    # scikit-learn renamed get_feature_names() to get_feature_names_out() and
    # removed the old name in 1.2; support both so the cell runs on either.
    if hasattr(vect, 'get_feature_names_out'):
        words = vect.get_feature_names_out()
    else:
        words = vect.get_feature_names()

    # Column sums of the sparse document-term matrix. This replaces a Python
    # loop that densified every row; counts stay float to match what the
    # previous np.zeros accumulator produced.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    count_dicts.append(dict(zip(words, total_counts)))
    # Progress indicator: number of months processed so far.
    print(len(count_dicts))

1
2
3
4
5
6
7
8
9
10
11
12


In [20]:
# Create one WordCloud object and reuse it for every month.
# random_state seeds the layout/colour choices so a fresh Restart & Run All
# reproduces the same images.
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3,
                      contour_color='steelblue', random_state=42)
for i in range(12):
    # Lay out the cloud from this month's bigram counts.
    wordcloud.generate_from_frequencies(count_dicts[i])
    # Write the PNG. to_file() renders the image itself, so the previous bare
    # `wordcloud.to_image()` call inside the loop only rendered each cloud a
    # second time and displayed nothing (only the last expression of a cell is
    # shown). To preview a cloud inline, make `wordcloud.to_image()` the last
    # line of its own cell.
    wordcloud.to_file('../reports/figures/month_' + str(i) + 'fox.png')

In [21]:
# Save each month's bigram counts as a single-row CSV (one column per bigram).
for month_no in range(12):
    month_counts = count_dicts[month_no]
    out_path = r'../reports/fox_month_' + str(month_no) + 'word_frequencies.csv'
    pd.DataFrame(month_counts, index=[0]).to_csv(out_path)