In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import math
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re

%matplotlib inline

In [2]:
data = pd.read_csv('NewsArticles.csv')

In [3]:
# The data parsed in with extra blank columns due to encoding formatting.
data.head(10)

Unnamed: 0,article_id,publish_date,article_source_link,title,subtitle,text,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143
0,1,2/7/17,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...,,,,,...,,,,,,,,,,
1,2,2/7/17,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...,,,,,...,,,,,,,,,,
2,3,2/7/17,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...,,,,,...,,,,,,,,,,
3,4,2/7/17,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ...",,,,,...,,,,,,,,,,
4,5,2/7/17,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...,,,,,...,,,,,,,,,,
5,6,2/7/17,http://abcnews.go.com/International/wireStory/...,Mother of Backpacker Slain in Australia Critic...,,The mother of a backpacker slain in an Austral...,,,,,...,,,,,,,,,,
6,7,2/7/17,http://abcnews.go.com/Politics/trumps-labor-se...,Trump's Labor Secretary Pick Andrew Puzder Adm...,,"Donald Trump's pick for labor secretary, Andre...",,,,,...,,,,,,,,,,
7,8,2/7/17,http://abcnews.go.com/International/wireStory/...,Iran's Top Leader Mocks 'Newcomer' Trump,,"Iran's supreme leader said Tuesday that ""newco...",,,,,...,,,,,,,,,,
8,9,2/7/17,http://abcnews.go.com/International/wireStory/...,EU to Britain: Pay Up for What You Ordered Bef...,,The European Union is warning Britain that any...,,,,,...,,,,,,,,,,
9,10,2/7/17,http://abcnews.go.com/US/multi-state-manhunt-s...,Multi-State Manhunt in Southeast Intensifies f...,,A manhunt is intensifying in the Southeast for...,,,,,...,,,,,,,,,,


In [4]:
# All 'Unnamed' columns can be dropped.
data.columns

Index(['article_id', 'publish_date', 'article_source_link', 'title',
       'subtitle', 'text', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
       'Unnamed: 9',
       ...
       'Unnamed: 134', 'Unnamed: 135', 'Unnamed: 136', 'Unnamed: 137',
       'Unnamed: 138', 'Unnamed: 139', 'Unnamed: 140', 'Unnamed: 141',
       'Unnamed: 142', 'Unnamed: 143'],
      dtype='object', length=144)

In [5]:
# Keeping what we need, storing to df. 
df = pd.DataFrame()
cols_keep = ['article_id', 'publish_date', 'article_source_link', 'title',
       'subtitle', 'text']

for col in cols_keep:
    df[col] = data[col]

In [6]:
df.head()

Unnamed: 0,article_id,publish_date,article_source_link,title,subtitle,text
0,1,2/7/17,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...
1,2,2/7/17,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...
2,3,2/7/17,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...
3,4,2/7/17,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ..."
4,5,2/7/17,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...


In [7]:
# Lets take a look at the first article by ABC News
print(df['text'][0])

Michigan billionaire education activist Betsy DeVos was confirmed today to serve as the secretary of education in President Trump's administration, after Vice President Mike Pence cast a tie-breaking vote in the Senate. The Senate voted on DeVos"?highly contentious nomination this afternoon, and the tally was split evenly, requiring Pence to use his authority as president of the upper chamber of Congress to break the impasse. This was the first time that a vice president has broken a tie to confirm a Cabinet nominee. Pence read the vote count 50-50 and then voted himself, rendering the tally 51-50. The day before the vote, Democrats staged a 24-hour marathon of speeches, with more than 30 lawmakers taking to the floor to urge at least one additional Republican to vote against DeVos and block her confirmation. "It is hard to imagine a worse choice,"?Sen. Elizabeth Warren, D-Mass., said before she read letters from constituents urging her to vote no. DeVos stirred up vehement opposition 

In [8]:
# Going to start with a quick latent semantic analysis of just the article text.
corpus = df['text']

In [9]:
# Number of documents in the corpus
len(corpus)

3824

In [10]:
corpus = corpus.astype(str)

In [11]:
# Convert all text to lower-case
corpus = [x.lower() for x in corpus]

In [12]:
print(corpus[0])

michigan billionaire education activist betsy devos was confirmed today to serve as the secretary of education in president trump's administration, after vice president mike pence cast a tie-breaking vote in the senate. the senate voted on devos"?highly contentious nomination this afternoon, and the tally was split evenly, requiring pence to use his authority as president of the upper chamber of congress to break the impasse. this was the first time that a vice president has broken a tie to confirm a cabinet nominee. pence read the vote count 50-50 and then voted himself, rendering the tally 51-50. the day before the vote, democrats staged a 24-hour marathon of speeches, with more than 30 lawmakers taking to the floor to urge at least one additional republican to vote against devos and block her confirmation. "it is hard to imagine a worse choice,"?sen. elizabeth warren, d-mass., said before she read letters from constituents urging her to vote no. devos stirred up vehement opposition 

In [13]:
# Import NTLK stopset
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jsche4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# This cell will be used to add new exclusion words to the stopset before and/or after model testing.
# Note: Most of the words below were added over the course of numerous output testing efforts.

stopset = set(stopwords.words('english'))

stopset.update(['facebook','whatsapp','digg','reddit','newsvine','permalink','tumblr','linkedin','http','dw','com'])

In [15]:
# Define the vectorizer model
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 4),smooth_idf=True)

# Fit the corpus data
X = vectorizer.fit_transform(corpus)

In [16]:
# First article vectorized
print(X[0])

  (0, 1654612)	0.0368487587678
  (0, 352153)	0.0368487587678
  (0, 848000)	0.0368487587678
  (0, 102445)	0.0368487587678
  (0, 340347)	0.0295069368516
  (0, 770820)	0.035102546157
  (0, 595129)	0.0338635885925
  (0, 2697259)	0.0338635885925
  (0, 2397214)	0.035102546157
  (0, 2366111)	0.0629069935165
  (0, 848565)	0.0368487587678
  (0, 2033749)	0.0163211854688
  (0, 2744989)	0.0149115546552
  (0, 118875)	0.0368487587678
  (0, 2844337)	0.0350351849327
  (0, 2031463)	0.0214156528448
  (0, 1659726)	0.0209618977437
  (0, 1926173)	0.0338635885925
  (0, 451412)	0.0338635885925
  (0, 2681722)	0.0321173759817
  (0, 383519)	0.0321173759817
  (0, 2862626)	0.0598348165394
  (0, 2386065)	0.0368487587678
  (0, 2386211)	0.0321173759817
  (0, 2863243)	0.035102546157
  :	:
  (0, 2558388)	0.0368487587678
  (0, 2093894)	0.0368487587678
  (0, 2351944)	0.0368487587678
  (0, 594275)	0.0368487587678
  (0, 2862632)	0.0368487587678
  (0, 2385587)	0.0368487587678
  (0, 1203416)	0.035102546157
  (0, 848398)	0.0

In [17]:
# The current shape is (documents, terms)
X.shape

(3824, 3012078)

In [18]:
# Defining the TruncatedSVD model

# Params: n_components=100 for LSA per sk-learn doc, n_iter=5 (default, and should be adjusted during testing) 
lsa = TruncatedSVD(n_components=100, n_iter=5)

# Fit the model
lsa.fit(X)


TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
       random_state=None, tol=0.0)

In [19]:
import sys
print (sys.version)

3.6.0 |Anaconda custom (x86_64)| (default, Dec 23 2016, 13:19:00) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


In [20]:
# Convert the SVD results from numerical representation, back to their appropriate word text form.
# Iterates over the enumeration of matrix components, for each: zips the terms to components, sorts them, then prints. 
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
awards coverage
awards coverage sent
awards coverage sent right
bits industry
bits industry awards
bits industry awards coverage
celebrity news
celebrity news hilarious
celebrity news hilarious late
coverage sent
 
Concept 1:
north korea
kim jong
north korean
jong nam
kim jong nam
south korea
kuala lumpur
white house
jong un
kim jong un
 
Concept 2:
white house
mr trump
donald trump
trump first
president donald
president donald trump
intelligence committee
first 100
first 100 days
100 days
 
Concept 3:
provisional suspensions
russian cross
russian cross country
cross country
alexey petukhov
evgenia shapovalova
ivanova evgenia
ivanova evgenia shapovalova
julia ivanova
julia ivanova evgenia
 
Concept 4:
trump first 100 days
trump first 100
first 100 days
first 100
100 days
trump first
100 days impact
days impact
first 100 days impact
huffington post
 
Concept 5:
orly airport
car later found
die allah
car later
south paris
ben belgacem
later found
put gun
garges les
garges les 

Concept 53:
la la
la la land
la land
ukrainian military
contact group
south africa
lavrov said
best picture
however ukrainian
however ukrainian military
 
Concept 54:
al bab
ukrainian military
syrian government
contact group
impact learn
100 days impact learn
days impact learn
march 20
however ukrainian
however ukrainian military
 
Concept 55:
year old
mr trump
white house
pic twitter
anti doping
ms mclaughlin
tennis player
tennis tournament
independence referendum
summer olympics
 
Concept 56:
northern ireland
mr kenny
march 20
mr brokenshire
vladimir putin
president vladimir
president vladimir putin
le pen
climate change
sioux tribe
 
Concept 57:
aerospace force
national insurance
vitaly churkin
mr trump
press office
nuclear weapons
combat readiness
human rights
mr bailey
moscow february
 
Concept 58:
la la
la la land
la land
mr bailey
le pen
white house
best picture
nuclear weapons
northern ireland
intelligence committee
 
Concept 59:
south africa
al bab
national insurance
northern 

From the LSA, I was able to get a quick snapshot at the most significant concepts across the entire corpus of news articles. These articles are not based on one topic, rather are random as seen above some may involve entertainment/celebrity news while others state of national affairs and politics. For this model, I'm mainly interested in those articles which can potentially carry a positive or negative bias politically in some way. I am not at all concerned with sports, entertainment, or the latest viral trend on youtube (unless it happens to be polically related!). 

I will implement K-means clustering on the titles of the articles to see if I can classify/label them into categories based on their topic. After, I can then disregard the non-relevant articles and begin work on the bias classifier.

In [21]:
titles = df['title']

In [22]:
print(titles)

0       Betsy DeVos Confirmed as Education Secretary, ...
1       Melania Trump Says White House Could Mean Mill...
2       As Trump Fears Fraud, GOP Eliminates Election ...
3       Appeals Court to Decide on Challenge to Trump'...
4       At Least 4 Tornadoes Reported in Southeast Lou...
5       Mother of Backpacker Slain in Australia Critic...
6       Trump's Labor Secretary Pick Andrew Puzder Adm...
7                Iran's Top Leader Mocks 'Newcomer' Trump
8       EU to Britain: Pay Up for What You Ordered Bef...
9       Multi-State Manhunt in Southeast Intensifies f...
10      Flu Takes a Toll in NYC With 4 Children Report...
11      Romania Protests Endure as President Says Coun...
12      Hillary Clinton Releases Video Statement: 'Fut...
13      Homeland Security Secretary John Kelly Defends...
14      2 Other Times Kellyanne Conway Referred to Bow...
15      Trump Says 'Very Dishonest Press Does Not Want...
16      Tom Brady Says Wife Gisele Bundchen Wants Him ...
17      Texas 

In [24]:
# Cast to string type, and lowercase all chars
titles = titles.astype(str)
titles = [x.lower() for x in titles]

In [25]:
# Define the vectorizer model for titles (K-means prep)
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 4),smooth_idf=True)

# Fit the corpus data
X = vectorizer.fit_transform(titles)

In [26]:
titles[0]

'betsy devos confirmed as education secretary, with pence casting historic tie-breaking vote'

In [28]:
print(X[0])

  (0, 5108)	0.173227040217
  (0, 12671)	0.189327738927
  (0, 9708)	0.189327738927
  (0, 14274)	0.173227040217
  (0, 41044)	0.198746043907
  (0, 33399)	0.198746043907
  (0, 7465)	0.198746043907
  (0, 21109)	0.189327738927
  (0, 46737)	0.189327738927
  (0, 6184)	0.189327738927
  (0, 5109)	0.189327738927
  (0, 12672)	0.189327738927
  (0, 9709)	0.189327738927
  (0, 14279)	0.198746043907
  (0, 41045)	0.198746043907
  (0, 33400)	0.198746043907
  (0, 7466)	0.198746043907
  (0, 21110)	0.189327738927
  (0, 46738)	0.189327738927
  (0, 5110)	0.189327738927
  (0, 12673)	0.189327738927
  (0, 9710)	0.198746043907
  (0, 14280)	0.198746043907
  (0, 41046)	0.198746043907
  (0, 33401)	0.198746043907
  (0, 7467)	0.198746043907
  (0, 21111)	0.189327738927


In [29]:
def find_k (X, k_range, sample_percent=1):
    """
    k_range: a list of possible k values
    X, the data we're clustering on
    """
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.spatial.distance import cdist
    from sklearn.metrics import pairwise_distances

    N = X.shape[0]
    sampleSize = X.shape[0] * sample_percent

    if sampleSize > 0:
        index = np.arange(np.shape(X)[0])
        np.random.shuffle(index)
        X =  X[index, :]


    mean_distortions=[]
    for k in k_range:
        #cluster using k, then calculate the mean distortion (average distance to closest centroid)
        kmeans_model = KMeans(n_clusters=k, init='k-means++', n_jobs=-1).fit(X)
        mean_distortions.append(sum(np.min(pairwise_distances(X, kmeans_model.cluster_centers_,
                                                              metric='euclidean'),axis=1)) / X.shape[0])


    #visualize results
    plt.plot(k_range, mean_distortions)
    plt.xlabel("K Value")
    plt.ylabel("Mean Distortion")
    plt.title("Elbow Graph for Mean Distortion per K")
    plt.show()

In [32]:
df.columns

Index(['article_id', 'publish_date', 'article_source_link', 'title',
       'subtitle', 'text'],
      dtype='object')

In [33]:
df.article_source_link

0       http://abcnews.go.com/Politics/pence-break-tie...
1       http://abcnews.go.com/Politics/wireStory/melan...
2       http://abcnews.go.com/Politics/wireStory/trump...
3       http://abcnews.go.com/Politics/appeals-court-d...
4       http://abcnews.go.com/US/23-states-winter-weat...
5       http://abcnews.go.com/International/wireStory/...
6       http://abcnews.go.com/Politics/trumps-labor-se...
7       http://abcnews.go.com/International/wireStory/...
8       http://abcnews.go.com/International/wireStory/...
9       http://abcnews.go.com/US/multi-state-manhunt-s...
10      http://abcnews.go.com/Health/flu-takes-toll-ny...
11      http://abcnews.go.com/International/wireStory/...
12      http://abcnews.go.com/US/wireStory/hillary-cli...
13      http://abcnews.go.com/Politics/homeland-securi...
14      http://abcnews.go.com/Politics/times-kellyanne...
15      http://abcnews.go.com/Politics/trump-dishonest...
16      http://abcnews.go.com/Entertainment/tom-brady-...
17      http:/

In [36]:
# Cleanning the Article URLs, only want the domains for classifying between news organizations.
i = 0
from urllib.parse import urlparse
for url in df.article_source_link:
    domain = url.split("//")[-1].split("/")[0]
    df.article_source_link[i] = domain
    i+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [38]:
df.article_source_link.value_counts()

www.aljazeera.com           559
tass.com                    485
abcnews.go.com              474
www.rte.ie                  443
www.dw.com                  436
www.huffingtonpost.com      436
europe.chinadaily.com.cn    360
www.bbc.co.uk               355
www.cnn.com                 276
Name: article_source_link, dtype: int64

In [39]:
# Definitely could have done this more efficiently, but now I want only the news organization names.
# Labeling as domain_countrycode (SA=Saudi Arabia, RU=Russia, US=United States, IE=Ireland, GE=Germany, CN=China, UK=United Kingdom)
i = 0
from urllib.parse import urlparse
for url in df.article_source_link:
    if (url == 'www.aljazeera.com'):
        df.article_source_link[i] = 'ALJAZEERA_SA'
    if (url == 'tass.com'):
        df.article_source_link[i] = 'TASS_RU'
    if (url == 'abcnews.go.com'):
        df.article_source_link[i] = 'ABC_US'
    if (url == 'www.rte.ie'):
        df.article_source_link[i] = 'RTE_IE'
    if (url == 'www.dw.com'):
        df.article_source_link[i] = 'DW_GE'
    if (url == 'www.huffingtonpost.com'):
        df.article_source_link[i] = 'HUFFINGTONPOST_US'
    if (url == 'europe.chinadaily.com.cn'):
        df.article_source_link[i] = 'CHINADAILY_CN'
    if (url == 'www.bbc.co.uk'):
        df.article_source_link[i] = 'BBC_UK'
    if (url == 'www.cnn.com'):
        df.article_source_link[i] = 'CNN_US'
    i+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [42]:
df['article_source_link'].value_counts()

ALJAZEERA_SA         559
TASS_RU              485
ABC_US               474
RTE_IE               443
HUFFINGTONPOST_US    436
DW_GE                436
CHINADAILY_CN        360
BBC_UK               355
CNN_US               276
Name: article_source_link, dtype: int64

In [43]:
df.head()

Unnamed: 0,article_id,publish_date,article_source_link,title,subtitle,text
0,1,2/7/17,ABC_US,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...
1,2,2/7/17,ABC_US,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...
2,3,2/7/17,ABC_US,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...
3,4,2/7/17,ABC_US,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ..."
4,5,2/7/17,ABC_US,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...


In [46]:
# More than half of our subtitle feature values are missing.
df.subtitle.isnull().value_counts()

False    2337
True     1487
Name: subtitle, dtype: int64

In [48]:
# For the purpose of this analysis, I will not be using the dates (at least yet), I will pop and pickle that feature.
from sklearn.externals import joblib
dates = df.pop('publish_date')
joblib.dump(dates, 'dates_col.pkl') 

['dates_col.pkl']

In [50]:
# The article_id number appears to be sequential and meaningless. I will just drop that column entirely.
df.drop('article_id',axis=1,inplace=True)

In [52]:
# I'm still undecided how I plan to approach/use the subtitle feature yet.
# For now I will mark all the missing values and leave it in tact.
df['subtitle'].fillna("Missing", inplace=True)

In [54]:
# Finally, I will checkpoint here by saving my prepped df to pickle.
joblib.dump(df, 'df_prepped.pkl') 

['df_prepped.pkl']