<h1>Import titles using Pickle</h1>

In [1]:
# Dependencies
import pickle

def ReadFile(file_name, title_list):
    
    # Open the files ('rb' is for read binary)
    file_object = open(file_name,'rb')
    
    # Load files into list using pickle
    in_list = pickle.load(file_object)
    
    # Close files
    file_object.close()
    
    # Add list just read in to existing list
    title_list.extend(in_list)
    
    return(title_list)

In [2]:
# Crate empty list for left sites
left_title_list = []

# Read left sites
left_title_list = ReadFile("data/atlantic_titles", left_title_list)
left_title_list = ReadFile("data/mjones_titles", left_title_list)
left_title_list = ReadFile("data/newrepublic_titles", left_title_list)
left_title_list = ReadFile("data/nytimes_titles", left_title_list)
left_title_list = ReadFile("data/politico_titles", left_title_list)
left_title_list = ReadFile("data/slate_titles", left_title_list)
left_title_list = ReadFile("data/thedailybeast_titles", left_title_list)
left_title_list = ReadFile("data/theguardian_titles", left_title_list)
left_title_list = ReadFile("data/theintercept_titles", left_title_list)
left_title_list = ReadFile("data/washpost_titles", left_title_list)

#***MAKE SURE THIS NUMBER MATCHES THE NUMBER OF DOCUMENTS IN MONGODB***
print(len(left_title_list))

# Crate empty list for right sites
right_title_list = []

# Read right sites
right_title_list = ReadFile("data/americanconservative_titles", right_title_list)
right_title_list = ReadFile("data/breitbart_titles", right_title_list)
right_title_list = ReadFile("data/dailywire_titles", right_title_list)
right_title_list = ReadFile("data/economist_titles", right_title_list)
right_title_list = ReadFile("data/fiscaltimes_titles", right_title_list)
right_title_list = ReadFile("data/foxnews_titles", right_title_list)
right_title_list = ReadFile("data/nypost_titles", right_title_list)
right_title_list = ReadFile("data/reason_titles", right_title_list)
right_title_list = ReadFile("data/thehill_titles", right_title_list)
right_title_list = ReadFile("data/washtimes_titles", right_title_list)

#***MAKE SURE THIS NUMBER MATCHES THE NUMBER OF DOCUMENTS IN MONGODB***
print(len(right_title_list))

96205
105703


<h1>Add Site Bias to Dataframes</h1>

In [3]:
# Dependencies
import pandas as pd
import numpy as np

# Bias of 1 -> Left; Bias of 0 -> Right
df1 = pd.DataFrame({'title': np.array(left_title_list), 'bias': 1})
df2 = pd.DataFrame({'title': np.array(right_title_list), 'bias': 0})

print(df1.head())
print(df1.shape)
print()
print(df2.head())
print(df2.shape)

   bias                                              title
0     1  Conservative High Schoolers Want to ‘Own the L...
1     1  The Instagram Forums Where Teens Go to Debate ...
2     1  The Doomed Republican Attempt to Impeach Rod R...
3     1                          Facebook Is Probably Fine
4     1                  Secretary of a State of Confusion
(96205, 2)

   bias                                        title
0     0               Fruits Of The Quiet Revolution
1     0               Trust And Mistrust In Churches
2     0   A Democratic President From Trump Country?
3     0  ‘Arab NATO’: A Terrible Idea That Won’t Die
4     0                       TAC Fall Intern Wanted
(105703, 2)


<h1>Downsample Larger Dataframe</h1>

In [4]:
from sklearn.utils import resample

df2_downsampled = resample(df2, replace = False, n_samples = len(df1), random_state = 41)
print(df2_downsampled.shape)

(96205, 2)


<h1>Combine Left & Right Dataframes</h1>

In [5]:
df_combined = pd.concat([df1, df2_downsampled])
print()
print(df_combined.shape)

df_clean = df_combined.dropna()
print(df_clean.shape)


(192410, 2)
(192328, 2)


<h1>Logistic Regression on Words in Title</h1>

In [6]:
# Dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Create a matrix of word count using CountVectorizer
# Count single words and word pairs (ngram_range = 1-2)
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))

# Fit and transform data
X = vectorizer.fit_transform(df_clean['title'])

# Create training and test split
X_train, X_test, y_train, y_test  = train_test_split(X, df_clean['bias'], random_state=41)

# Create the model and fit the training data
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

# Show model's score
print(logreg.score(X_test,y_test))

coef_zip = zip(logreg.coef_.tolist()[0],list(vectorizer.get_feature_names()))
coef_list = list(coef_zip)
coef_list.sort()

0.77394867102


In [None]:
# Show top 20 words/phrases for right and left
for i in range(0,200):
    print(coef_list[i], coef_list[len(coef_list) - i - 1])

<h1>Output Model Results to JSON File</h1>

In [30]:
# Create empty list of dictionaries for titles
d = {}

# Loop through title list
for i in range(0,len(coef_list)-1):
    for item in coef_list[i]:
        if type(item) is str:
            d[item] = logreg.predict_proba(vectorizer.transform([item]))[0][0]
            
sorted_dict = sorted(d.items(), key=lambda x:x[1])

In [70]:
logreg.predict_proba(vectorizer.transform(['undocumented immigrant']))

array([[ 0.0956424,  0.9043576]])

In [71]:
print(sorted_dict[10000:20000])

[('redskins washington post', 6.0666422376454143e-05), ('annotated washington post', 0.00021066202459985028), ('say washington post', 0.00026483490116069319), ('happened washington post', 0.00032990247435149556), ('capitals washington post', 0.00035599844957823201), ('forecast washington post', 0.00035963935818028592), ('cosby washington post', 0.00036614445410820817), ('helped washington post', 0.00038637316756462425), ('favorite washington post', 0.00039762843136703641), ('explained washington post', 0.00040045942630739795), ('brooklyn washington post', 0.00044198090069147433), ('came washington post', 0.00046444443182891426), ('inside washington post', 0.00048555069095934655), ('learn washington post', 0.00048603064108609928), ('pruitt washington post', 0.00048755690874946467), ('stream washington post', 0.00049457774030614576), ('movies washington post', 0.00050186084698866651), ('separations washington post', 0.00051368390844452883), ('episode washington post', 0.00051524021376248

In [68]:
# Create empty list of dictionaries for titles
title_dict = {}

# Loop through title list
for i in range(0,len(coef_list)-1):
    for item in coef_list[i]:
        if type(item) is str:
            #title_json.append({'phrase': item, 'probab': logreg.predict_proba(vectorizer.transform([item]))[0][0]})
            title_dict[item] = logreg.predict_proba(vectorizer.transform([item]))[0][0]
            #title_list.append(title_dict)
            
#title_dict
            
import json

# Write list of dictionaries to JS file
with open ('model-data-new.json', 'w') as outfile:
    json.dump([title_list], outfile)

In [119]:
# Dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Create a matrix of word count using CountVectorizer
# Count single words and word pairs (ngram_range = 1-2)
vectorizer = CountVectorizer(stop_words='english', ngram_range=(2,3))

# Fit and transform data
X = vectorizer.fit_transform(df_combined['title'])

# Create training and test split
X_train, X_test, y_train, y_test  = train_test_split(X, df_combined['bias'], random_state=42, 
                                                    stratify=df_combined['bias'])

# Create the model and fit the training data
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

#logreg.score(X_test,y_test)

a = zip(logreg.coef_.tolist()[0],list(vectorizer.get_feature_names()))
b = list(a)
b.sort()

In [23]:
print(len(vectorizer.get_feature_names()))

1769280


In [135]:
logreg.predict_proba(vectorizer.transform(['syria war']))

array([[ 0.22263179,  0.77736821]])

In [187]:
right_words = ['2nd amendment', 'daca amnesty', 'open borders', 'illegal immigrant', 'islamic state']

right_scores = []

for phrase in right_words:
    score = logreg.predict_proba(vectorizer.transform([phrase]))[0][0] * 100
    right_scores.append(score)
    
left_words = ['undocumented immigrants', 'muslim ban', 'syria war', 'family separations', 'russia inquiry']

left_scores = []

for phrase in left_words:
    score = -logreg.predict_proba(vectorizer.transform([phrase]))[0][1] * 100
    left_scores.append(score)
    
print(right_scores)
print(left_scores)

[90.550753572625112, 93.301483246755623, 94.016395249112975, 96.443661108029289, 96.712372869410416]
[-75.854714214487345, -77.511498328533079, -77.736821015758963, -80.368374524988155, -82.59889590187592]


In [197]:
import plotly
import plotly.graph_objs as go
import numpy as np

left_text = []
for i in range(0,5):
    text = left_words[i] + ' ' + str(round(left_scores[i],1)) + '% '
    left_text.append(text)
    
right_text = []
for i in range(0,5):
    text = right_words[i] + ' ' + str(round(right_scores[i],1)) + '% '
    right_text.append(text)

trace1 = go.Bar(
    y = np.arange(1,6),
    x = left_scores,
    text = left_text,
    textposition = 'auto',
    textfont = dict(size = 16, color = 'white'),
    hoverinfo = 'none',
    name = 'Left',
    orientation = 'h',
    marker = dict(color = 'blue'),
    offset = [-0.5,-1,-1,-1,0]
)
trace2 = go.Bar(
    y = np.arange(1,6),
    x = right_scores,
    text = right_text,
    textposition = 'auto',
    textfont = dict(size = 16, color = 'white'),
    hoverinfo = 'none',
    name = 'Right',
    orientation = 'h',
    marker = dict(color = 'red')
)

data = [trace1, trace2]

layout = go.Layout(
    barmode='group',
    title = '<b>Model Phrase Probability</b>',
    titlefont = dict(
        size = 24
    ),
    yaxis = dict(
        showticklabels = False
    ),
    xaxis = dict(
        title = 'Probability that article has left/right bias',
    )
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename = 'joint-bar.html')

'file://C:\\Users\\spenc\\Documents\\pattern\\joint-bar.html'

In [189]:
data1 = [trace1]
data2 = [trace2]

layout = go.Layout(
    barmode='group',
    title = '<b>Left Bias Probability</b>',
    titlefont = dict(
        size = 24
    ),
    yaxis = dict(
        showticklabels = False
    ),
    xaxis = dict(
        title = 'Probability that article bias is left',
    )
)

fig = go.Figure(data=data1, layout=layout)
plotly.offline.plot(fig, filename = 'left-bar.html')

layout = go.Layout(
    barmode='group',
    title = '<b>Right Bias Probability</b>',
    titlefont = dict(
        size = 24
    ),
    yaxis = dict(
        showticklabels = False
    ),
    xaxis = dict(
        title = 'Probability that article bias is right',
    )
)

fig = go.Figure(data=data2, layout=layout)
plotly.offline.plot(fig, filename = 'right-bar.html')

'file://C:\\Users\\spenc\\Documents\\pattern\\right-bar.html'