<h1>Import titles using Pickle</h1>

In [1]:
# Dependencies
import pickle

def ReadFile(file_name, title_list):
    """Append the contents of one pickled file to an existing list.

    Parameters
    ----------
    file_name : str
        Path of the pickle file to read. The file is opened in binary mode.
    title_list : list
        List to grow. It is extended in place with the unpickled items.

    Returns
    -------
    list
        The same ``title_list`` object that was passed in, after extension.

    Raises
    ------
    OSError
        If the file cannot be opened.
    pickle.UnpicklingError (or another exception raised by ``pickle.load``)
        If the file content cannot be unpickled. ``title_list`` is left
        untouched in that case because the extension happens after loading.
    """

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only read files that were produced by a trusted source.
    # The 'with' block guarantees the handle is closed even if loading fails
    # ('rb' is for read binary).
    with open(file_name, 'rb') as file_object:
        in_list = pickle.load(file_object)

    # Add list just read in to existing list
    title_list.extend(in_list)

    return title_list

In [2]:
# Pickled title files for the left sites, in the order they are read
left_title_files = [
    "data/atlantic_titles",
    "data/mjones_titles",
    "data/newrepublic_titles",
    "data/nytimes_titles",
    "data/politico_titles",
    "data/slate_titles",
    "data/thedailybeast_titles",
    "data/theguardian_titles",
    "data/theintercept_titles",
    "data/washpost_titles",
]

# Create empty list for left sites, then append each site's titles in turn
left_title_list = []
for left_title_file in left_title_files:
    left_title_list = ReadFile(left_title_file, left_title_list)

#***MAKE SURE THIS NUMBER MATCHES THE NUMBER OF DOCUMENTS IN MONGODB***
print(len(left_title_list))

# Pickled title files for the right sites, in the order they are read
right_title_files = [
    "data/americanconservative_titles",
    "data/breitbart_titles",
    "data/dailywire_titles",
    "data/economist_titles",
    "data/fiscaltimes_titles",
    "data/foxnews_titles",
    "data/nypost_titles",
    "data/reason_titles",
    "data/thehill_titles",
    "data/washtimes_titles",
]

# Create empty list for right sites, then append each site's titles in turn
right_title_list = []
for right_title_file in right_title_files:
    right_title_list = ReadFile(right_title_file, right_title_list)

#***MAKE SURE THIS NUMBER MATCHES THE NUMBER OF DOCUMENTS IN MONGODB***
print(len(right_title_list))

96205
105703


<h1>Add Site Bias to Dataframes</h1>

In [3]:
# Dependencies
import pandas as pd
import numpy as np

# Label convention: bias 1 marks a title from a left site, bias 0 one from a right site
left_titles = np.array(left_title_list)
right_titles = np.array(right_title_list)
df1 = pd.DataFrame({'title': left_titles, 'bias': 1})
df2 = pd.DataFrame({'title': right_titles, 'bias': 0})

# Preview each frame (first rows, then shape) with a blank line between the two
for position, frame in enumerate((df1, df2)):
    if position > 0:
        print()
    print(frame.head())
    print(frame.shape)

   bias                                              title
0     1  Conservative High Schoolers Want to ‘Own the L...
1     1  The Instagram Forums Where Teens Go to Debate ...
2     1  The Doomed Republican Attempt to Impeach Rod R...
3     1                          Facebook Is Probably Fine
4     1                  Secretary of a State of Confusion
(96205, 2)

   bias                                        title
0     0               Fruits Of The Quiet Revolution
1     0               Trust And Mistrust In Churches
2     0   A Democratic President From Trump Country?
3     0  ‘Arab NATO’: A Terrible Idea That Won’t Die
4     0                       TAC Fall Intern Wanted
(105703, 2)


<h1>Downsample Larger Dataframe</h1>

In [4]:
from sklearn.utils import resample

# Draw as many rows from df2 as df1 has, without replacement, so the two frames
# end up the same length; the fixed random_state makes the draw repeatable
df2_downsampled = resample(
    df2,
    replace=False,
    n_samples=df1.shape[0],
    random_state=41,
)
print(df2_downsampled.shape)

(96205, 2)


<h1>Combine Left & Right Dataframes</h1>

In [5]:
# Stack the left frame on top of the downsampled right frame
frames_to_stack = [df1, df2_downsampled]
df_combined = pd.concat(frames_to_stack)
print()
print(df_combined.shape)

# Discard every row that contains a missing value
df_clean = df_combined.dropna()
print(df_clean.shape)


(192410, 2)
(192328, 2)


<h1>Logistic Regression on Words in Title</h1>

In [6]:
# Dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Create a matrix of word counts using CountVectorizer
# Count single words, word pairs and word triples (ngram_range = 1-3),
# with English stop words removed
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))

# Fit and transform data
# NOTE(review): the vocabulary is learned from all titles, including the rows
# that later land in the test split.
X = vectorizer.fit_transform(df_clean['title'])

# Create training and test split (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test  = train_test_split(X, df_clean['bias'], random_state=41)

# Create the model and fit the training data
logreg = LogisticRegression()
logreg.fit(X_train,y_train)

# Show model's score on the held-out split
print(logreg.score(X_test,y_test))

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() when it exists and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Pair every coefficient with its n-gram and sort ascending by coefficient
# (ties fall back to comparing the n-gram text)
coef_zip = zip(logreg.coef_.tolist()[0],list(feature_names))
coef_list = list(coef_zip)
coef_list.sort()

0.77394867102


In [23]:
# Number of distinct n-gram features the fitted vectorizer learned.
# vocabulary_ has one entry per feature, so its length equals the length of the
# feature-name list without calling get_feature_names(), which scikit-learn 1.2
# removed, and without building that large list just to count it.
print(len(vectorizer.vocabulary_))

1769280


In [69]:
# Class probabilities the model assigns to the one-word title 'healthcare'
# (columns presumably follow logreg.classes_, i.e. bias 0 then bias 1 - confirm)
healthcare_features = vectorizer.transform(['healthcare'])
logreg.predict_proba(healthcare_features)

array([[ 0.4998531,  0.5001469]])

<h1>Output Model Results to JSON File</h1>

In [68]:
import json

# Dictionary mapping each learned n-gram to the model's first-column probability
title_dict = {}

# coef_list holds (coefficient, n-gram) pairs; take the n-gram from every pair,
# including the last one
feature_terms = [term for _, term in coef_list]

if feature_terms:
    # Score all n-grams in one batch instead of one predict_proba call per n-gram.
    # Column 0 is the probability of the first class in logreg.classes_
    # (presumably bias 0, i.e. Right - confirm).
    first_class_probs = logreg.predict_proba(vectorizer.transform(feature_terms))[:, 0]
    # float() turns the numpy scalars into plain Python floats for JSON
    title_dict = {term: float(prob) for term, prob in zip(feature_terms, first_class_probs)}

# Write the dictionary, wrapped in a one-element list, to the JSON file
with open ('model-data-new.json', 'w') as outfile:
    json.dump([title_dict], outfile)

In [30]:
# Dictionary mapping each learned n-gram to the model's first-column probability
d = {}

# coef_list holds (coefficient, n-gram) pairs; take the n-gram from every pair,
# including the last one
feature_terms = [term for _, term in coef_list]

if feature_terms:
    # Score all n-grams in one batch instead of one predict_proba call per n-gram.
    # Column 0 is the probability of the first class in logreg.classes_
    # (presumably bias 0, i.e. Right - confirm).
    first_class_probs = logreg.predict_proba(vectorizer.transform(feature_terms))[:, 0]
    d = dict(zip(feature_terms, first_class_probs))

# List of (n-gram, probability) pairs ordered from lowest to highest probability
sorted_dict = sorted(d.items(), key=lambda x:x[1])

In [None]:
# Show the (n-gram, probability) pairs ranked 10000 up to (not including) 20000
ranked_slice = sorted_dict[10000:20000]
print(ranked_slice)