In [1]:
# PROJECT IDEA(S)
# take ~10000 known fake
# take ~10000 known real
# combine them and set aside ~25% as a holdout set - never used to fit the model, only to verify it
# feature extraction - 
# can have 10 different metrics for exclamation marks: 
# total number of exclamation marks per 

# look at number of key words: "outrageous", "strong words"
# Q: how strong is the strongest word
# unique word count - word frequency
# columns: fake / not fake, trustworthiness of source, strength of strongest word found in given article

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from textblob import TextBlob

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from scipy import stats
import statsmodels.api as sm
from scipy.stats import ttest_ind
from matplotlib import rcParams

# Load the two raw corpora. `DataFrame.from_csv` was deprecated in pandas 0.21
# and removed in 1.0; `read_csv(..., index_col=0)` is the supported replacement
# and, like from_csv, uses the first CSV column as the row index.
# NOTE(review): from_csv also defaulted to parse_dates=True for the index. It is
# omitted here on the assumption that both index columns hold identifiers, not
# dates -- confirm against the CSV headers.
fake_df = pd.read_csv("fake.csv", index_col=0)
real_df = pd.read_csv("../uci-news-aggregator.csv", index_col=0)

# Despite the names, these hold the full (rows, columns) shape tuples.
fake_num_rows = fake_df.shape
print(fake_num_rows)

real_num_rows = real_df.shape
print(real_num_rows)

# df.head(100)
# print(df.dtypes)

(12999, 19)
(422419, 7)


In [3]:
# Keep only the first 12999 real articles. The fake corpus printed 12999 rows
# when loaded, so this presumably balances the two classes -- TODO confirm.
real_df = real_df.iloc[:12999]

In [4]:
# Tally how many fake articles carry each label in the `type` column.
counts_by_type = fake_df.loc[:, 'type'].value_counts()
print(counts_by_type)

bs            11492
bias            443
conspiracy      430
hate            246
satire          146
state           121
junksci         102
fake             19
Name: type, dtype: int64


In [5]:
# Frequency of each distinct `main_img_url` value among the fake articles.
# NOTE(review): the name `counts_by_url` is rebound to the `site_url` counts by
# a later cell, so this result is only available until that cell runs.
counts_by_url = fake_df.loc[:, 'main_img_url'].value_counts()

In [6]:
# print(fake_df['spam_score'])

In [7]:
# Frequency of each distinct `site_url` value among the fake articles.
# NOTE(review): this rebinds `counts_by_url`, which an earlier cell used for the
# `main_img_url` counts.
counts_by_url = fake_df.loc[:, 'site_url'].value_counts()

In [8]:
# Frequency of each distinct `spam_score` value among the fake articles.
counts_of_spam = fake_df.loc[:, 'spam_score'].value_counts()

In [9]:
# Column labels of the fake-news frame as a plain Python list.
col_names = list(fake_df.columns)

In [10]:
# Frequency of each distinct `replies_count` value among the fake articles.
counts_by_replies = fake_df.loc[:, 'replies_count'].value_counts()

In [11]:
# Number of fake articles attributed to each distinct `author` value.
counts_by_author = fake_df.loc[:, 'author'].value_counts()

In [12]:
# Frequency of each distinct `domain_rank` value among the fake articles.
counts_by_domain_rank = fake_df.loc[:, 'domain_rank'].value_counts()

In [13]:
# PROJECT IDEA(S)
# take ~10000 known fake
# take ~10000 known real
# combine them and set aside ~25% as a holdout set - never used to fit the model, only to verify it
# feature extraction - 
# can have 10 different metrics for exclamation marks: 
# total number of exclamation marks per 
# 
# look at number of key words: "outrageous", "strong words"
# Q: how strong is the strongest word
# unique word count - word frequency
# columns: fake / not fake, trustworthiness of source, strength of strongest word found in given article

In [14]:
# Re-key the fake articles by a 1-based integer "id" and pin the column order.
# `set_index` discards the previous index, so only the 19 columns listed below
# survive alongside the new id index.
fake_df = (
    fake_df
    .assign(id=range(1, len(fake_df) + 1))
    .set_index('id')
    .loc[:, [
        'site_url', 'domain_rank', 'author', 'published', 'title',
        'thread_title', 'text', 'ord_in_thread', 'crawled', 'country',
        'language', 'spam_score', 'main_img_url', 'replies_count',
        'participants_count', 'likes', 'comments', 'shares', 'type',
    ]]
)
print(fake_df.shape)

(12999, 19)


In [21]:
# Extract the article bodies on their own and save them to a file.
fake_text_only = fake_df[['text']].copy()

# Replace every newline and tab with a space. The `.str` accessor leaves
# missing bodies (NaN) untouched, which is what the old `type(text) != float`
# guard did by hand. Unlike the previous row-by-row loop, this needs neither
# `DataFrame.set_value` (removed in pandas 1.0) nor an index that is exactly
# 1..N.
fake_text_only['text'] = (
    fake_text_only['text']
    .str.replace("\n", " ", regex=False)
    .str.replace("\t", " ", regex=False)
)

fake_text_only.to_csv("fake_body_only.csv", index=False)

In [None]:
def count_ratio_exclams(string):
    """Return the number of '!' characters per space character in `string`.

    The space count is used as a cheap stand-in for the word count. When the
    string contains no spaces at all, the raw exclamation count is returned so
    the division is always defined.

    Parameters
    ----------
    string : str
        Text to score (e.g. an article title).

    Returns
    -------
    float
        `string.count('!') / string.count(' ')`, or `string.count('!')` when
        there are no spaces. Always a float, so the result is not truncated by
        integer division under Python 2 and the column dtype stays uniform.
    """
    num_exclams = string.count('!')
    num_spaces = string.count(' ')
    if num_spaces == 0:
        return float(num_exclams)
    # float() forces true division regardless of interpreter version.
    return float(num_exclams) / num_spaces

In [None]:
def exclam_ratio_text_body(string):
    """Return the ratio of '!' to the combined count of '.' and '?' in `string`.

    When the string contains neither periods nor question marks, the raw
    exclamation count is returned so the division is always defined.

    Parameters
    ----------
    string : str
        Text to score (e.g. an article body).

    Returns
    -------
    float
        `count('!') / (count('.') + count('?'))`, or `count('!')` when the
        denominator would be zero. Always a float, so the result is not
        truncated by integer division under Python 2.
    """
    num_exclams = string.count('!')
    num_other_terminators = string.count('.') + string.count('?')
    if num_other_terminators == 0:
        return float(num_exclams)
    # float() forces true division regardless of interpreter version.
    return float(num_exclams) / num_other_terminators

In [None]:
# Add the `ratio_exclam_in_title` feature to the fake articles.

# Drop rows without a thread title; there is nothing to score for them.
# `.copy()` makes the result an independent frame so the column assignments
# below do not write into a slice of the previous one.
fake_df = fake_df[fake_df['thread_title'].notnull()].copy()
print(len(fake_df))

# Renumber the rows 1..N so the id index is contiguous again after the filter.
fake_df['id'] = range(1, len(fake_df) + 1)
fake_df = fake_df.set_index('id')

# Score every title in one pass. This replaces a bare `fake_df.assign(...)`
# whose result was discarded (so it created nothing) and a per-row loop built
# on `DataFrame.set_value`, which was removed in pandas 1.0.
fake_df['ratio_exclam_in_title'] = fake_df['thread_title'].apply(count_ratio_exclams)

# counts_by_title_exclams = fake_df.total_exclam_in_title.value_counts()
# print(counts_by_title_exclams)

In [None]:
# Add the `ratio_exclam_in_text_body` feature to the fake articles.

# Drop rows without an article body (`text`); there is nothing to score.
# `.copy()` makes the result an independent frame so the column assignments
# below do not write into a slice of the previous one.
fake_df = fake_df[fake_df['text'].notnull()].copy()
print(len(fake_df))

# Renumber the rows 1..N so the id index is contiguous again after the filter.
fake_df['id'] = range(1, len(fake_df) + 1)
fake_df = fake_df.set_index('id')

# Score every body in one pass. This replaces a bare `fake_df.assign(...)`
# whose result was discarded and a per-row loop built on
# `DataFrame.set_value`, which was removed in pandas 1.0.
fake_df['ratio_exclam_in_text_body'] = fake_df['text'].apply(exclam_ratio_text_body)

ratio_in_text_body = fake_df['ratio_exclam_in_text_body'].value_counts()

In [None]:
# Compute the ratio of exclamation marks to other sentence-terminating
# punctuation ('.' and '?') for every article body and store it in the
# "ratio_exclams_in_text" column.
#
# The previous version wrote its results to a differently named column
# ("ratio_exclams_in_text_body") and then read "ratio_exclams_in_text", which
# was never created because the preceding `assign` result was discarded.
# NOTE(review): this uses the same function and input column as
# `ratio_exclam_in_text_body`, so the two columns hold identical values --
# consider keeping only one.
fake_df['ratio_exclams_in_text'] = fake_df['text'].apply(exclam_ratio_text_body)

counts_ratio_exclams = fake_df['ratio_exclams_in_text'].value_counts()

In [None]:
# Add the `ratio_exclam_in_title` feature to the real articles: the number of
# exclamation marks per space in each TITLE (see `count_ratio_exclams`).
#
# `assign` returns a new frame, so its result must be bound back to `real_df`.
# Applying the function column-wise also avoids the removed
# `DataFrame.set_value` and the old `.loc[i]` lookups over range(1, N + 1),
# which only worked if the index labels were exactly the integers 1..N.
real_df = real_df.assign(
    ratio_exclam_in_title=real_df['TITLE'].apply(count_ratio_exclams)
)

counts_ratio_exclams = real_df['ratio_exclam_in_title'].value_counts()
print(counts_ratio_exclams)

In [None]:
# create series of total exclamation counts in each row's title
# for index, row in df.iterrows():
#     count = count_total_exclamation(row.title)
#     print(count)
#     df.loc[:,'total_crime'] = df.apply(get_total_crime, axis=1)
#     df.loc[index, row.total_exclam_in_title] = count


# df.loc[:, 'total_exclam_in_title'] = df.apply(count_total_exclams, axis=1)    
# count_title_exclams = df['total_exclam_in_title'].value_counts()
# print(count_title_exclams)

In [None]:
# Make a copy of fake_df containing only the thread_title, the site_url and
# the title exclamation feature.
sub_fake_df = fake_df[['thread_title', 'site_url', 'ratio_exclam_in_title']].copy()

# Replace every newline and tab with a space in both text columns so each
# value stays on one line of the tab-delimited export. One vectorised pass per
# column replaces two copy-pasted row loops built on `DataFrame.set_value`
# (removed in pandas 1.0).
for text_column in ('thread_title', 'site_url'):
    sub_fake_df[text_column] = (
        sub_fake_df[text_column]
        .str.replace("\n", " ", regex=False)
        .str.replace("\t", " ", regex=False)
    )

# turn all tabs into spaces
# x = "The bananas are yellow and green"
# x = x.split(" ")
# print(x)
# x = "+".join(x)
# print(x)

In [None]:
# Keep only the title, URL and title exclamation feature of the real articles.
# NOTE(review): 12941 is presumably the number of fake rows left after the NaN
# filters, so that both classes end up the same size -- confirm and derive it
# from the fake frame instead of hard-coding it.
sub_real_df = (
    real_df[['TITLE', 'URL', 'ratio_exclam_in_title']]
    .head(12941)
    .copy()
)

# Renumber the rows 1..N.
sub_real_df['id'] = range(1, len(sub_real_df) + 1)
sub_real_df = sub_real_df.set_index('id')

# Replace every newline and tab with a space in both text columns so each
# value stays on one line of the tab-delimited export. One vectorised pass per
# column replaces two copy-pasted row loops built on `DataFrame.set_value`
# (removed in pandas 1.0).
for text_column in ('TITLE', 'URL'):
    sub_real_df[text_column] = (
        sub_real_df[text_column]
        .str.replace("\n", " ", regex=False)
        .str.replace("\t", " ", regex=False)
    )

In [None]:
# Label the two corpora with a 'TARGET' column: 1 for fake, 0 for real.
# The fake columns are renamed so both frames share the TITLE / URL schema.
sub_fake_df['TARGET'] = 1
sub_fake_df = sub_fake_df.rename(columns={'thread_title': 'TITLE', 'site_url': 'URL'})
sub_real_df['TARGET'] = 0

# Stack the two frames (fake rows first). `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` is the equivalent that works on every version.
combined_df = pd.concat([sub_fake_df, sub_real_df])

# Both inputs were numbered from 1, so the stacked index has duplicate labels;
# renumber the combined rows 1..N.
combined_df['id'] = range(1, len(combined_df) + 1)
combined_df = combined_df.set_index('id')
print(combined_df)

In [None]:
from sklearn.utils import shuffle

# Randomly reorder the rows so fake and real articles are interleaved.
# A fixed `random_state` makes the order identical on every Restart & Run All;
# without it each run produced a different dataset and a different split.
combined_df = shuffle(combined_df, random_state=42)

In [None]:
# Export combined_df as tab-delimited text; the id index is not written.
combined_df.to_csv(path_or_buf="cleaned_combined_dataset.csv", sep="\t", index=False)

In [None]:
# Split the data into a training set (~75%) and a holdout set (~25%).
# Each row independently lands in the training set with probability 0.75, so
# the proportions are approximate rather than exact.
# A dedicated, seeded generator keeps the split identical across runs and
# independent of any other use of NumPy's global random state.
split_rng = np.random.RandomState(42)
sampler = split_rng.rand(len(combined_df)) < 0.75
training_set = combined_df[sampler]
test_set = combined_df[~sampler]

In [None]:
# Persist both halves of the split as tab-delimited files (index not written).
for split_frame, split_filename in (
    (training_set, "training_set.csv"),
    (test_set, "holdout_set.csv"),
):
    split_frame.to_csv(split_filename, sep='\t', index=False)

In [None]:
# Count, for every stemmed token, how many training titles of each class
# contain it. Each title contributes at most 1 per stem because its stems are
# collected into a set first.
ps = PorterStemmer()
fake_blob = Counter()  # stem -> number of fake (TARGET == 1) titles containing it
real_blob = Counter()  # stem -> number of real titles containing it

for title, target in zip(training_set['TITLE'], training_set['TARGET']):
    try:
        # NOTE(review): under Python 3, str(bytes) yields the repr ("b'...'"),
        # which adds stray characters to the first and last tokens. Left as-is
        # so the stems stay identical to those produced by the scoring cell,
        # which tokenises titles the same way.
        ss8 = str(title.encode('utf8'))
    except (AttributeError, UnicodeError):
        # Non-string title (e.g. NaN has no .encode) or text that cannot be
        # transcoded: treat it as an empty title instead of hiding every
        # possible error behind a bare `except`.
        ss8 = ""

    stems = {ps.stem(w).lower() for w in word_tokenize(ss8)}

    if target == 1:
        fake_blob.update(stems)
    else:
        real_blob.update(stems)

In [None]:
# # stem the "real news" data    
# goodBlob = {}
# for i in range(len(sub_real_df['TITLE'])):
#     try:
#         ss8 = str(sub_real_df['TITLE'].iloc[i].encode('utf8'))
#     except:
#         pass
#     words = word_tokenize(ss8)
#     x = set()
#     for w in words:
#         x.add(ps.stem(w).lower())

#     for stword in x:
#         if stword in goodBlob:
#             goodBlob[stword] = goodBlob[stword] + 1
#             #print(stword, " ", goodBlob[stword])
#         else:
#             goodBlob.setdefault(stword,1)
#             #print(stword," ", goodBlob[stword])

In [None]:
# Build riskdict: stem -> share of the titles containing that stem that are
# fake (1.0 = seen only in fake titles, 0 = seen only in real titles).
# Stems seen in fewer than `min_title_count` titles overall are left out.
min_title_count = 10

riskdict = {}
for word, fake_count in fake_blob.items():
    count = fake_count + real_blob.get(word, 0)
    if count >= min_title_count:
        # float() forces true division; under Python 2 the plain `/` on two
        # ints would truncate every mixed-class stem to 0.
        riskdict[word] = float(fake_count) / count

# Stems that occur only in real titles never appear in the loop above.
for word, real_count in real_blob.items():
    if word not in fake_blob and real_count >= min_title_count:
        riskdict[word] = 0

In [None]:
# Start results_df as a copy of combined_df and append four zero-filled feature
# columns: fake_aggregate, good_aggregate, risk_word and safe_word.
results_df = combined_df.copy()
for feature_column in ('fake_aggregate', 'good_aggregate', 'risk_word', 'safe_word'):
    results_df[feature_column] = 0

print(list(results_df.columns))

In [None]:
# Score every title in results_df against the word statistics:
#   fake_aggregate - sum of fake_blob counts over the title's stems
#   good_aggregate - sum of real_blob counts over the title's stems
#   risk_word      - highest riskdict value among the title's stems (0 if none)
#   safe_word      - lowest riskdict value among the title's stems (1 if none)
#
# Fixes over the previous version:
#   * The running maximum was written to `riskword` but compared against
#     `riskyword`, so it was never a maximum and carried over between rows.
#   * Titles were read by position (`iloc[i]`) but results were written by
#     label (`set_value(i, ...)`). The index holds the labels 1..N, so values
#     landed on the wrong rows and a spurious row labelled 0 was created.
#     Results are now collected in row order and assigned as whole columns.
#   * `DataFrame.set_value` was removed in pandas 1.0.
fake_aggregates = []
good_aggregates = []
risk_words = []
safe_words = []

for title in results_df['TITLE']:
    fakeaggregate = 0
    goodaggregate = 0
    riskyword = 0
    safeword = 1
    try:
        ss8 = str(title.encode('utf8'))
    except (AttributeError, UnicodeError):
        # Non-string title (e.g. NaN has no .encode) or text that cannot be
        # transcoded: score it as an empty title.
        ss8 = ""

    stems = {ps.stem(w).lower() for w in word_tokenize(ss8)}

    for stword in stems:
        if stword in riskdict:
            riskyword = max(riskyword, riskdict[stword])
            safeword = min(safeword, riskdict[stword])
        fakeaggregate += fake_blob.get(stword, 0)
        goodaggregate += real_blob.get(stword, 0)

    fake_aggregates.append(fakeaggregate)
    good_aggregates.append(goodaggregate)
    risk_words.append(riskyword)
    safe_words.append(safeword)

# Lists are assigned positionally, so row k receives the scores of title k.
results_df['fake_aggregate'] = fake_aggregates
results_df['good_aggregate'] = good_aggregates
results_df['risk_word'] = risk_words
results_df['safe_word'] = safe_words

print(results_df.columns.tolist())

In [None]:
# Split the scored data into a training set (~75%) and a holdout set (~25%).
# Each row independently lands in the training set with probability 0.75.
# A dedicated, seeded generator keeps the split identical across runs.
# NOTE(review): riskdict was fitted on `training_set`, which comes from a
# separate split of combined_df. Rows in this holdout may therefore have
# contributed to the word statistics they are scored with (leakage) -- confirm
# the two splits select the same rows, or reuse one mask for both.
split_rng = np.random.RandomState(42)
sampler = split_rng.rand(len(results_df)) < 0.75
new_training = results_df[sampler]
new_test = results_df[~sampler]

# The training export keeps only the label and the numeric features.
new_training = new_training[['TARGET', 'ratio_exclam_in_title', 'fake_aggregate', 'good_aggregate', 'risk_word', 'safe_word']]
print(new_training)

# The holdout feature file additionally keeps TITLE.
holdout_title_features = new_test[['TITLE', 'TARGET', 'ratio_exclam_in_title', 'fake_aggregate', 'good_aggregate', 'risk_word', 'safe_word']]

new_training.to_csv("new_training.csv", sep='\t', index=False)
new_test.to_csv("new_holdout.csv", sep='\t', index=False)

holdout_title_features.to_csv("holdout_title_features.csv", sep='\t', index=False)

In [24]:
# Kolmogorov-Smirnov test: load the saved model prediction results.
# `DataFrame.from_csv` was removed in pandas 1.0; `read_csv(..., index_col=0)`
# likewise uses the first CSV column as the row index.
# NOTE(review): from_csv also defaulted to parse_dates=True for the index; it is
# omitted on the assumption that the index column holds row ids -- confirm.
prediction_df = pd.read_csv("good_model_prediction_results.csv", index_col=0)


        TARGET  Prediction  Unnamed: 3  Unnamed: 4     0     0.1    0.2  \
row_id                                                                    
0            0    0.001520         NaN         NaN  0.02  2028.0    3.0   
1            0    0.103507         NaN         NaN  0.04  2435.0    4.0   
2            0    0.002745         NaN         NaN  0.06  2655.0    8.0   
3            1    0.998342         NaN         NaN  0.08  2782.0   10.0   
4            0    0.012193         NaN         NaN  0.10  2868.0   11.0   
5            0    0.022689         NaN         NaN  0.12  2922.0   14.0   
6            0    0.025135         NaN         NaN  0.14  2971.0   16.0   
7            1    0.936227         NaN         NaN  0.16  3006.0   20.0   
8            1    0.998647         NaN         NaN  0.18  3038.0   22.0   
9            1    0.664123         NaN         NaN  0.20  3062.0   28.0   
10           0    0.027565         NaN         NaN  0.22  3084.0   33.0   
11           0    0.00218