# Imports

## Data Management

In [3]:
import datetime
import numpy as np
import pandas as pd

## Analysis and Cleaning

In [4]:
import string
import re

from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, \
                                        strip_multiple_whitespaces, strip_numeric, \
                                        remove_stopwords, strip_short 
from gensim.models import Word2Vec

## Learning

In [5]:
from sklearn import cluster
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Visualization

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

# Data Analysis & Cleanup

In [7]:
# Load both halves of the corpus; each CSV holds one class of articles.
fake = pd.read_csv('datasets/Fake.csv')
true = pd.read_csv('datasets/True.csv')

# Preview ten random rows of each frame, separated by a block of blank lines.
previews = (('False Sample', fake), ('True Sample', true))
for position, (heading, frame) in enumerate(previews):
    if position:
        print('\n\n\n\n')
    print(heading)
    display(frame.sample(n=10))

False Sample


Unnamed: 0,title,text,subject,date
2163,Twitter TEARS Paul Ryan To BLOODY RIBBONS For...,The Congressional Budget Office s review of th...,News,"March 13, 2017"
498,House Dem Wants GOP On Record: Stop Gov’t Spe...,"Early next month, the GOP-controlled House of ...",News,"August 24, 2017"
10195,BRILLIANT TRUMP ADVISER: “The Extreme Media” H...,What a treat! Laura Ingraham interviewed Presi...,politics,"Aug 8, 2017"
1988,Dan Rather: Trump’s First Two Months In Offic...,Legendary newsman Dan Rather has lived through...,News,"March 27, 2017"
2094,Trump Supporter Threatens To Slaughter Black ...,Ever since Officer Darren Wilson murdered unar...,News,"March 19, 2017"
7028,Seth MacFarlane Wants His Fellow Bernie Suppo...,Very outspoken Bernie Sanders supporter Seth M...,News,"April 8, 2016"
7084,"No, Barney Frank Did NOT Call Bernie Supporte...",While I m not totally immune to clickbait as a...,News,"April 5, 2016"
19224,SAVAGE ANTI-TRUMP PROTESTERS Knock Out Innocen...,AIRPORTS ACROSS THE COUNTRY were inundated wit...,left-news,"Jan 30, 2017"
3948,WATCH: Anderson Cooper Schools Kellyanne Conw...,"It was a simple question, but it was enough tu...",News,"November 4, 2016"
21786,(VIDEO) BALTIMORE MAYOR REOPENS LOOTED MALL: T...,,left-news,"May 4, 2015"







True Sample


Unnamed: 0,title,text,subject,date
11745,"Food security in Middle East, North Africa det...",CAIRO (Reuters) - Food security in the Middle ...,worldnews,"December 21, 2017"
2075,Taliban condemn Trump's decision on Afghanista...,KABUL (Reuters) - A spokesman for the Afghan T...,politicsNews,"August 22, 2017"
1371,EPA watchdog expands audit of administrator's ...,WASHINGTON (Reuters) - The U.S. Environmental ...,politicsNews,"October 6, 2017"
14679,China denounces U.S. call to register Chinese ...,BEIJING (Reuters) - China denounced on Thursda...,worldnews,"November 16, 2017"
2862,"Trump not aware of 2016 meeting between son, R...",WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"July 9, 2017"
767,New York Mayor de Blasio wins re-election in w...,NEW YORK (Reuters) - Democratic New York City ...,politicsNews,"November 7, 2017"
15058,Trump: Being friends with North Korea's Kim is...,HANOI (Reuters) - U.S. President Donald Trump ...,worldnews,"November 12, 2017"
13078,"Jordan to convene Arab League, OIC meetings ov...",AMMAN (Reuters) - Jordan plans to convene emer...,worldnews,"December 5, 2017"
15651,Lebanese army says has not uncovered any assas...,BEIRUT (Reuters) - The Lebanese army said on S...,worldnews,"November 5, 2017"
7627,State Department says FBI has not approached i...,WASHINGTON (Reuters) - A U.S. State Department...,politicsNews,"October 31, 2016"


## Getting rid of unwanted strings

In [8]:
# Markers that precede the article body in the "true" dataset, in priority
# order: the tweet marker is tried before the agency dateline.
_SOURCE_MARKERS = ("@realDonaldTrump : - ", "(Reuters) - ")


def _strip_source_prefix(text):
    """Return `text` without everything up to and including the first marker.

    The same string is used for both the membership test and the split, so a
    text containing "(Reuters) -" without the trailing space is left untouched
    instead of raising IndexError. Splitting at most once keeps the rest of the
    article intact when the marker appears again later in the body.
    Non-string values (e.g. NaN for a missing text) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    for marker in _SOURCE_MARKERS:
        if marker in text:
            return text.split(marker, 1)[1]
    return text


cleansed_data = [_strip_source_prefix(data) for data in true.text]

true["text"] = cleansed_data
display(true.head(10))

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...",The White House said on Friday it was set to k...,politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...",President Donald Trump said on Thursday he bel...,politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,While the Fake News loves to talk about my so-...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,"Together, we are MAKING AMERICA GREAT AGAIN! b...",politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,Alabama Secretary of State John Merrill said h...,politicsNews,"December 28, 2017"


## Joining title and text

In [9]:
# Build one text field per article: headline, a single space, then the body.
for frame in (fake, true):
    frame['Sentences'] = frame['title'] + ' ' + frame['text']

## Adding Labels, concatenating and mixing

In [10]:
# Class labels: 0 marks rows from the fake frame, 1 rows from the true frame.
for frame, label in ((fake, 0), (true, 1)):
    frame['Label'] = label

# Stack both frames, shuffle every row with a fixed seed, and renumber the index.
final_data = (
    pd.concat([fake, true])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

## Dropping unneeded columns

In [11]:
# Drop the four raw columns by name; what remains is shown below.
unneeded_columns = ['title', 'text', 'subject', 'date']
final_data = final_data.drop(columns=unneeded_columns)

display(final_data.head(10))

Unnamed: 0,Sentences,Label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,0
1,Trump drops Steve Bannon from National Securit...,1
2,Puerto Rico expects U.S. to lift Jones Act shi...,1
3,OOPS: Trump Just Accidentally Confirmed He Le...,0
4,Donald Trump heads for Scotland to reopen a go...,1
5,Paul Ryan Responds To Dem’s Sit-In On Gun Con...,0
6,AWESOME! DIAMOND AND SILK Rip Into The Press: ...,0
7,STAND UP AND CHEER! UKIP Party Leader SLAMS Ge...,0
8,North Korea shows no sign it is serious about ...,1
9,Trump signals willingness to raise U.S. minimu...,1


## Processing Sentences

### Function

In [12]:
def remove_URL(s):
    """Return `s` with every URL-like token deleted.

    A token is removed when it starts with ``http://`` or ``https://``,
    with ``www.``, or with ``bit.ly`` followed by at least one
    non-whitespace character. Surrounding whitespace is left in place.
    """
    url_pattern = r'https?://\S+|www\.\S+|bit\.ly\S+'
    return re.sub(url_pattern, '', s)

### List of functions

In [13]:
# Filters handed to preprocess_string, listed in the order they are applied.
CUSTOM_FILTERS = [
    str.lower,                   # lowercase the whole text
    strip_tags,
    remove_URL,                  # runs before punctuation stripping so URLs are still intact
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
]

### Tokenizing and filtering the sentences

In [14]:
# Run every sentence through the filter chain; each entry becomes a list of tokens.
words_broken_up = []
for sentence in final_data.Sentences:
    words_broken_up.append(preprocess_string(sentence, CUSTOM_FILTERS))

In [15]:
# Keep only the documents that still contain at least one token after filtering.
processed_data = [tokens for tokens in words_broken_up if tokens]

In [16]:
# Keep the label of every document that survived the empty-token filter,
# so labels stay aligned with processed_data.
processed_labels = [
    label
    for label, tokens in zip(final_data.Label, words_broken_up)
    if tokens
]

# Word2Vec

In [17]:
# Train word embeddings on the tokenised corpus; min_count=1 keeps every token.
model = Word2Vec(sentences=processed_data, min_count=1)

# Sanity check: nearest neighbours of a common word.
neighbours = model.wv.most_similar("country")
display(neighbours)

[('nation', 0.8257389068603516),
 ('america', 0.667441189289093),
 ('countries', 0.5821973085403442),
 ('europe', 0.5630171895027161),
 ('world', 0.519716739654541),
 ('americans', 0.4928061366081238),
 ('path', 0.4909282624721527),
 ('especially', 0.4875548779964447),
 ('fear', 0.4815024137496948),
 ('prosperous', 0.48017793893814087)]

## Sentence Vectors

In [18]:
def _keyed_vectors(model_made):
    """Return the word -> vector lookup for a trained model.

    A full model exposes the lookup as ``.wv``; an object without that
    attribute is assumed to already be the lookup table itself.
    """
    return getattr(model_made, 'wv', model_made)


def _zero_vector(vectors):
    """Return a zero vector matching the embedding size (100 if unknown)."""
    return np.zeros(getattr(vectors, 'vector_size', 100))


def return_vector(model_made, x):
    """Return the embedding of word `x`, or a zero vector if it is unknown.

    The lookup goes through ``model_made.wv`` rather than indexing the model
    directly: direct indexing is deprecated and no longer available in newer
    gensim releases, where the previous bare ``except`` turned that failure
    into a silent all-zero vector for every word. Only ``KeyError`` (word not
    in the vocabulary) is treated as "unknown"; any other error propagates.
    """
    vectors = _keyed_vectors(model_made)
    try:
        return vectors[x]
    except KeyError:
        return _zero_vector(vectors)


def sentence_vector(model_made, sentence):
    """Return the mean word embedding of `sentence` as a plain list.

    Parameters:
        model_made: trained model (or its word-vector table).
        sentence: iterable of tokens.

    Unknown tokens contribute a zero vector to the mean. An empty sentence
    yields a zero vector instead of the NaN scalar that averaging an empty
    list would produce.
    """
    tokens = list(sentence)
    if not tokens:
        return _zero_vector(_keyed_vectors(model_made)).tolist()
    word_vectors = [return_vector(model_made, token) for token in tokens]
    return np.average(word_vectors, axis=0).tolist()

In [19]:
# One averaged embedding per document, stacked row-wise into the feature matrix.
sentence_vectors = [sentence_vector(model, tokens) for tokens in processed_data]
X = np.array(sentence_vectors)

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# Persist the feature matrix; np.save appends the ".npy" extension to the name.
np.save(file='X', arr=X)

In [24]:
# Read the saved matrix back and show its first row.
a = np.load(file='X.npy')
a[0]

array([-0.32123074,  0.10169125,  0.72276968,  0.03206942, -0.61726338,
       -0.24283664,  0.63688195,  0.5249728 ,  0.64675754,  0.1152055 ,
       -0.4815042 , -0.89974558, -0.00762568,  0.2933363 ,  0.44644031,
        0.51858097,  0.47446933,  0.13875988, -0.0306441 ,  0.54360187,
        0.23764895,  0.64876127,  0.21412817,  0.3169899 , -0.51947409,
        0.19905782,  0.18832985,  0.38773045,  0.52204823, -0.25709477,
       -1.00790942, -0.4752112 ,  0.88008535, -0.06160234,  0.03825022,
       -0.01551319,  0.18383649, -0.2007156 , -0.33364904,  0.02455057,
        0.34962565, -0.44205281,  0.1607201 , -0.48582035, -0.14196068,
       -0.34427616,  0.2041886 ,  0.02068151,  0.35949475, -0.26297835,
       -0.6309492 ,  0.29961634, -0.27055293, -0.19646035,  0.38210335,
        0.13136664, -0.11548232, -1.02537346,  0.32271606, -0.62880832,
        0.49720156, -0.00938052, -0.3718771 , -0.37069792, -0.45816118,
       -0.19841351, -0.38273787,  0.48432291, -0.49537119, -0.32

# Clustering

In [None]:
# Two clusters, one per expected class. K-means starts from random centroids,
# so the seed is fixed (same value as the shuffle seed used earlier) to make
# the cluster assignment repeatable across runs.
kmeans = cluster.KMeans(n_clusters=2, verbose=1, random_state=42)

# Fit on the sentence vectors and get the cluster id (0 or 1) of every row.
clustered = kmeans.fit_predict(X)

In [None]:
# Put tokens, true label and assigned cluster side by side for inspection.
comparison_columns = {
    'Sentence': processed_data,
    'Labels': processed_labels,
    'Prediction': clustered,
}
testing_df = pd.DataFrame(comparison_columns)
display(testing_df.head(n=20))

## Validating

In [None]:
# Row-wise agreement between the true label and the cluster id.
matches = (testing_df['Labels'] == testing_df['Prediction']).to_numpy()

# K-means numbers its clusters arbitrarily, so cluster 0 may correspond to
# label 1 and vice versa. With two clusters there are only two possible
# mappings; if the identity mapping agrees on fewer than half of the rows,
# the swapped mapping is the right one and the matches are inverted.
if matches.size and matches.mean() < 0.5:
    matches = ~matches

testing_df['assertion'] = matches
assertion = np.sum(testing_df.assertion) / len(testing_df.assertion) * 100

print('Data classificated correctly: ', assertion, '%')

# Visualization

## Principal Component Analysis (PCA)

In [None]:
# Project the sentence vectors onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)

# Plot-ready frame: the two components plus the cluster id of each row.
PCA_df = pd.DataFrame(pca_result, columns=['x1', 'x2'])
PCA_df['cluster'] = clustered

## T-Distributed Stochastic Neighbor Embedding (TSNE)

In [None]:
# t-SNE is stochastic; the seed is fixed so the embedding (and therefore the
# plot) is the same on every run.
tsne = TSNE(n_components=2, random_state=42)

# NOTE(review): the embedding is computed from the 2-component PCA output,
# not from the full sentence vectors in X — confirm this is intentional.
tsne_result = tsne.fit_transform(pca_result)

# Plot-ready frame: the two t-SNE coordinates plus the cluster id of each row.
TSNE_df = pd.DataFrame(tsne_result, columns=['x1', 'x2'])
TSNE_df['cluster'] = clustered

### Plots

In [None]:
# Two panels side by side: t-SNE on the left, PCA on the right.
fig, ax = plt.subplots(1, 2, figsize=(12,6))
tsne_ax, pca_ax = ax

# Options shared by both scatter plots; points are coloured by cluster id.
scatter_options = dict(x='x1', y='x2', hue='cluster', legend="full", alpha=0.5)
sns.scatterplot(data=PCA_df, ax=pca_ax, **scatter_options)
sns.scatterplot(data=TSNE_df, ax=tsne_ax, **scatter_options)

tsne_ax.set_title('Visualized on TSNE')
pca_ax.set_title('Visualized on PCA')

# Custom new tests
Testing with fake news generated from https://www.thefakenewsgenerator.com/

### Onion

In [None]:
onion_data = "Flint Residents Learn To Harness Superpowers, But Trump Gets Away Again They developed superpowers after years of drinking from a lead-poisoned water supply. But just having incredible abilities doesn't make them superheroes. Not yet. Donald Trump faced off against the superpowered civilians but he got away before they could catch him"

# Tokenise the article with the same filter chain used for the training data
onion_tokens = preprocess_string(onion_data, CUSTOM_FILTERS)

# Average the token embeddings into a single vector
onion_data = sentence_vector(model, onion_tokens)

# Ask the fitted k-means model which cluster this one-row batch falls into
onion_batch = np.array([onion_data])
kmeans.predict(onion_batch)

### News from BBC

In [None]:
bbc_data = "Nasa Mars 2020 Mission's MiMi Aung on women in space Next year, Nasa will send a mission to Mars. The woman in charge of making the helicopter that will be sent there – which is set to become the first aircraft to fly on another planet – is MiMi Aung. At 16, MiMi travelled alone from Myanmar to the US for access to education. She is now one of the lead engineers at Nasa. We find out what it's like being a woman in space exploration, and why her mum is her biggest inspiration."

# Tokenise the article with the same filter chain used for the training data
bbc_tokens = preprocess_string(bbc_data, CUSTOM_FILTERS)

# Average the token embeddings into a single vector
bbc_data = sentence_vector(model, bbc_tokens)

# Ask the fitted k-means model which cluster this one-row batch falls into
bbc_batch = np.array([bbc_data])
kmeans.predict(bbc_batch)