In [1]:
import gensim.downloader as api
import numpy as np
# Load the pretrained 'word2vec-google-news-300' word vectors through the gensim downloader API.
wv = api.load('word2vec-google-news-300')

In [2]:
# Sanity check: similarity score between two closely related words
wv.similarity('great', 'good')

0.72915095

In [3]:
# Look up the raw embedding vectors of the two words compared above
wv_great, wv_good = wv['great'], wv['good']

In [4]:
import pandas as pd

# Load the labelled news dataset and show its (rows, columns) shape.
df = pd.read_csv('fake_and_real_news.csv')
df.shape

(9900, 2)

In [5]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [6]:
# Class balance: number of rows per label
df['label'].value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

In [7]:
# Encode the class labels numerically (Fake -> 0, Real -> 1) in a new column
df['label_num'] = df['label'].map({'Fake': 0, 'Real': 1})
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [18]:
import spacy
# Load spaCy's 'en_core_web_lg' pipeline; it supplies the tokens, stop-word/punctuation flags and lemmas used below.
nlp = spacy.load('en_core_web_lg')

def preprocess_and_vectorize(text):
    """Convert raw text into a single averaged word-embedding vector.

    The text is run through the spaCy pipeline ``nlp``; punctuation and
    stop-word tokens are dropped and the remaining tokens are replaced by
    their lemmas. Every lemma that exists in the word-vector model ``wv``
    is looked up, and the resulting vectors are averaged element-wise.

    Parameters
    ----------
    text : str
        The document to vectorize.

    Returns
    -------
    numpy.ndarray
        1-D array holding the mean of the lemma vectors. If no lemma is
        found in ``wv`` (empty text, only stop words/punctuation, or only
        out-of-vocabulary words) a zero vector of length ``wv.vector_size``
        is returned, so every document maps to a vector of the same shape.
    """
    doc = nlp(text)

    # Keep only the lemmas of content tokens (no punctuation, no stop words).
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop)
    ]

    # Out-of-vocabulary lemmas are skipped rather than raising KeyError.
    vectors = [wv[lemma] for lemma in lemmas if lemma in wv]

    if not vectors:
        # np.mean([]) would yield a scalar NaN (plus a RuntimeWarning), which
        # cannot be stacked with real vectors downstream; fall back to zeros.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)

In [20]:
# Smoke test on a short sentence: the result should be one 1-D vector (shape shown below).
preprocess_and_vectorize("Don't worry if you don't understand").shape

(300,)

In [13]:
import numpy as np

# Element-wise average of the embeddings of two words
v1, v2 = wv["worry"], wv["understand"]
np.array([v1, v2]).mean(axis=0)

array([ 0.00976562, -0.00561523, -0.08905029,  0.01330566, -0.2709961 ,
        0.14746094,  0.3408203 , -0.01840591,  0.15161133, -0.06945801,
       -0.05749512, -0.17822266, -0.03805542,  0.08730698, -0.22216797,
        0.2578125 ,  0.06481934,  0.29589844,  0.00537109, -0.1875    ,
       -0.1159668 ,  0.0715332 ,  0.08691406,  0.05912399,  0.18359375,
        0.17687988,  0.09130859, -0.22705078,  0.10522461, -0.2475586 ,
       -0.02436638,  0.01245117, -0.06616211, -0.02587891,  0.13476562,
       -0.02604675,  0.06582642,  0.0612793 ,  0.07128906,  0.13867188,
        0.03234863, -0.03295898,  0.17736816, -0.08789062, -0.21777344,
       -0.11010742, -0.08728027, -0.01922607, -0.04943848,  0.05273438,
       -0.18066406,  0.13122559, -0.07498932, -0.10064697, -0.01171875,
        0.12963867, -0.10766602, -0.14624023,  0.11303711, -0.12280273,
       -0.03540039,  0.03601074, -0.01379395,  0.01042175,  0.1105957 ,
       -0.03820801, -0.20751953,  0.1352539 , -0.0625    , -0.01

In [21]:
# Vectorize every article; the function is passed directly, no wrapper lambda needed
df['vector'] = df['Text'].apply(preprocess_and_vectorize)

In [22]:
# Preview the frame again, now including the new 'vector' column
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[0.024825068, 0.051911708, -0.040826198, 0.083..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[0.03272779, 0.021770593, -0.0007758863, 0.044..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[0.053966843, 0.016407834, -0.020710936, 0.103..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[0.03475553, 0.029535385, -0.0011190036, 0.051..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.0016789786, 0.034640685, 0.006223632, 0.08..."


In [23]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both sets keep the same Fake/Real ratio
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['label_num'],
    stratify=df['label_num'],
    test_size=0.2,
    random_state=1,
)

In [24]:
# Reshape X_train and X_test into 2-D arrays so they can be fed to the models

In [25]:
# X_train / X_test are 1-D arrays whose elements are themselves vectors;
# stack them into 2-D (n_samples, n_features) arrays.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

print('shape of X_train before reshaping: ', X_train.shape)
print('shape of X_test before reshaping: ', X_test.shape)

print('shape of X_train after reshaping: ', X_train_2d.shape)
print('shape of X_test after reshaping: ', X_test_2d.shape)

shape of X_train before reshaping:  (7920,)
shape of X_test before reshaping:  (1980,)
shape of X_train after reshaping:  (7920, 300)
shape of X_test after reshaping:  (1980, 300)


In [29]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Fit a gradient-boosting classifier on the averaged word vectors and
# evaluate it on the held-out split. random_state is pinned so that
# re-running the notebook reproduces the same model and report.
clf = GradientBoostingClassifier(random_state=1)
clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1000
           1       0.97      0.99      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980



In [30]:
# Classify an unseen headline with the trained model

test_news = ['Watch: Donald Trump kiling his son in a brutal fight.Exclusive on Fox news!!!']

test_news_vectors = list(map(preprocess_and_vectorize, test_news))
clf.predict(test_news_vectors)

array([0], dtype=int64)