# News classification using Gensim word vectors: NLP Tutorial For Beginners - 23

https://www.youtube.com/watch?v=ZrgVlfNduj8&list=PLeo1K3hjS3uuvuAXhYjV2lMEShq2UYSwX&index=26

In [1]:
import gensim.downloader as api
# Pretrained Google News word2vec vectors; each word maps to a 300-dimensional vector
wv = api.load("word2vec-google-news-300")

In [2]:
# Sanity check: similarity score between two related words
wv.similarity(w1="great", w2="good")

0.72915095

In [3]:
# Indexing the model by word returns that word's embedding vector
wv_great = wv["great"]
wv_good = wv["good"]

In [7]:
# Both embeddings have the same fixed length
wv_great.shape, wv_good.shape

((300,), (300,))

In [9]:
import pandas as pd

# Load the labelled news dataset (path is relative to the working directory)
df = pd.read_csv("Fake_Real_Data.csv")

In [11]:
# (rows, columns) of the raw dataset
df.shape

(9900, 2)

In [12]:
# Count rows per class to check whether the classes are balanced
df.label.value_counts()

Fake    5000
Real    4900
Name: label, dtype: int64

The classes are nearly balanced (5000 Fake vs. 4900 Real), so there is no class imbalance to correct.

In [10]:
# Peek at the first two rows: article text plus its Fake/Real label
df.head(2)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real


In [13]:
# Encode the string labels as integers for the classifier: Fake -> 0, Real -> 1
df['label_num'] = df['label'].map({"Fake": 0, "Real": 1})

In [14]:
# Confirm the numeric label column was added alongside the original label
df.head(2)

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1


In [15]:
import spacy
# spaCy English pipeline; used below for tokenization, stop-word/punctuation flags and lemmas
nlp = spacy.load("en_core_web_lg")

In [16]:
def preprocess_and_vectorize(text):
    """Tokenize ``text`` with spaCy and return the lemmas of the kept tokens.

    Punctuation and stop-word tokens are dropped; every remaining token
    contributes its lemma, in document order. Despite the name, this first
    version only preprocesses: it returns a list of strings, not a vector.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_stop)
    ]

In [19]:
# Quick check: stop words and punctuation are removed, remaining words are lemmatized
preprocess_and_vectorize("No worries if you don't understand")

['worry', 'understand']

#### sentence embedding mechanism

In [31]:
# Look up the word vectors for the two lemmas produced by the preprocessing step
v1 = wv["worry"]
v2 = wv["understand"]

import numpy as np

# Sentence embedding = element-wise average of the word vectors (first 10 components shown)
np.mean([v1, v2], axis=0)[:10]

array([ 0.00976562, -0.00561523, -0.08905029,  0.01330566, -0.2709961 ,
        0.14746094,  0.3408203 , -0.01840591,  0.15161133, -0.06945801],
      dtype=float32)

In [34]:
# Same average computed by gensim; pre_normalize=False averages the raw (un-normalized) word vectors
wv.get_mean_vector(["worry", "understand"], pre_normalize=False)[:10]

array([ 0.00976562, -0.00561523, -0.08905029,  0.01330566, -0.2709961 ,
        0.14746094,  0.3408203 , -0.01840591,  0.15161133, -0.06945801],
      dtype=float32)

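`pre_normalize=False` is only needed here to get the same result as `np.mean`: by default `get_mean_vector` normalizes each word vector before averaging, which gives different values.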

In [33]:
# Element-wise comparison of the first 10 components from both approaches
np.mean([v1, v2], axis=0)[:10] == wv.get_mean_vector(["worry", "understand"], pre_normalize=False)[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

### let's improve our function

In [35]:
def preprocess_and_vectorize(text):
    """Turn ``text`` into a single fixed-length embedding vector.

    The text is tokenized with spaCy; punctuation and stop-word tokens are
    dropped and the remaining tokens are replaced by their lemmas. The lemmas
    are then averaged with ``wv.get_mean_vector`` (gensim defaults, so each
    word vector is normalized before averaging).

    Parameters
    ----------
    text : str
        Raw text to embed.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``wv.vector_size``. A zero vector is returned
        when no token survives filtering (e.g. empty text or text made up
        only of stop words / punctuation).
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]

    if not lemmas:
        # get_mean_vector raises ValueError on an empty key list, which would
        # abort a whole DataFrame.apply over one degenerate row. Return the
        # same zero vector gensim yields when every key is out of vocabulary.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return wv.get_mean_vector(lemmas)

In [37]:
# Same sentence, now returned as a mean vector (first 10 components).
# Values differ from the manual np.mean above because get_mean_vector is called with its defaults here.
preprocess_and_vectorize("No worries if you don't understand")[:10]

array([ 0.00235079, -0.00284596, -0.03638233,  0.00413919, -0.10635224,
        0.05758579,  0.13348952, -0.00689176,  0.05995331, -0.02875906],
      dtype=float32)

In [38]:
# Embed every article: one mean word vector per row of the Text column
df['vector'] = df['Text'].apply(preprocess_and_vectorize)

### train_test_split

In [39]:
from sklearn.model_selection import train_test_split
import numpy as np


# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(df.vector.values),  # stack the per-row vectors into one 2-D array (n_rows, vector_length)
    df.label_num,
    test_size=0.2,
    random_state=2022
)

In [41]:
# (training rows, embedding dimensions)
X_train.shape

(7920, 300)

### training ML model

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report


# Seed the classifier so the fitted model and the report below are reproducible;
# 2022 matches the random_state already used for train_test_split.
clf = GradientBoostingClassifier(random_state=2022)

clf.fit(X_train, y_train)

# Evaluate on the held-out test set only
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1024
           1       0.98      0.99      0.98       956

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

