### Import Gensim Word2Vec Model

In [None]:
import gensim.downloader as api

wv = api.load("word2vec-google-news-300")

In [2]:
wv.similarity(w1 = "great", w2 = "good")

0.729151

In [3]:
wv_great=wv["great"]
wv_good=wv["good"]

### Load dataset

In [3]:
import pandas as pd
import numpy as np

In [4]:
# 0 :- Fake, 1 :- Real
df = pd.read_csv("./Data/news_Fake_Dataset.csv")
df.shape

(72134, 4)

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
df = df.drop(['title','Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### Import spacy language model

In [7]:
import spacy 
nlp = spacy.load("en_core_web_lg")

### Function for text processing

In [8]:
def preprocess_text(text):
    # Tokenization and lemmatization
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    return tokens

### Function for word embedding (Text to vector)

In [9]:
def text_to_vectors(tokens):
    # Convert tokens to vectors
    vectors = []
    for token in tokens:
        if token in wv:
            vectors.append(wv[token])
    
    # Calculate the average vector
    if vectors:
        avg_vector = np.mean(vectors, axis=0)
    else:
        avg_vector = np.zeros(300)  # If no vectors found, return zero vector
    
    return avg_vector

In [10]:
text = "Your fake news text goes here"
vector = text_to_vectors(text)
print(vector)

[-1.83893621e-01  1.15242340e-01  9.02258849e-04  1.28502890e-01
  3.31447436e-03  2.15016250e-02 -1.13257363e-01 -1.91650391e-02
 -2.30155606e-02  4.63336427e-03 -5.61366715e-02 -5.05530313e-02
 -1.95312500e-01  1.57895293e-02 -1.30986750e-01  1.21651031e-01
  1.21927015e-01  1.55517578e-01 -1.90177578e-02  4.88068946e-02
 -2.50774890e-01 -5.92671260e-02  9.22639295e-02  6.77266344e-02
 -1.09454609e-01  3.56145948e-02 -1.87606141e-01  3.56976055e-02
  2.49713403e-03 -9.23170000e-02 -4.45967950e-02  3.09590884e-02
 -8.00900683e-02 -5.67839257e-02 -9.31449533e-02  5.38930483e-02
 -2.14959189e-01  5.37003241e-02 -4.56035435e-02  7.66521916e-02
 -4.09545898e-02 -1.19575830e-02  5.58498204e-02  5.75959571e-02
  4.31839488e-02 -5.74632734e-02 -8.52369219e-02 -1.62597656e-01
 -1.20854914e-01  7.11033046e-02 -1.57984197e-01  2.29003906e-01
 -5.80152422e-02  1.89867109e-01  1.08581543e-01  1.90700367e-01
 -1.89867109e-01 -1.64644659e-01  2.28271484e-02 -1.82744563e-01
 -9.34481397e-02 -1.20886

In [11]:
df.dropna(inplace=True)

In [12]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [13]:
df['tokens'] = df['text'].apply(preprocess_text)

In [14]:
df['vector'] = df['tokens'].apply(text_to_vectors)

In [15]:
df.head()

Unnamed: 0,text,label,tokens,vector
0,No comment is expected from Barack Obama Membe...,1,"[comment, expect, Barack, Obama, Members, FYF9...","[0.02989684, 0.0266612, 0.018265275, 0.0700507..."
1,Did they post their votes for Hillary already?,1,"[post, vote, Hillary]","[0.045410156, -0.085250854, -0.049153645, -0.0..."
2,"Now, most of the demonstrators gathered last ...",1,"[ , demonstrator, gather, night, exercise, con...","[0.040579915, 0.06442566, 0.008007812, 0.10589..."
3,A dozen politically active pastors came here f...,0,"[dozen, politically, active, pastor, come, pri...","[0.07117676, 0.0048914626, -0.0015993273, 0.08..."
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[rs-28, Sarmat, missile, dub, Satan, 2, replac...","[-0.012512591, 0.05977534, 0.06800325, 0.07641..."


If you want to save this new dataset that contain vector column do not save in .csv because it may convert it into a string datatype by adding some \n , etc i wasted so much time in changing this csv but it didn't worked

so to save such vector you can save in .pkl or .h5 because this can store complex data like vectors in then 

In [16]:
# To save in pickle format
# df.to_pickle("./Data/news_Fake_Vectorized.pkl")

# to load a pickle dataset
# df = pd.read_pickle("./Data/news_Fake_Vectorized.pkl")