<a href="https://colab.research.google.com/github/N00B-MA5TER/ML-Projects/blob/main/Fake_News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-processing

In [4]:
#Loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/WELFake_Dataset.csv', on_bad_lines='skip')

In [5]:
news_dataset.shape

(72134, 4)

In [6]:
#printing the first five rows of the dataset
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [7]:
# renaming the unnamed column "Unnamed: 0" to "id"
news_dataset.rename(columns = {'Unnamed: 0': 'id'}, inplace=True)

In [8]:
news_dataset.head()

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [9]:
#counting the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,558
text,39
label,0


In [10]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [11]:
#separating the data & label
X = news_dataset.drop(columns = 'label', axis = 1)
Y = news_dataset['label']

In [12]:
print(X)
print(Y)

          id                                              title  \
0          0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1          1                                                      
2          2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3          3  Bobby Jindal, raised Hindu, uses story of Chri...   
4          4  SATAN 2: Russia unvelis an image of its terrif...   
...      ...                                                ...   
72129  72129  Russians steal research on Trump in hack of U....   
72130  72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131  72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132  72132  Trump tussle gives unpopular Mexican leader mu...   
72133  72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  
0      No comment is expected from Barack Obama Membe...  
1         Did they post their votes for Hillary already?  
2       Now, most 

Stemming:

Stemming is the process of reducing a word to it's root word

Example : actor, actress, acting --> act

In [13]:
port_stem = PorterStemmer()

In [14]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [15]:
news_dataset['title'] = news_dataset['title'].apply(stemming)

In [16]:
print(news_dataset['title'])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [17]:
#separating the data and label
X = news_dataset['title'].values
Y = news_dataset['label'].values

In [18]:
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


In [19]:
print(Y)

[1 1 1 ... 0 0 1]


In [20]:
#converting the textual data to numerical data
vectorizer = TfidfVectorizer()

#Tf : Term frequency
# Basically counts the number of times a word is repeated in a document

#idf : Inverse document frequency
# It finds the words that have been repeated a lot of times and reduces the significance value of those words

vectorizer.fit(X)

X = vectorizer.transform(X)

In [21]:
print(X)

  (0, 407)	0.3190180925014663
  (0, 1802)	0.33473541566384035
  (0, 3679)	0.24871262252022117
  (0, 5509)	0.31820565801047196
  (0, 6425)	0.28932771754845743
  (0, 6730)	0.48553136502134386
  (0, 7887)	0.26746434949988324
  (0, 9699)	0.22829788917209384
  (0, 17260)	0.24871262252022117
  (0, 17363)	0.2542650376115143
  (0, 18648)	0.1297506867782943
  (0, 19106)	0.19134939529376566
  (2, 1049)	0.28404017886581956
  (2, 2673)	0.30809679188606154
  (2, 2919)	0.3639616996972358
  (2, 6880)	0.2652283770602196
  (2, 8020)	0.2692285294185893
  (2, 11864)	0.2231406266784195
  (2, 12011)	0.16878852994653004
  (2, 12744)	0.27904818164471595
  (2, 13591)	0.22687620695463123
  (2, 14591)	0.3580030298678158
  (2, 15094)	0.1609967301122813
  (2, 16446)	0.1999703023632961
  (2, 18034)	0.35962437110547785
  :	:
  (72130, 17778)	0.13227219506940732
  (72130, 18936)	0.25302499393443006
  (72131, 2566)	0.3967249021272091
  (72131, 8206)	0.46269177743112333
  (72131, 9752)	0.3384827653769501
  (72131, 109

Splitting the dataset into training and test data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

Training the model : Logistic Regression

In [23]:
model = LogisticRegression()

In [24]:
model.fit(X_train, Y_train)

Evaluation : Accuracy Score

In [25]:
#accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [26]:
print("Accuracy score of training data : ", training_data_accuracy)

Accuracy score of training data :  0.9193858630668723


In [27]:
#accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [28]:
print("Accuracy score of test data : ", test_data_accuracy)

Accuracy score of test data :  0.900603035974215


Making a Predictive System

In [29]:
X_new = X_test[1]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0] == 0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [30]:
print(Y_test[1])

0


# Task
Test the accuracy of a logistic regression model trained on data preprocessed using lemmatization and Word2Vec, following all other previous preprocessing steps, and compare its accuracy to the previous model that used stemming and TF-IDF.

## Implement lemmatization

### Subtask:
Create a new function for lemmatization to replace the stemming function.


**Reasoning**:
The subtask requires creating a new function for lemmatization, which involves importing necessary libraries, downloading NLTK data, instantiating a lemmatizer, and defining the function as per the instructions. These steps can be grouped into a single code block.



In [31]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatization(content):
  lemmatized_content = re.sub('[^a-zA-Z]', ' ', content)
  lemmatized_content = lemmatized_content.lower()
  lemmatized_content = lemmatized_content.split()
  lemmatized_content = [wordnet_lemmatizer.lemmatize(word, pos='a') for word in lemmatized_content if not word in stopwords.words('english')]
  lemmatized_content = [wordnet_lemmatizer.lemmatize(word, pos='n') for word in lemmatized_content if not word in stopwords.words('english')]
  lemmatized_content = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in lemmatized_content if not word in stopwords.words('english')]
  lemmatized_content = [wordnet_lemmatizer.lemmatize(word, pos='r') for word in lemmatized_content if not word in stopwords.words('english')]
  lemmatized_content = ' '.join(lemmatized_content)
  return lemmatized_content

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Apply lemmatization

### Subtask:
Apply the new lemmatization function to the 'title' column of the dataset.


**Reasoning**:
Apply the lemmatization function to the 'title' column of the news_dataset DataFrame and display the first few rows to verify the changes.



In [32]:
news_dataset['title'] = news_dataset['title'].apply(lemmatization)
news_dataset.head()

Unnamed: 0,id,title,text,label
0,0,law enforc high alert follow threat cop white ...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,unbeliev obama attorney gener say charlott rio...,"Now, most of the demonstrators gathered last ...",1
3,3,bobbi jindal rais hindu use stori christian co...,A dozen politically active pastors came here f...,0
4,4,satan russia unv imag terrifi new supernuk wes...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


## Implement word2vec

### Subtask:
Train a Word2Vec model on the preprocessed text data.


**Reasoning**:
Import the Word2Vec class from gensim.models and tokenize the lemmatized text data in the 'title' column. Then, instantiate and train a Word2Vec model on the tokenized data.



In [33]:
!pip install gensim



In [34]:
from gensim.models import Word2Vec

tokenized_titles = [title.split() for title in news_dataset['title']]

# Instantiate and train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_titles, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(tokenized_titles, total_examples=len(tokenized_titles), epochs=10)



(5911045, 6455100)

In [35]:
# Function to create document vectors
def create_document_vector(tokens, model):
    # Filter out tokens not in the vocabulary
    tokens = [token for token in tokens if token in model.wv]
    if not tokens:
        return np.zeros(model.vector_size)
    # Calculate the mean of the word vectors
    return np.mean(model.wv[tokens], axis=0)

# Create document vectors for the entire dataset
X_word2vec = np.array([create_document_vector(tokens, word2vec_model) for tokens in tokenized_titles])
Y_word2vec = news_dataset['label'].values

# Split data
X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v = train_test_split(X_word2vec, Y_word2vec, test_size=0.2, stratify=Y_word2vec, random_state=2)

# Train Logistic Regression Model
model_w2v = LogisticRegression()
model_w2v.fit(X_train_w2v, Y_train_w2v)

# Evaluate Model
X_train_prediction_w2v = model_w2v.predict(X_train_w2v)
training_data_accuracy_w2v = accuracy_score(X_train_prediction_w2v, Y_train_w2v)

X_test_prediction_w2v = model_w2v.predict(X_test_w2v)
test_data_accuracy_w2v = accuracy_score(X_test_prediction_w2v, Y_test_w2v)

# Compare Results
print("Accuracy score of training data (Word2Vec + Lemmatization): ", training_data_accuracy_w2v)
print("Accuracy score of test data (Word2Vec + Lemmatization): ", test_data_accuracy_w2v)
print("Accuracy score of test data (TF-IDF + Stemming): ", test_data_accuracy)

Accuracy score of training data (Word2Vec + Lemmatization):  0.8498795640043669
Accuracy score of test data (Word2Vec + Lemmatization):  0.8536771331531157
Accuracy score of test data (TF-IDF + Stemming):  0.900603035974215
