<a href="https://colab.research.google.com/github/Hari02sha26/ML_Projects/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Downloading stop words

In [24]:
# Download the NLTK 'stopwords' corpus that the text-cleaning step relies on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
# Show the English stop-word list; these tokens are filtered out during stemming.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# Data Collection and pre-processing

In [9]:
# Load the WELFake dataset from the Colab working directory.
# The pure-Python parser is used and rows that cannot be parsed are skipped.
data = pd.read_csv(
    '/content/WELFake_Dataset.csv',
    engine='python',
    on_bad_lines='skip',
)

In [10]:
# Preview the first five rows of the loaded frame.

data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [11]:
# (rows, columns) of the raw frame, before any cleaning.

data.shape

(72154, 4)

In [79]:
# Count the missing values in each column of the raw frame.

data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,565
text,57
label,20


In [20]:
# Handle missing values by discarding every row that contains at least one
# null; the result is a new frame and `data` itself is left untouched.
drop_data = data.dropna(axis=0, how='any')

In [21]:
# (rows, columns) after dropping the incomplete rows.
drop_data.shape

(71537, 4)

In [22]:
# Re-count nulls per column to check the cleaned frame.
drop_data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [31]:
# Select the 'text' column as the feature and the 'label' column as the target.
x, y = drop_data['text'], drop_data['label']

In [27]:
# Porter stemmer instance used by the `stemmer` function to reduce words to their stems.

port_stem = PorterStemmer()

In [28]:
def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  cached = getattr(_english_stop_words, '_cache', None)
  if cached is None:
    # Reading the corpus is comparatively expensive, so keep the result on the
    # function object and reuse it for every later call.
    cached = frozenset(stopwords.words('english'))
    _english_stop_words._cache = cached
  return cached


def stemmer(content):
  """Normalise raw text into a space-joined string of Porter stems.

  Every character outside a-z/A-Z is replaced by a space, the text is
  lower-cased and split on whitespace, English stop words are removed, and
  each remaining token is stemmed with the module-level `port_stem`.

  Args:
    content: The text to clean (a `str`).

  Returns:
    The surviving stems joined by single spaces; '' if no token survives.
  """
  # The stop-word list used to be rebuilt (and scanned linearly) for every
  # single token; a cached set gives the same filtering with O(1) lookups.
  stop_words = _english_stop_words()
  tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  stems = [port_stem.stem(token) for token in tokens if token not in stop_words]
  return ' '.join(stems)

In [32]:
# Run the cleaning/stemming routine over every document in x.
stemmed_x = x.map(stemmer)

In [33]:
# Inspect the stemmed documents (pandas truncates the printed Series).
print(stemmed_x)

0        comment expect barack obama member fyf fukyofl...
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
5        say one time someon su southern poverti law ce...
                               ...                        
72149    washington reuter hacker believ work russian g...
72150    know fantasyland republican never question cit...
72151    migrant refus leav train refuge camp hungari t...
72152    mexico citi reuter donald trump comb style buf...
72153    goldman sach endors hillari clinton presid gol...
Name: text, Length: 71537, dtype: object


In [34]:
# From here on the stemmed text is the feature set; the target is still the label column.
x, y = stemmed_x, drop_data['label']

In [35]:
# Create a TF-IDF vectorizer, constructed with no arguments (library defaults).

vectorizer = TfidfVectorizer()

In [36]:
# Fit the TF-IDF vectorizer on x and replace x with the resulting sparse
# numeric matrix in one step (same result as fit() followed by transform()).
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test documents influence the learned weights — consider fitting
# on the training split only.
x = vectorizer.fit_transform(x)

In [37]:
# Inspect the vectorized features (sparse matrix summary plus stored entries).
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13478335 stored elements and shape (71537, 161399)>
  Coords	Values
  (0, 935)	0.01983505722112524
  (0, 1279)	0.018053575973914514
  (0, 2124)	0.05447895909871961
  (0, 2772)	0.02105710410837073
  (0, 3983)	0.028598177596303027
  (0, 4247)	0.02479164036691028
  (0, 4317)	0.052509340266083455
  (0, 4827)	0.015872941311833737
  (0, 4843)	0.025882712506705208
  (0, 5983)	0.01518659610770437
  (0, 6471)	0.05961020371825159
  (0, 6807)	0.016496164239510197
  (0, 8393)	0.1321783813937761
  (0, 8930)	0.01619639285400206
  (0, 10427)	0.06952075657305665
  (0, 11371)	0.01970454113730467
  (0, 12663)	0.01643990277398983
  (0, 14003)	0.01911375362518469
  (0, 14608)	0.018680915976085895
  (0, 15368)	0.20136832162676274
  (0, 15425)	0.043675148119089395
  (0, 15536)	0.09230147611366957
  (0, 15809)	0.030527177603347524
  (0, 17969)	0.12447687717523162
  (0, 18375)	0.024815999613492006
  :	:
  (71536, 132001)	0.0337026101073971
  (71536

# Train and Test data splitting

In [38]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

# Model selection --> Logistic regression

In [42]:
# Logistic-regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

# Training the model

In [43]:
# Fit the classifier on the training split only.
model.fit(x_train,y_train)

# Model prediction

In [44]:
# Accuracy on the training split.
# accuracy_score is documented as (y_true, y_pred), so the ground-truth labels
# are passed first; the score itself is unchanged by the swap.

x_train_predict = model.predict(x_train)
x_train_acc = accuracy_score(y_train, x_train_predict)
print(x_train_acc)

0.9683027835537926


In [45]:
# Accuracy on the held-out test split.
# accuracy_score is documented as (y_true, y_pred), so the ground-truth labels
# are passed first; the score itself is unchanged by the swap.

x_test_predict = model.predict(x_test)
x_test_acc = accuracy_score(y_test, x_test_predict)
print(x_test_acc)

0.9473720995247414


# Model evaluation

In [77]:
# Classify a single held-out sample (row 1 of the test matrix) and report it.

inp = x_test[1]
prediction = model.predict(inp)
print(prediction)

# The predicted label may come back as the string '0' or the integer 0
# depending on how the label column was parsed; comparing via str() handles
# both instead of silently reporting every sample as fake for integer labels.
# NOTE(review): assumes label 0 means real news — confirm against the dataset docs.
if str(prediction[0]) == '0':
  print('This is real news')
else:
  print('This is fake news')

['0']
This is real news


In [78]:
# Ground-truth label of the same test row, for comparison with the prediction.
print(y_test.iloc[1])

0
