<a href="https://colab.research.google.com/github/Mamoon5/ML-Projects/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec,KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/drive/MyDrive/train.csv')

In [5]:
news_dataset.shape

(20800, 5)

In [6]:
# print the first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [8]:
# dropping the null values
news_dataset = news_dataset.dropna()

In [9]:
# merging the news title & news text
news_dataset['content'] = news_dataset['title']+' '+news_dataset['text']

In [10]:
print(news_dataset['content'])

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2        Why the Truth Might Get You Fired Why the Trut...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
20795    Rapper T.I.: Trump a ’Poster Child For White S...
20796    N.F.L. Playoffs: Schedule, Matchups and Odds -...
20797    Macy’s Is Said to Receive Takeover Approach by...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799    What Keeps the F-35 Alive   David Swanson is a...
Name: content, Length: 18285, dtype: object


In [11]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [12]:
X.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...


In [13]:
# Remove Punctuations
import string
exclude=string.punctuation

In [20]:
def preprocess(text):
  preproc = text.lower()
  #preproc = sent_tokenize(preproc)
  simple_preprocess(preproc)
  remove_stopwords(preproc)
  return preproc

In [21]:
news_dataset['content'] = news_dataset['content'].apply(preprocess)

In [22]:
news_dataset['content']

0        house dem aide: we didn’t even see comey’s let...
1        flynn: hillary clinton, big woman on campus - ...
2        why the truth might get you fired why the trut...
3        15 civilians killed in single us airstrike hav...
4        iranian woman jailed for fictional unpublished...
                               ...                        
20795    rapper t.i.: trump a ’poster child for white s...
20796    n.f.l. playoffs: schedule, matchups and odds -...
20797    macy’s is said to receive takeover approach by...
20798    nato, russia to hold parallel exercises in bal...
20799    what keeps the f-35 alive   david swanson is a...
Name: content, Length: 18285, dtype: object

In [23]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [24]:
X[0]

'house dem aide: we didn’t even see comey’s letter until jason chaffetz tweeted it house dem aide: we didn’t even see comey’s letter until jason chaffetz tweeted it by darrell lucus on october 30, 2016 subscribe jason chaffetz on the stump in american fork, utah ( image courtesy michael jolley, available under a creative commons-by license) \nwith apologies to keith olbermann, there is no doubt who the worst person in the world is this week–fbi director james comey. but according to a house democratic aide, it looks like we also know who the second-worst person is as well. it turns out that when comey sent his now-infamous letter announcing that the fbi was looking into emails that may be related to hillary clinton’s email server, the ranking democrats on the relevant committees didn’t hear about it from comey. they found out via a tweet from one of the republican committee chairmen. \nas we now know, comey notified the republican chairmen and democratic ranking members of the house in

In [25]:
Y.shape

(18285,)

In [34]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [35]:
print(X)

  (0, 146043)	0.008427911828900101
  (0, 145719)	0.043305979851628526
  (0, 145677)	0.009877205814900116
  (0, 145667)	0.01035813177877744
  (0, 144647)	0.0389773904668947
  (0, 144641)	0.0837096343262954
  (0, 144575)	0.03310503474512966
  (0, 144520)	0.012117587649828555
  (0, 144054)	0.024432764966366916
  (0, 143890)	0.022833331047837555
  (0, 143645)	0.008007130605435949
  (0, 143300)	0.03569654990819754
  (0, 143095)	0.007795908122223867
  (0, 143063)	0.024482433332852088
  (0, 143001)	0.02410754745444914
  (0, 142755)	0.01496991233928377
  (0, 142649)	0.011104605086675547
  (0, 142540)	0.026263853075255694
  (0, 142442)	0.012486016632018732
  (0, 142280)	0.061804057780683014
  (0, 142211)	0.010761408665365818
  (0, 141987)	0.05300365180025859
  (0, 141148)	0.02503609271620633
  (0, 141123)	0.019256080806295508
  (0, 141099)	0.015588259513800959
  :	:
  (18284, 6415)	0.014436491423095867
  (18284, 6341)	0.035668383290156376
  (18284, 5982)	0.0112094160160227
  (18284, 5190)	0.016

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [37]:
model = LogisticRegression()

In [38]:
model.fit(X_train, Y_train)

LogisticRegression()

In [39]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [40]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9749111293409899


In [41]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [42]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9567951873120044


AttributeError: ignored