<a href="https://colab.research.google.com/github/MacroAndMicro/ML-Models/blob/main/fake_news_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the libraries


In [69]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [70]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Importing the dataset

In [72]:
train_data = pd.read_csv('/content/train.csv', skiprows=[1976])


In [73]:
train_data.shape

(1975, 5)

In [74]:
train_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


# Feature engineering

In [75]:
train_data.isnull().sum()

id          0
title      51
author    209
text        4
label       0
dtype: int64

In [76]:
train_data.fillna('')


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
1970,1970,USA – China: Who Is Responsible for North Kore...,Author,Country: China North Korea’s announcements of ...,1
1971,1971,U.S. Open Quieted Those Calling for a Roof. No...,David Waldstein and Ben Rothenberg,The United States Open added a new $150 millio...,0
1972,1972,Australia Is Not as Down Under as Everyone Thi...,Michelle Innis,"SYDNEY, Australia — That map of Australia y...",0
1973,1973,7 Tory horror film posters to send a chill dow...,Poke Staff,Next Swipe left/right 7 Tory horror film poste...,1


In [119]:
train_data['content'] = train_data['author']+" "+train_data['title']

# Processing the textual data in the dataset.

In [106]:
port_stem = PorterStemmer()

In [114]:
# to convert all the words, except the stopwords to their root words
def stemming(content):
   if isinstance(content, str):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
   else:
     return ''




In [120]:
train_data['content'] = train_data['content'].apply(stemming)

In [121]:
print(train_data['content'])

0       darrel lucu hous dem aid even see comey letter...
1       daniel j flynn flynn hillari clinton big woman...
2                  consortiumnew com truth might get fire
3       jessica purkiss civilian kill singl us airstri...
4       howard portnoy iranian woman jail fiction unpu...
                              ...                        
1970    author usa china respons north korean nuclear ...
1971    david waldstein ben rothenberg u open quiet ca...
1972    michel inni australia everyon think new york time
1973    poke staff tori horror film poster send chill ...
1974                                                     
Name: content, Length: 1975, dtype: object


In [122]:
x = train_data['content'].values
y = train_data['label'].values

In [123]:
# converting the text in x to numerical data.
vectorizer = TfidfVectorizer()
vectorizer.fit(x)
x = vectorizer.fit_transform(x)


# Climax : model training and testing

In [41]:
x_train, x_test, y_train, y_test =train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [35]:
model = LogisticRegression()

In [None]:
model.fit(x_train, y_train)

In [37]:
predicted_y = model.predict(x_train)

In [39]:
test_accuracy = accuracy_score(y_train, predicted_y)
print(test_accuracy)

0.9677215189873418


# Model testing on the test data that was splitted.

In [43]:
pred_test_y = model.predict(x_test)
test_accuracy = accuracy_score(y_test, pred_test_y)
print(test_accuracy)

0.9316455696202531


# Model trial on test.csv Radhe Radhe

In [60]:
test_data = pd.read_csv('/content/test.csv', skiprows=[2161])

In [61]:
test_data.shape

(2160, 4)

In [62]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [63]:
test_data.isnull().sum()

id          0
title      54
author    201
text        3
dtype: int64

In [64]:
test_data.fillna('')

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
2155,22955,Bernie Sanders to California Democrats: Please...,Joel B. Pollak,Former presidential candidate Sen. Bernie Sand...
2156,22956,"They Said What?!: Find Out What Paul Krugman, ...",,Email Ever wonder what’s on the mind of today’...
2157,22957,PHOTOS: German Carnival Floats Depict Decapita...,Chris Tomlinson,German cities held their annual Rose Monday fl...
2158,22958,Davos Elite Fret About Inequality Over Vintage...,Peter S. Goodman,"DAVOS, Switzerland — You have perhaps notic..."


In [124]:
test_data['content'] = test_data['author']+" "+test_data['title']

In [125]:
test_data['content'] = test_data['content'].apply(stemming)

In [127]:
print(test_data['content'])

0       david streitfeld specter trump loosen tongu pu...
1                                                        
2       common dream nodapl nativ american leader vow ...
3       daniel victor tim tebow attempt anoth comeback...
4        truth broadcast network keiser report meme war e
                              ...                        
2155    joel b pollak berni sander california democrat...
2156                                                     
2157    chri tomlinson photo german carniv float depic...
2158    peter goodman davo elit fret inequ vintag wine...
2159    gpd pressur taliban leader make afghan issu co...
Name: content, Length: 2160, dtype: object


In [128]:
xt = test_data['content'].values

In [82]:
print(xt)

  (0, 5579)	0.09598335631366806
  (0, 5310)	0.3002678375382336
  (0, 5194)	0.12114495976256674
  (0, 5117)	0.34816949036624406
  (0, 5096)	0.09427325621328314
  (0, 4861)	0.34816949036624406
  (0, 4854)	0.34816949036624406
  (0, 4734)	0.34816949036624406
  (0, 4611)	0.30821741538902786
  (0, 4003)	0.34816949036624406
  (0, 3441)	0.09363298092790194
  (0, 2964)	0.34816949036624406
  (0, 1256)	0.21439635850468433
  (2, 5504)	0.28666520787121075
  (2, 5386)	0.2924660068695555
  (2, 4802)	0.30708285295197024
  (2, 3847)	0.22782176312329566
  (2, 3489)	0.32927386518085344
  (2, 3389)	0.3468878625094909
  (2, 2857)	0.2514372132139358
  (2, 2849)	0.28154854053307593
  (2, 1837)	0.2769715256328302
  (2, 1498)	0.30708285295197024
  (2, 1007)	0.30708285295197024
  (2, 180)	0.22254966999289477
  :	:
  (2157, 1337)	0.3460861060011662
  (2157, 1279)	0.3460861060011662
  (2157, 1011)	0.3160443745233962
  (2157, 884)	0.25085607030316653
  (2157, 767)	0.32851281962993634
  (2158, 5579)	0.0969188972843

In [129]:

vectorizer = TfidfVectorizer()
vectorizer.fit(xt)
xt = vectorizer.fit_transform(xt)


In [130]:
xt.shape

(2160, 5603)

In [134]:
test_dataset_y = model.predict(xt)
test_accuracy_1 = accuracy_score(test_dataset_y, y_train)
test_accuracy_2 = accuracy_score(test_dataset_y, y_test)
print(test_accuracy_1)
print(test_accuracy_2)

ValueError: X has 5603 features, but LogisticRegression is expecting 5144 features as input.