Import dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Read file and examine

In [2]:
df=pd.read_csv("/content/WELFake_Dataset.csv", engine="python", on_bad_lines="skip")
print(df)
print(df.columns)
# # df.head()
# print(df.shape)

     Unnamed: 0                                              title  \
0             0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1             1                                                NaN   
2             2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3             3  Bobby Jindal, raised Hindu, uses story of Chri...   
4             4  SATAN 2: Russia unvelis an image of its terrif...   
...         ...                                                ...   
7054       7034  Climate change should not be 'partisan issue,'...   
7055       7035  Democrats' new line on gun control: Do it for ...   
7056       7036  Can The American People Defeat The Oligarchy T...   
7057       7037  BUDGET DEAL Quadruples Number Of Unskilled Gue...   
7058       7038   Meet The Cowardly Republicans Who Re-Endorsed...   

                                                   text label  
0     No comment is expected from Barack Obama Membe...     1  
1        Did they post their vo

In [3]:
df.dropna(subset=['title', 'text', 'label'], inplace=True)

In [4]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [5]:
df = df.drop(columns=['Unnamed: 0'])

In [6]:
print(df)

                                                  title  \
0     LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
2     UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3     Bobby Jindal, raised Hindu, uses story of Chri...   
4     SATAN 2: Russia unvelis an image of its terrif...   
5     About Time! Christian Group Sues Amazon and SP...   
...                                                 ...   
7054  Climate change should not be 'partisan issue,'...   
7055  Democrats' new line on gun control: Do it for ...   
7056  Can The American People Defeat The Oligarchy T...   
7057  BUDGET DEAL Quadruples Number Of Unskilled Gue...   
7058   Meet The Cowardly Republicans Who Re-Endorsed...   

                                                   text label  
0     No comment is expected from Barack Obama Membe...     1  
2      Now, most of the demonstrators gathered last ...     1  
3     A dozen politically active pastors came here f...     0  
4     The RS-28 Sarmat missile, dub

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
port_stem=PorterStemmer()

In [10]:
def stemming(x):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', x)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [11]:
df['title']=df['title'].apply(stemming)

In [12]:
print(df.title)

0       law enforc high alert follow threat cop white ...
2       unbeliev obama attorney gener say charlott rio...
3       bobbi jindal rais hindu use stori christian co...
4       satan russia unv imag terrifi new supernuk wes...
5       time christian group sue amazon splc design ha...
                              ...                        
7054                 climat chang partisan issu kerri say
7055           democrat new line gun control nation secur
7056                 american peopl defeat oligarchi rule
7057      budget deal quadrupl number unskil guest worker
7058    meet cowardli republican endors trump condemn ...
Name: title, Length: 6984, dtype: object


In [13]:
X=df["title"].values
y=df["label"].values

In [14]:
vectorizer=TfidfVectorizer()

In [15]:
X = vectorizer.fit_transform(df["title"])

In [16]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=2)

In [17]:
model=LogisticRegression().fit(X_train, y_train)

In [18]:
x_predict=model.predict(X_train)
accuracy=accuracy_score(x_predict, y_train)
print(accuracy)

0.954895292643637


In [41]:
input_string = "At U.S.-China summit, Trump presses Xi on trade, North Korea; progress cited"
input_transformed = vectorizer.transform([input_string])
prediction = model.predict(input_transformed)
print(prediction)
if prediction[0]=='1':
    print("its fake")
else:
    print("its True")

['0']
its True


In [20]:
import pickle
with open("Fake_news.pickle", "wb") as f:
    pickle.dump(model,f)

In [42]:
import pickle
with open("vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer,f)