# Fake News Detection

Importing dependency

In [64]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression


Loading Dataset

In [54]:
#we use pandas to load our dataset
train_df= pd.read_csv('train.csv')

In [55]:
#check the structure of dataset
train_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [56]:
#drop the author,title and id columns from dataset
train_df = train_df.drop("author", axis = 1)
train_df = train_df.drop("title", axis = 1)
train_df = train_df.drop("id", axis = 1)

In [57]:
#check the shape of the dataset
train_df.shape

(20800, 2)

In [58]:
# check if the dataset has any missing values
train_df.isnull().sum()

text     39
label     0
dtype: int64

In [59]:
# replacing the null values with empty string
train_df = train_df.fillna('')

In [60]:
# check if the dataset has any missing values
train_df.isnull().sum()

text     0
label    0
dtype: int64

In [61]:
#To check the value_count for each labels
train_df.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

Pre-Processing the Dataset

In [62]:
# Define a function to clean the text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove numbers and punctuation
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the lemmatized tokens back into a single string
    cleaned_text = ' '.join(lemmatized_tokens)
    
    return cleaned_text

In [65]:
#clean the text
train_df['cleaned_text']=train_df['text'].apply(clean_text)

In [66]:
#to get clean dataset
train_df

Unnamed: 0,text,label,cleaned_text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide ’ even see comey ’ letter jason...
1,Ever get the feeling your life circles the rou...,0,ever get feeling life circle roundabout rather...
2,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october tension intellig...
3,Videos 15 Civilians Killed In Single US Airstr...,1,video civilian killed single u airstrike ident...
4,Print \nAn Iranian woman has been sentenced to...,1,print iranian woman sentenced six year prison ...
...,...,...,...
20795,Rapper T. I. unloaded on black celebrities who...,0,rapper unloaded black celebrity met donald tru...
20796,When the Green Bay Packers lost to the Washing...,0,green bay packer lost washington redskin week ...
20797,The Macy’s of today grew from the union of sev...,0,macy ’ today grew union several great name ame...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1,nato russia hold parallel exercise balkan pres...


Model features and labels

In [67]:
# X-features,y-labels
X = train_df["cleaned_text"]
y = train_df.label.values


Our feature for training is the cleaned_text variable and the target is the label variable.

Dataset splitting

In [68]:
#split the dataset into train and text data. the test size is 15% of the entire dataset

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)


Pipeline approach

In [69]:
# Create a classifier model in pipeline
text_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('linear_model ',LogisticRegression())
])

Train the Model

In [70]:
# train the text_classifier model
text_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('linear_model ', LogisticRegression())])

In [71]:
# create a prediction from the test set
y_preds = text_classifier.predict(X_test)

In [72]:
# Check Accuracy
accuracy_score(y_test,y_preds)

0.944551282051282

In [73]:
# Make a Prediction
sample_text = "Ever get the feeling your life circles the roundabout rather than heads in a straight line towards the intended destination?[Hillary Clinton remains the big woman on campus in leafy,liberal Wellesley,Massachusetts.Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.Speaking of Great Expectations,Hillary Rodham overflowed with them 48years ago when she first addressed a Wellesley graduation class."
text_classifier.predict([sample_text])
prediction = text_classifier.predict([sample_text])
if prediction[0] == 0:
    print("Prediction of the News :  Looking FakeNews ")
else:
    print("Prediction of the News : Looking RealNews ")

Prediction of the News : Looking RealNews 


save model pipeline

In [74]:
import os

print(os.getcwd())


C:\Users\sneka


In [75]:
#create a pickel file using serialization and save the model
import pickle
pickle_out = open("FakeNewsDetection/text_classifier.pkl","wb")
pickle.dump(text_classifier,pickle_out)
pickle_out.close()