In [3]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline

from joblib import dump

In [5]:
# Load only the headline column from the fake-news and real-news CSV files
fake, real = (
    pd.read_csv(csv_path, usecols=['title'])
    for csv_path in ('./dataset/Fake.csv', './dataset/True.csv')
)

In [7]:
# Mark titles that are exactly a single space as missing, then print the
# number of missing titles in each frame.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0+.
real[real == ' '] = np.nan
print(real.isna().sum())
fake[fake == ' '] = np.nan
print(fake.isna().sum())

title    0
dtype: int64
title    0
dtype: int64


In [8]:
# Remove sentences having less than 5 words, i.e. titles that contain fewer
# than 4 runs of whitespace. The pattern is a raw string so that "\s" is passed
# to the regex engine verbatim instead of being an invalid string escape.
real = real[~real.title.str.count(r'\s+').lt(4)]

# Add label column for real news
real['label'] = 0
real

Unnamed: 0,title,label
0,"As U.S. budget fight looms, Republicans flip t...",0
1,U.S. military to accept transgender recruits o...,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,0
3,FBI Russia probe helped by Australian diplomat...,0
4,Trump wants Postal Service to charge 'much mor...,0
...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,0
21413,LexisNexis withdrew two products from Chinese ...,0
21414,Minsk cultural hub becomes haven from authorities,0
21415,Vatican upbeat on possibility of Pope Francis ...,0


In [9]:
# Remove sentences having less than 5 words, i.e. titles that contain fewer
# than 4 runs of whitespace. The pattern is a raw string so that "\s" is passed
# to the regex engine verbatim instead of being an invalid string escape.
fake = fake[~fake.title.str.count(r'\s+').lt(4)]

# Add label column for fake news
fake['label'] = 1
fake

Unnamed: 0,title,label
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1
...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,1
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,1
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,1
23479,How to Blow $700 Million: Al Jazeera America F...,1


In [10]:
# Combine both the real and fake news dataset
df = pd.concat([real, fake])

# Shuffle the final dataset.
# A fixed seed keeps the row order, and therefore the later train/test split
# and every reported metric, identical from one run to the next.
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0,title,label
17026,TRUMP WAS RIGHT! Audit Reveals State Dept. Cyb...,1
7939,U.S. top court denies Obama request to rehear ...,0
15998,Man seized over Benghazi attack is Syrian link...,0
19492,Militant attack on minister's convoy kills two...,0
10403,Suspicious substance at Cruz's Houston headqua...,0
...,...,...
20949,Egypt says suspended U.S. military exercises t...,0
16141,"Spain's prosecutor calls for rebellion, sediti...",0
21548,OBAMA LIGHTS UP WHITE HOUSE HOURS AFTER GAY MA...,1
8015,"In first face-off with Clinton, Trump struggle...",0


In [11]:
# Number of rows per class (0 = real, 1 = fake)
df['label'].value_counts()

label
1    23448
0    21412
Name: count, dtype: int64

In [12]:
# Fetch the NLTK stop-word corpus that preprocessing() reads via stopwords.words("english")
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Fetch the WordNet data for the WordNetLemmatizer used in preprocessing()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Text preprocessing
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
  """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
  if not _PREPROCESSING_RESOURCES:
    # A set gives constant-time membership tests; loading once avoids
    # re-reading the corpus for every title.
    _PREPROCESSING_RESOURCES['stop_words'] = set(stopwords.words("english"))
    _PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return (_PREPROCESSING_RESOURCES['stop_words'],
          _PREPROCESSING_RESOURCES['lemmatizer'])


def preprocessing(tweet):
  """Clean one piece of text for vectorization.

  Steps, in order: strip HTML markup, drop URLs, replace every character that
  is not an ASCII letter or '#' with a space, lowercase, remove English stop
  words, and lemmatize the remaining words.

  Args:
    tweet: The raw text (here, a news title) as a string.

  Returns:
    The cleaned words joined by single spaces.
  """
  stop_w, lemmatizer = _preprocessing_resources()

  # Naming the parser avoids the "no parser was explicitly specified" warning
  # and keeps the result independent of which optional parsers are installed.
  text = BeautifulSoup(tweet, "html.parser").get_text() # Remove HTML tags

  # URLs must be removed while '.', ':' and '/' are still present; \S+ consumes
  # the URL up to the next whitespace.
  text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text) # Remove URLs
  text = re.sub(r"[^a-zA-Z#]", " ", text) # Remove special characters
  words = text.lower().split() # Convert to lowercase and split each word

  # Drop stop words first, then lemmatize what is left
  words = [lemmatizer.lemmatize(w) for w in words if w not in stop_w]

  return " ".join(words)

In [16]:
# Run every title through preprocessing() and store the cleaned text back in
# the 'title' column (this overwrites the raw titles)
df['title'] = df['title'].apply(preprocessing)
df['title'].head()

  text = BeautifulSoup(tweet).get_text() # Remove HTML tags


17026    trump right audit reveals state dept cybersecu...
7939     u top court denies obama request rehear major ...
15998    man seized benghazi attack syrian linked suspe...
19492    militant attack minister convoy kill two bysta...
10403    suspicious substance cruz houston headquarters...
Name: title, dtype: object

In [37]:

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class ratio the same in both partitions
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# (rows, columns) of the training and test partitions
(train.shape, test.shape)

((35888, 2), (8972, 2))

In [41]:
# TF-IDF over unigrams, bigrams and trigrams, capped at 10,000 features
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# The vocabulary and IDF weights are learned from the training titles only;
# the test titles are merely projected onto them
train_df = tfidf_vec.fit_transform(train['title'])
test_df = tfidf_vec.transform(test['title'])

In [42]:
# Before over-sampling the minority class:
# rows per label in the training partition (0 = real, 1 = fake)
train['label'].value_counts()

label
1    18758
0    17130
Name: count, dtype: int64

In [43]:
# SMOTE (Synthetic Minority Oversampling Technique) balances the classes.
# sampling_strategy=1.0 asks for a 1:1 minority-to-majority ratio, and it is
# applied to the training features only.
smt = SMOTE(sampling_strategy=1.0, random_state=18)
resampled = smt.fit_resample(train_df, train['label'])
smt_xtrain_df, smt_ytrain = resampled

In [44]:

# After over-sampling the minority class:
# rows per label in the resampled training labels returned by SMOTE
smt_ytrain.value_counts()

label
0    18758
1    18758
Name: count, dtype: int64

In [45]:
# Model 1 - Logistic Regressor, trained on the SMOTE-balanced TF-IDF features
logis_reg = LogisticRegression().fit(smt_xtrain_df, smt_ytrain)

# Score the untouched test partition
y_pred = logis_reg.predict(test_df)

# Evaluation metrics ('Negative' is label 0 = real, 'Positive' is label 1 = fake)
report = classification_report(
    y_true=test['label'],
    y_pred=y_pred,
    target_names=['Negative', 'Positive'],
)
print(report)

              precision    recall  f1-score   support

    Negative       0.93      0.95      0.94      4282
    Positive       0.95      0.94      0.94      4690

    accuracy                           0.94      8972
   macro avg       0.94      0.94      0.94      8972
weighted avg       0.94      0.94      0.94      8972



In [46]:
# Model 2 - Linear Support Vector Classifier, trained on the SMOTE-balanced
# TF-IDF features
linear_svc = LinearSVC().fit(smt_xtrain_df, smt_ytrain)

# Score the untouched test partition
y_pred = linear_svc.predict(test_df)

# Evaluation metrics ('Negative' is label 0 = real, 'Positive' is label 1 = fake)
report = classification_report(
    y_true=test['label'],
    y_pred=y_pred,
    target_names=['Negative', 'Positive'],
)
print(report)

              precision    recall  f1-score   support

    Negative       0.95      0.95      0.95      4282
    Positive       0.95      0.95      0.95      4690

    accuracy                           0.95      8972
   macro avg       0.95      0.95      0.95      8972
weighted avg       0.95      0.95      0.95      8972



In [55]:
# Model 3 - PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

# The classifier shuffles the training data between epochs by default, so a
# fixed random_state is needed for the reported metrics to be reproducible.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(smt_xtrain_df, smt_ytrain)

y_pred = pac.predict(test_df)

# Evaluation metrics ('Negative' is label 0 = real, 'Positive' is label 1 = fake)
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.94      0.93      0.93      4282
    Positive       0.93      0.95      0.94      4690

    accuracy                           0.94      8972
   macro avg       0.94      0.94      0.94      8972
weighted avg       0.94      0.94      0.94      8972



In [59]:
# Pipeline stages: TF-IDF features -> SMOTE balancing -> logistic regression
pipeline_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_features=10000)),
    ('smote', SMOTE(sampling_strategy=1.0, random_state=18)),
    ('classifier', LogisticRegression()),
]
pipeline = imbpipeline(steps=pipeline_steps)

# Fit the pipeline on the training titles, which at this point have already
# been cleaned by preprocessing()
pipeline.fit(train['title'], train['label'])

In [60]:
# Predict the label using the pipeline.
# The pipeline was fitted on titles cleaned by preprocessing(), so a raw
# headline has to go through the same cleaning before it is classified.
headline = 'The UN Chief Calls for Immediate Ceasefire in War-Torn Region to Allow Humanitarian Aid'
check = pipeline.predict([preprocessing(headline)])

# predict() returns one label per input; read the single prediction explicitly
# (1 = fake, 0 = real)
out = "Possible Fake News" if check[0] == 1 else "Possible Real News"
print(out)

Possible Real News


In [61]:
import joblib

# Persist the fitted pipeline; the list of written file names is the cell output
model_path = 'FakeNewsClassifier1.pkl'
joblib.dump(pipeline, model_path)

['FakeNewsClassifier1.pkl']