# Fake News Classification

#### Importing Libraries

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB, GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from lime.lime_text import LimeTextExplainer
from sklearn.linear_model import LogisticRegression
import random
from sklearn.ensemble import RandomForestClassifier
import time

#### Importing Data

In [2]:
df = pd.read_csv('FakeNews_data.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


#### Creating Copy of data

In [4]:
data = df.copy()

Copy of the original dataset is created to prevent changes on the original dataset

#### Data Preprocessing

In [5]:
data = data.drop('Unnamed: 0', axis = 1)

In [6]:
data['Full_text'] = data['title'] + data['text']
data = data.drop(['title','text'], axis = 1)

In [7]:
data

Unnamed: 0,label,Full_text
0,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,
2,1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...
72129,0,Russians steal research on Trump in hack of U....
72130,1,WATCH: Giuliani Demands That Democrats Apolog...
72131,0,Migrants Refuse To Leave Train At Refugee Camp...
72132,0,Trump tussle gives unpopular Mexican leader mu...


In [8]:
data.isnull().sum()

label          0
Full_text    597
dtype: int64

In [9]:
data = data.dropna()
data.isnull().sum()

label        0
Full_text    0
dtype: int64

In [10]:
data.duplicated().sum()

8416

In [11]:
data.drop_duplicates(keep = 'first')

Unnamed: 0,label,Full_text
0,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
2,1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,1,SATAN 2: Russia unvelis an image of its terrif...
5,1,About Time! Christian Group Sues Amazon and SP...
...,...,...
72127,1,WIKILEAKS EMAIL SHOWS CLINTON FOUNDATION FUNDS...
72129,0,Russians steal research on Trump in hack of U....
72130,1,WATCH: Giuliani Demands That Democrats Apolog...
72131,0,Migrants Refuse To Leave Train At Refugee Camp...


#### Data Splitting

In [12]:
x = data['Full_text']
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### Lime Model

In [13]:
class LimeModel:
    def __init__(self, text_classifier):
        self.text_classifier = text_classifier
        self.vectorizer = TfidfVectorizer()
        self.explainer = LimeTextExplainer(class_names=text_classifier.classes_)

    def fit(self, x_train, y_train):
        x_train_tfidf = self.vectorizer.fit_transform(x_train)
        self.text_classifier.fit(x_train_tfidf, y_train)

    def predict(self, x):
        x_tfidf = self.vectorizer.transform(x)
        return self.text_classifier.predict(X_tfidf)

    def predict_proba(self, x):
        x_tfidf = self.vectorizer.transform(x)
        return self.text_classifier.predict_proba(x_tfidf)

    def explain_instance(self, text_instance, true_label=None):
        random.seed(42)
        if true_label is None:
            # If true_label is not provided, explain based on the predicted label
            predicted_label = self.predict([text_instance])[0]
            explanation = self.explainer.explain_instance(
                text_instance, self.predict_proba, num_features=10, top_labels=1, labels=[predicted_label]
            )
        else:
            # Explain based on the true_label
            explanation = self.explainer.explain_instance(
                text_instance, self.predict_proba, num_features=10, top_labels=1, labels=[true_label]
            )
        return explanation

    def predict_description(self, description):
        return self.predict([description])[0]

#### Naive Bayes

In [14]:
# Create a pipeline for Naive Bayes model
nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Define the parameters for the pipeline
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1,0.15],
    'tfidf__max_features': [None, 500, 1000, 2000],
    'tfidf__stop_words': [None, 'english'],
    'clf': [MultinomialNB(), BernoulliNB(), ComplementNB()],
    'clf__fit_prior': [True, False],
}

# Fit the pipeline with the training data and find the best parameters using GridSearchCV
nb_grid = GridSearchCV(nb_pipeline, parameters, cv=5, n_jobs=-1)
nb_grid.fit(x_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best parameters: ", nb_grid.best_params_)

# Predict the disease labels using the test set
y_pred = nb_grid.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_pred, digits = 2))

Best parameters:  {'clf': BernoulliNB(alpha=0.1, fit_prior=False), 'clf__alpha': 0.1, 'clf__fit_prior': False, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      7081
           1       0.91      0.96      0.93      7227

    accuracy                           0.93     14308
   macro avg       0.93      0.93      0.93     14308
weighted avg       0.93      0.93      0.93     14308

