<a href="https://colab.research.google.com/github/Lakshmana219/ML-Work/blob/master/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import itertools
import re

import numpy as np
import pandas as pd
import plotly.express as px

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources the preprocessing cells rely on: the stopword lists
# (read through `stopwords.words`) and the 'punkt' tokenizer models (used by
# `word_tokenize`). The `nltk` name comes from the notebook's import cell.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Loading Data

In [None]:
# Load the dataset; 'data.csv' is resolved relative to the kernel's working directory.
df = pd.read_csv('data.csv')

Columns in DataFrame

In [None]:
# Show the column labels to confirm which fields were loaded.
df.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

Shape of DataFrame

In [None]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

(4009, 4)

In [None]:
# Per-column non-null counts and dtypes; a quick way to spot columns with gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


Missing Value counts in DF

In [None]:
# Count the missing values in each column.
df.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

Out of 4009 rows, 21 have a missing `Body`. Dropping these rows removes only about 0.5% of the data, so it should not materially affect the model.

In [None]:
# Drop every row that has a missing value in any column. This mutates `df` in
# place, so the shape/info outputs displayed earlier describe the frame as it
# was before this step.
df.dropna(inplace=True)

Sentiment size

In [None]:
# Rows per label value, as a two-column frame: 'sentiment' holds the label
# value and 'size' holds how many rows carry it.
label_counts = df['Label'].value_counts()
sentiment_size = label_counts.rename_axis('sentiment').reset_index(name='size')

In [None]:
# Display the per-label counts computed in the previous cell.
sentiment_size

Unnamed: 0,sentiment,size
0,0,2120
1,1,1868


In [None]:
# Bar chart of row count per label value; the bars are also coloured by that same count.
px.bar(sentiment_size, x='sentiment', y='size', color='size')

Drop Features

In [None]:
def drop_features(features, data):
  """Remove the given columns from `data` in place.

  Args:
    features: A column label, or a list of column labels, to remove.
    data: The DataFrame to modify. It is mutated; nothing is returned.

  Raises:
    KeyError: If any requested label is not a column of `data`.
  """
  data.drop(columns=features, inplace=True)

Text Processing

In [None]:
def text_process(text):
  """Normalise raw text into lower-case, space-separated alphanumeric tokens.

  The input is converted with `str()` and lower-cased. Each @mention (an '@'
  followed by letters/digits) and each character that is not a letter, digit,
  space or tab is replaced by a space. Runs of whitespace are then collapsed to
  a single space and the ends are trimmed.

  Args:
    text: Any object; it is passed through `str()` first, so non-string values
      (including None) are processed as their string form.

  Returns:
    The cleaned string (possibly empty).
  """
  lowered = str(text).lower()
  cleaned = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", lowered)
  tokens = cleaned.split()
  return " ".join(tokens)

Removing Stopwords

In [None]:
def _english_stopwords():
  """Return NLTK's English stopword list as a frozenset, loading it only once."""
  if not hasattr(_english_stopwords, "cached"):
    # Cache on the function object: the list is otherwise re-read on every call,
    # and this helper is invoked once per DataFrame row.
    _english_stopwords.cached = frozenset(stopwords.words('english'))
  return _english_stopwords.cached


def remove_stopwords(text):
  """Return `text` with English stopwords removed.

  The text is split with `word_tokenize`, tokens that appear in NLTK's English
  stopword list are discarded, and the remaining tokens are re-joined with
  single spaces.

  Uses the `stopwords` corpus object imported at the top of the notebook rather
  than reaching through the `nltk` package name, and tests membership against a
  cached set instead of rebuilding and scanning a list for every token.

  Args:
    text: The string to filter. Tokens are compared to the stopword list by
      exact string equality, so the text is expected to be normalised already
      (the notebook passes the output of `text_process`, which lower-cases).

  Returns:
    The filtered text as a single space-joined string.
  """
  english_stopwords = _english_stopwords()
  kept_tokens = [token for token in word_tokenize(text) if token not in english_stopwords]
  return " ".join(kept_tokens)

Applying Text Process over DF Body

In [None]:
# Clean each article body with text_process and keep the result in a new column.
df['processed_text'] = df['Body'].apply(text_process)

Applying Remove Stopwords over DF Processed Text

In [None]:
# Strip English stopwords from the cleaned text; this column is what the model is trained on.
df['removed_stopwords'] = df['processed_text'].apply(remove_stopwords)

Dropping Features DF

In [None]:
# Keep only 'Label' and 'removed_stopwords'; the dropped columns are not used below.
# The drop happens in place, so re-running this cell without rebuilding `df`
# raises KeyError because the columns are already gone.
drop_features(['URLs', 'Headline', 'Body', 'processed_text'], df)

In [None]:
# Features: the cleaned, stopword-free article text. Target: the integer 'Label' column.
X = df['removed_stopwords']
y = df['Label']

Splitting the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

## Count Vectorizer and Tfidf Transformer

Vectorizing both training and testing feature sets.

The count vectorizer converts the text into a sparse matrix holding the number of occurrences of each word in each document. The TF-IDF transformer then reweights those counts by term frequency and inverse document frequency, so that words appearing in many documents are down-weighted and words that are distinctive for a document are up-weighted. This keeps the features from being dominated by very common words, as raw counts would be.

In [None]:
# Bag-of-words counts followed by TF-IDF reweighting (L2-normalised rows,
# sublinear term-frequency scaling).
count_vec = CountVectorizer()
tfidf_transform = TfidfTransformer(norm = 'l2', sublinear_tf = True)

# Learn the vocabulary and the IDF weights from the training split only.
X_train_count = count_vec.fit_transform(X_train)
# Reweight the training counts with TF-IDF.
X_train_tfidf = tfidf_transform.fit_transform(X_train_count) 

# Apply the already-fitted vocabulary and IDF weights to the test split
# (transform, not fit_transform), so nothing is learned from the held-out data.
X_test_count = count_vec.transform(X_test)
X_test_tfidf = tfidf_transform.transform(X_test_count)

## Passive Aggressive Classifier

The passive-aggressive algorithms are a family of algorithms for large-scale learning. They are similar to the Perceptron in that they do not require a learning rate. However, contrary to the Perceptron, they include a regularization parameter C.

In [None]:
# Seed the classifier: it shuffles the training data between epochs
# (shuffle=True by default), so without a fixed random_state the fitted weights
# and the reported metrics change from run to run. The seed value matches the
# one used for train_test_split.
model = PassiveAggressiveClassifier(max_iter=20, random_state=2)

Training the Passive Aggressive Classifier on the training data

In [None]:
# Fit the classifier on the TF-IDF features of the training split only.
model.fit(X_train_tfidf, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=20, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

Predicting on testing data

In [None]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test_tfidf)

Confusion Matrix

In [None]:
# Confusion matrix for the test split; true labels are passed first, predictions second.
confusion_matrix(y_test, y_pred)

array([[431,   4],
       [  3, 360]])

Testing Score

In [None]:
# Mean accuracy of the fitted model on the held-out test split.
test_accuracy = model.score(X_test_tfidf, y_test)
print('Score on testing data: ', test_accuracy)

Score on testing data:  0.9912280701754386


Classification Report

In [None]:
# Per-class precision, recall and F1 on the test split; printed so the report's own layout is kept.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       435
           1       0.99      0.99      0.99       363

    accuracy                           0.99       798
   macro avg       0.99      0.99      0.99       798
weighted avg       0.99      0.99      0.99       798

