**Import necessary libraries**

In [5]:
import re   #for regular expressions
import string   #for string operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

**Load fake and real news datasets**

In [6]:
fake_data = pd.read_csv('Fake.csv')
real_data = pd.read_csv('True.csv')

**Add 'Class' column and assign class 0 to fake news and class 1 to real news**

In [7]:
fake_data['Class'] = 0      #fake news
real_data['Class'] = 1      #real news

**Print the shape of the fake and real news datasets**

In [8]:
print(fake_data.shape, real_data.shape)

(23481, 5) (21417, 5)


**Concatenate the two dataframes by rows**

In [9]:
all_data = pd.concat([fake_data, real_data], axis=0)

In [10]:
all_data.shape

(44898, 5)

**Columns of the concatenated dataframe**

In [11]:
all_data.columns

Index(['title', 'text', 'subject', 'date', 'Class'], dtype='object')

**Descriptive statistics for all columns**

In [12]:
all_data.describe(include='all')

Unnamed: 0,title,text,subject,date,Class
count,44898,44898.0,44898,44898,44898.0
unique,38729,38646.0,8,2397,
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",
freq,14,627.0,11272,182,
mean,,,,,0.477015
std,,,,,0.499477
min,,,,,0.0
25%,,,,,0.0
50%,,,,,0.0
75%,,,,,1.0


**Checking null values**

In [13]:
all_data.isnull().sum()

title      0
text       0
subject    0
date       0
Class      0
dtype: int64

**Shuffle the data**

In [14]:
all_data = all_data.sample(frac=1).reset_index(drop=True)

**Drop duplicated rows**

In [15]:
all_data.drop_duplicates(inplace=True)

In [16]:
all_data.head()

Unnamed: 0,title,text,subject,date,Class
0,Trump’s Star May Be Removed From The Hollywoo...,When a celebrity is given a star on the Hollyw...,News,"April 4, 2016",0
1,Texas ‘Responsible Gun Owner’ Shoots 6-Year-O...,When Texas governor Greg Abbott said that Texa...,News,"January 2, 2016",0
2,German SPD votes for talks with Merkel's conse...,BERLIN (Reuters) - German Social Democratic Pa...,worldnews,"December 7, 2017",1
3,Croatia parliament debates government no-confi...,ZAGREB (Reuters) - The Croatian parliament beg...,worldnews,"November 10, 2017",1
4,Kind Woman Brings Beer to Reporter During Hurr...,A kind woman brought Fox News reporter Casey S...,politics,"Aug 26, 2017",0


**Function to clean and preprocess text data**

In [17]:
def wordopt(text):
    text = text.lower()     # convert text to lower-case
    text = re.sub('\[.*?\]', '', text)  # removes square brackets
    text = re.sub("\\W"," ",text)   # remove special characters
    text = re.sub('https?://\S+|www\.\S+', '', text)    # remove links/URLs
    text = re.sub('<.*?>+', '', text)   # remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)     # remove punctuations
    text = re.sub('\n', '', text)   # remove new line characters
    text = re.sub('\w*\d\w*', '', text)    # remove words containing numbers/digits
    return text     # return the cleaned text

**Apply the wordopt function to clean the 'text' column in all_data**

In [18]:
all_data['text'] = all_data['text'].apply(wordopt)

**Assign the 'text' column to x and 'Class' column to y**

In [19]:
x = all_data['text']
y = all_data['Class']

**Split data into training and testing sets**

In [20]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=365)

In [21]:
print(train_x.shape, test_x.shape)

(31282,) (13407,)


**Initialize TF-IDF vectorizer**

In [22]:
vectorization = TfidfVectorizer()

**Vectorize training and test data using TF-IDF**

In [23]:
xv_train = vectorization.fit_transform(train_x)
xv_test = vectorization.transform(test_x)

**Initialize a LOGISTIC REGRESSION model**

In [24]:
lr = LogisticRegression()

**Fit training data into the LR model**

In [25]:
lr.fit(xv_train, train_y)

In [26]:
print(f"Accuracy score for Logistic Regression training dataset: {lr.score(xv_train, train_y)}")

Accuracy score for training dataset: 0.9903778530784477


**Predict on test data**

In [27]:
pred_lr = lr.predict(xv_test)

In [44]:
print(f"Accuracy score for Logistic Regression testing dataset: {accuracy_score(test_y, pred_lr)}")

Accuracy score for Logistic Regression testing dataset: 0.9873200566868054


**Initialize DECISION TREE CLASSIFIER model**

In [29]:
DTC = DecisionTreeClassifier()

**Fit training data into the DTC model**

In [30]:
DTC.fit(xv_train, train_y)

In [31]:
print(f"Accuracy score for Decision Tree Classifier training dataset: {DTC.score(xv_train, train_y)}")

Accuracy score for training dataset: 0.9999680327344799


**Predict on test data**

In [32]:
pred_dtc = DTC.predict(xv_test)

In [33]:
print(f"Accuracy score for testing dataset: {accuracy_score(test_y, pred_dtc)}")

Accuracy score for testing dataset: 0.996270604907884


**Initialize RANDOM FOREST CLASSIFIER model**

In [34]:
RFC = RandomForestClassifier()

**Fit training data into RFC model**

In [35]:
RFC.fit(xv_train, train_y)

In [36]:
print(f"Accuracy score for training dataset: {RFC.score(xv_train, train_y)}")

Accuracy score for training dataset: 0.9999680327344799


**Predict on test data**

In [37]:
pred_rfc = RFC.predict(xv_test)

In [38]:
print(f"Accuracy score for testing dataset: {accuracy_score(test_y, pred_rfc)}")

Accuracy score for testing dataset: 0.9890355784291788


**Initialize GRADIENT BOOSTING CLASSIFIER model**

In [39]:
GBC = GradientBoostingClassifier()

**Fit training data into GBC model**

In [40]:
GBC.fit(xv_train, train_y)

In [41]:
print(f"Accuracy score for training dataset: {GBC.score(xv_train, train_y)}")

Accuracy score for training dataset: 0.9968352407135094


**Predict on test data**

In [42]:
pred_gbc = GBC.predict(xv_test)

In [43]:
print(f"Accuracy score for testing dataset: {accuracy_score(test_y, pred_gbc)}")

Accuracy score for testing dataset: 0.9961214291041993


**Accuracies: LR, DTC, RFC, GBC**

In [49]:
print(f"Accuracy by Logistic Regression: {accuracy_score(test_y, pred_lr)*100:.2f}%")
print("------------------------------------------------")
print(f"Accuracy by Decision Tree Classifier: {accuracy_score(test_y, pred_dtc)*100:.2f}%")
print("------------------------------------------------")
print(f"Accuracy by Random Forest Classifier: {accuracy_score(test_y, pred_rfc)*100:.2f}%")
print("------------------------------------------------")
print(f"Accuracy by Gradient Boosting Classifier: {accuracy_score(test_y, pred_gbc)*100:.2f}%")

Accuracy by Logistic Regression: 98.73%
------------------------------------------------
Accuracy by Decision Tree Classifier: 99.63%
------------------------------------------------
Accuracy by Random Forest Classifier: 98.90%
------------------------------------------------
Accuracy by Gradient Boosting Classifier: 99.61%
