### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string



In [2]:
# Create a Kaggle API client
import os
os.environ['KAGGLE_USERNAME'] = 'mustabshiribnamin'
os.environ['KAGGLE_KEY'] = '0eff4183442e547cfe97d8be3e922d42' 
import kaggle as kg
kg.api.dataset_download_files(dataset = "bhavikjikadara/fake-news-detection", path='dataset', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/bhavikjikadara/fake-news-detection


OSError: [Errno 22] Invalid argument: 'dataset\\fake.csv'

### View Data

In [3]:
true_df = pd.read_csv('dataset/true.csv')
fake_df = pd.read_csv('dataset/fake.csv')

In [4]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
true_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [6]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [7]:
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


### Data Preprocessing

In [8]:
true_df.duplicated().sum()

206

In [9]:
true_df.drop_duplicates(inplace=True)

In [10]:
fake_df.duplicated().sum()

3

In [11]:
fake_df.drop_duplicates(inplace=True)

In [12]:
true_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [13]:
fake_df.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [14]:
# labeling the true and false state
true_df['label'] = 0
fake_df['label'] = 1

In [15]:
# concating true and fake data
df = pd.concat([fake_df, true_df], ignore_index=True)
# shuffling the rows of the DataFrame and resetting the index
df = df.sample(frac=1).reset_index(drop=True)

In [16]:
#replace underscores and making title format
df.columns = df.columns.str.replace('_', ' ').str.title()

In [17]:
df.head()

Unnamed: 0,Title,Text,Subject,Date,Label
0,Anti-Semitism Is Rapidly Rising And It’s All ...,When Hillary Clinton called half of Trump supp...,News,"October 19, 2016",1
1,RT EXCLUSIVE: Peter Lavelle interviews Dr. Ron...,"21st Century Wire says In 2008 and 2012, Ron P...",US_News,"November 12, 2016",1
2,U.N. urges Sudan to improve plight of Darfur's...,GENEVA (Reuters) - The United Nations urged Su...,worldnews,"November 21, 2017",0
3,Pennsylvania Church Demonstrates How To Deal ...,While many churches defend pastors who have ra...,News,"October 13, 2016",1
4,Trump vows to cut plane costs after meeting de...,WASHINGTON (Reuters) - President-elect Donald ...,politicsNews,"December 21, 2016",0


In [18]:
# Clean text
stopword = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean(text):
    text = str(text).lower()
    # remove text within square brackets
    text = re.sub('\[.*?\]', '', text) 
    # remove http links
    text = re.sub('https?://\S+|www\.\S+', '', text)
    # remove html tags
    text = re.sub('<.*?>+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove newline chars
    text = re.sub('\n', '', text)
    # remove all word containing numbers
    text = re.sub('\w*\d\w*', '', text)
    # remove stopwords
    text = [word for word in text.split(' ') if word not in stopword]
    # applies stemming to words
    text = [stemmer.stem(word) for word in text]
    text = " ".join(text)
    return text

df['Text'] = df['Text'].apply(lambda x: clean(x))
df['Title'] = df['Title'].apply(lambda x: clean(x))

In [19]:
df.head()

Unnamed: 0,Title,Text,Subject,Date,Label
0,antisemit rapidli rise it’ thank trump support,hillari clinton call half trump support deplo...,News,"October 19, 2016",1
1,rt exclus peter lavel interview dr ron paul tr...,centuri wire say ron paul creat libertarian...,US_News,"November 12, 2016",1
2,un urg sudan improv plight darfur displac peopl,geneva reuter unit nation urg sudan end viole...,worldnews,"November 21, 2017",0
3,pennsylvania church demonstr deal childmolest...,mani church defend pastor rape molest children...,News,"October 13, 2016",1
4,trump vow cut plane cost meet defens execut,washington reuter presidentelect donald trump...,politicsNews,"December 21, 2016",0


### Model Training

In [20]:

# For learning setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

# Define TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the text data
X = vectorizer.fit_transform(df['Text'])

# Define target variable
y = df['Label']

# Save the TF-IDF vectorizer
with open('Models/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)


In [21]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)

lr.score(X_test, y_test)

0.9892593421347058

In [25]:
# Save Logistic Regression model
with open('Models/logistic_regression.pkl', 'wb') as file:
    pickle.dump(lr, file)

In [26]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

nb_model.score(X_test,y_test)

0.9541284403669725

In [27]:
# Save Naive Bayes model
with open('Models/naive_bayes.pkl', 'wb') as file:
    pickle.dump(nb_model, file)

In [28]:
from sklearn import svm

linear_svm = svm.SVC(kernel='linear')
linear_svm.fit(X_train, y_train)

linear_svm.score(X_test, y_test)

KeyboardInterrupt: 

In [None]:
# Save Linear SVM model
with open('Models/linear_svm.pkl', 'wb') as file:
    pickle.dump(linear_svm, file)

In [27]:
from sklearn import svm

poly_svm = svm.SVC(kernel='poly')
poly_svm.fit(X_train, y_train)

poly_svm.score(X_test, y_test)

0.9539046766614455

In [30]:
from sklearn import svm

sigmoid_svm = svm.SVC(kernel='sigmoid')
sigmoid_svm.fit(X_train, y_train)

sigmoid_svm.score(X_test, y_test)

0.9955247258894607

In [28]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

rfc.score(X_test,y_test)

0.9913850973372119

In [None]:
# Save Linear SVM model
with open('Models/random_forest.pkl', 'wb') as file:
    pickle.dump(rfc, file)

In [31]:
# import joblib

# # Save the model
# joblib.dump(rfc, 'random_forest.pkl')
# joblib.dump(nb_model, 'naive_bayes.pkl')
# joblib.dump(lr, 'logistic_regression.pkl')
# joblib.dump(linear_svm, 'linear_svm.pkl')

['linear_svm.pkl']