In [47]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [49]:
data_fake = pd.read_csv(r"C:\Users\monil\Desktop\Graduate Project\resources\datasets\Fake.csv")
data_true = pd.read_csv(r"C:\Users\monil\Desktop\Graduate Project\resources\datasets\True.csv")              

In [50]:
data_fake['label'] = 1  # 1 for fake news
data_true['label'] = 0  # 0 for real news

# Remove 'subject' and 'date' columns
data_fake = data_fake.drop(columns=['title','subject', 'date'])
data_true = data_true.drop(columns=['title','subject', 'date'])

In [51]:
data_fake.shape, data_true.shape

((23481, 2), (21417, 2))

In [55]:
data_fake.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [57]:
data_true.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [59]:
# Merge the datasets
data = pd.concat([data_fake, data_true], ignore_index=True)

# Shuffle the data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [61]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [65]:
data.head()

Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",1
1,WASHINGTON (Reuters) - U.S. President Donald T...,0
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,0
3,"On Monday, Donald Trump once again embarrassed...",1
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",0


In [67]:
def wordopt(text): 
    text = text.lower() 
    text = re.sub(r'\[.*?\]','', text) 
    text = re.sub(r"\\W", "", text) 
    text = re.sub(r'https?://\S+/www\.\S+',"", text) 
    text = re.sub(r"<.*?>+","", text) 
    text = re.sub(r'[%s]' % re.escape(string.punctuation), "", text) 
    text = re.sub(r'\n',"", text) 
    text = re.sub(r'\w*\d\w*',"", text) 
    return text 


In [69]:
data['text'] = data['text'].apply(wordopt)

In [73]:
x = data['text']
y = data['label']

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25) 

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorization = TfidfVectorizer() 
xv_train = vectorization.fit_transform(x_train) 
xv_test = vectorization.transform(x_test)

In [90]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression() 
LR.fit(xv_train, y_train)

In [91]:
pred_lr = LR.predict(xv_test) 

In [92]:
LR.score(xv_test, y_test) 

0.9862806236080178

In [93]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5392
           1       0.99      0.98      0.99      5833

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [101]:
import joblib

# Save the trained model and vectorizer
joblib.dump(LR, 'fake_news_model.pkl')
joblib.dump(vectorization, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!
