In [None]:
import pickle
import re
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
# Read each source file and tag every row with its class in one step:
# 0 marks fake news, 1 marks real news.
fake = pd.read_csv("Fake.csv").assign(label=0)
true = pd.read_csv("True.csv").assign(label=1)


In [None]:
# Stack both sources, keep only the model input and target columns, then
# shuffle the rows so the two classes are interleaved.
# The shuffle is seeded: without random_state the row order (and therefore
# the preview below) would differ on every Restart & Run All.
data = (
    pd.concat([fake, true], axis=0)[['text', 'label']]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

data.head()


Unnamed: 0,text,label
0,WASHINGTON (Reuters) - Former FBI Director Jam...,1
1,MADRID (Reuters) - Spain has made its largest ...,1
2,MOSCOW (Reuters) - The Kremlin said on Monday ...,1
3,MOSCOW (Reuters) - Russian President Vladimir ...,1
4,Another horrible story about a deranged killer...,0


In [None]:
def clean_text(text):
    """Normalise one raw article string before vectorisation.

    Steps, in order:
      1. lowercase;
      2. drop square-bracketed spans such as "[video]";
      3. drop URLs (http://, https:// or www. prefixed);
      4. drop HTML-like tags;
      5. replace every remaining non-word character (punctuation,
         whitespace, newlines) with a single space;
      6. drop underscores, the only punctuation step 5 leaves behind;
      7. drop any token that contains a digit.

    URLs and tags must be removed before step 5: once the generic
    non-word substitution has turned "://", "." and "<>" into spaces,
    the URL and tag patterns can no longer match and their fragments
    ("http", "com", "b", ...) would leak into the text as ordinary tokens.

    Runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example None or a
        NaN from a missing CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN when applied over a Series.
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Structured patterns first, while their delimiters still exist.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Everything that is not a letter, digit or underscore becomes a space;
    # this also covers newlines and all punctuation except "_".
    text = re.sub(r'\W', ' ', text)
    text = text.replace('_', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean every article body, then write the result back over the raw column.
cleaned_text = data['text'].apply(clean_text)
data['text'] = cleaned_text


In [None]:
# Features are the article bodies; targets are the 0/1 labels.
X, y = data['text'], data['label']

# Hold out a quarter of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)


In [None]:
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them, so no test text leaks in.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# fit() returns the estimator itself, so construct and train in one expression.
model = LogisticRegression().fit(X_train_tfidf, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model


In [None]:
# Score the classifier on the held-out split.
pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print("Accuracy:", accuracy)


Accuracy: 0.9842316258351893


In [None]:
# Persist the fitted classifier together with the vectorizer it was trained
# with; both are needed to score new text later.
# NOTE: pickle files execute code on load, so only unpickle these artifacts
# from a trusted location.
for artifact_path, artifact in (("model.pkl", model), ("vectorizer.pkl", vectorizer)):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)
