# Fake News

This project uses a dataset from Kaggle to build a predictive model for detecting fake news.

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Simplified text cleaning function without stopwords and stemming
def clean_text(text):
    """Normalize one raw document for bag-of-words vectorization.

    Steps, in order:

    1. Lower-case the text and turn every non-word character into a space
       (letters, digits and underscores are kept).
    2. Drop standalone single ASCII letters that are followed by whitespace,
       including a single letter at the very start of the text. This also
       removes the leading ``b`` left behind by a ``b'...'`` bytes repr.
    3. Collapse every run of whitespace into a single space.

    A single leading or trailing space may remain in the result; the text
    is not stripped.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Without this guard a missing value would raise on ``.lower()``.
        return ''

    text = re.sub(r'\W', ' ', text.lower())
    # The lookahead leaves the following whitespace unconsumed, so the next
    # single letter in a run such as "a b c" can still be matched.
    text = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', text)
    # Anchor with an unescaped ``^``; an escaped one would look for a literal
    # caret, which cannot exist after the non-word pass above.
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
# Read the news dataset into a DataFrame. The path is relative, so the CSV
# file has to be in the current working directory.
data = pd.read_csv(filepath_or_buffer='news.csv')

In [None]:
# Clean the text data.
# read_csv represents an empty cell as NaN (a float), which has no .lower()
# and would make clean_text raise. Replace missing articles with an empty
# string and coerce everything to str before cleaning.
data['text_clean'] = data['text'].fillna('').astype(str).apply(clean_text)

In [None]:
# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['text_clean'])
y = data['label']

In [None]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Initialize classifier: a multinomial Naive Bayes model, constructed with
# every constructor argument left at its default
classifier = MultinomialNB()

In [None]:
# Fit the model on the training features and their encoded labels
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict an encoded label for every row of the test features
y_pred = classifier.predict(X=X_test)

In [None]:
# Compare the predictions with the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Per-class report; target_names supplies the encoder's original label
# values in place of the encoded integers
classification_rep = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=label_encoder.classes_,
)

In [None]:
# Show the accuracy followed by the per-class report, one after the other
print(
    f'Accuracy: {accuracy}',
    f'Classification Report: \n{classification_rep}',
    sep='\n',
)

In [None]:
# Save the model, vectorizer and label encoder for later use.
# The label encoder is persisted as well: without it, the integer classes
# predicted by a reloaded model cannot be mapped back to the original labels.
# NOTE: these are pickle files; only load them back from a trusted location.
artifacts = {
    'fake_news_classifier.pkl': classifier,
    'tfidf_vectorizer.pkl': tfidf_vectorizer,
    'label_encoder.pkl': label_encoder,
}
for artifact_path, artifact in artifacts.items():
    pd.to_pickle(artifact, artifact_path)
