In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [8]:
import pandas as pd
import re

# Define paths to the uploaded datasets
true_data_path = 'True.csv'
fake_data_path = 'Fake.csv'

# Load the datasets into pandas DataFrames
true_df = pd.read_csv(true_data_path)
fake_df = pd.read_csv(fake_data_path)

# Check the first few rows of each dataset to understand their structure
true_df.head(), fake_df.head()

# Clean text function (remove punctuation, special characters, etc.)
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()

# Apply cleaning to both datasets (assuming the 'text' column holds the news content)
true_df['cleaned'] = true_df['text'].apply(clean_text)  # Clean the 'text' column for Real news
fake_df['cleaned'] = fake_df['text'].apply(clean_text)  # Clean the 'text' column for Fake news

# Add a 'label' column for Fake and True datasets
true_df['label'] = 0  # 0 for Real news
fake_df['label'] = 1  # 1 for Fake news

# Combine both datasets into one
data = pd.concat([true_df[['cleaned', 'label']], fake_df[['cleaned', 'label']]], axis=0)

# Check the first few rows of the combined dataset
data.head()


Unnamed: 0,cleaned,label
0,washington reuters the head of a conservative...,0
1,washington reuters transgender people will be...,0
2,washington reuters the special counsel invest...,0
3,washington reuters trump campaign adviser geo...,0
4,seattlewashington reuters president donald tr...,0


In [9]:

# Clean text function (remove punctuation, special characters, etc.)
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()

# Apply cleaning to both datasets
true_df['cleaned'] = true_df['text'].apply(clean_text)  # Assuming 'text' column for news articles
fake_df['cleaned'] = fake_df['text'].apply(clean_text)  # Same assumption

# Add a 'label' column for Fake and True datasets
true_df['label'] = 0  # 0 for Real news
fake_df['label'] = 1  # 1 for Fake news

# Combine both datasets into one
data = pd.concat([true_df[['cleaned', 'label']], fake_df[['cleaned', 'label']]], axis=0)

# Check the first few rows of the combined dataset
data.head()


Unnamed: 0,cleaned,label
0,washington reuters the head of a conservative...,0
1,washington reuters transgender people will be...,0
2,washington reuters the special counsel invest...,0
3,washington reuters trump campaign adviser geo...,0
4,seattlewashington reuters president donald tr...,0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the cleaned text
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned'])
y = data['label']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
lr = LogisticRegression()
nb = MultinomialNB()

rf.fit(X_train, y_train)
dt.fit(X_train, y_train)
lr.fit(X_train, y_train)
nb.fit(X_train, y_train)

# Evaluate the models
from sklearn.metrics import accuracy_score

rf_pred = rf.predict(X_test)
dt_pred = dt.predict(X_test)
lr_pred = lr.predict(X_test)
nb_pred = nb.predict(X_test)

# Calculate accuracy
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred)}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, dt_pred)}")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, lr_pred)}")
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, nb_pred)}")


Random Forest Accuracy: 0.9974811083123426
Decision Tree Accuracy: 0.9949622166246851
Logistic Regression Accuracy: 0.9848866498740554
Naive Bayes Accuracy: 0.9546599496221663


In [11]:

# Majority voting function
def majority_voting(*predictions):
    return np.sign(np.sum(predictions, axis=0))

# Combine the predictions using Majority Voting
predictions = np.array([rf_pred, dt_pred, lr_pred, nb_pred])
final_predictions = majority_voting(*predictions)

# Evaluate the final predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, final_predictions)
print(f'Accuracy of Majority Voting: {accuracy}')

# Create a summary table of individual model predictions
model_predictions = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree', 'Logistic Regression', 'Naive Bayes'],
    'Prediction': [rf_pred[0], dt_pred[0], lr_pred[0], nb_pred[0]],
})

# Show the predictions table along with the majority vote result
print(f"Result: {'FAKE' if final_predictions[0] == 1 else 'REAL'}\n")
print("Individual Model Predictions:")
print(model_predictions)


Accuracy of Majority Voting: 0.9659949622166247
Result: FAKE

Individual Model Predictions:
                 Model  Prediction
0        Random Forest           1
1        Decision Tree           1
2  Logistic Regression           1
3          Naive Bayes           1


In [12]:
import pickle

# Save models to a file
with open('models/rf_model.pkl', 'wb') as f:
    pickle.dump(rf, f)
with open('models/dt_model.pkl', 'wb') as f:
    pickle.dump(dt, f)
with open('models/lr_model.pkl', 'wb') as f:
    pickle.dump(lr, f)
with open('models/nb_model.pkl', 'wb') as f:
    pickle.dump(nb, f)

# Save the vectorizer
with open('models/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
