In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# Load data: read the fake-news CSV and the true-news CSV (in that order)
# into two separate DataFrames.
fake_news, true_news = (
    pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv')
)

In [5]:
# Step 2: Preprocessing
# Tag every row with its class in a new 'label' column:
# rows from the fake-news frame get 0, rows from the true-news frame get 1.
for frame, class_id in ((fake_news, 0), (true_news, 1)):
    frame['label'] = class_id

In [6]:
# Combine the two datasets (rows of fake_news followed by rows of true_news).
# ignore_index=True rebuilds a fresh 0..n-1 index; without it both frames keep
# their own 0-based labels, so the combined frame would contain duplicate
# index values and label-based lookups (.loc) would be ambiguous.
news_data = pd.concat([fake_news, true_news], axis=0, ignore_index=True)

In [7]:
# Keep only the columns used downstream (drops e.g. subject, date).
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
news_data = news_data[['title', 'text', 'label']].copy()

In [8]:
# Combine title and text columns into a single feature (optional).
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN, and a NaN document makes TfidfVectorizer raise later on.
# .assign returns a new frame, so this cell is safe to re-run and does not
# write into a possible slice of another DataFrame.
news_data = news_data.assign(
    combined_text=news_data['title'].fillna('') + " " + news_data['text'].fillna('')
)

In [9]:
# Step 3: Text Preprocessing and Vectorization
# Use TfidfVectorizer to convert text to numerical features
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X = tfidf.fit_transform(news_data['combined_text'])

In [10]:
# Labels
y = news_data['label']

In [11]:
# Step 4: Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Step 5: Model Training and Evaluation

# Logistic Regression with default hyper-parameters: fit on the training
# split (fit() returns the estimator itself), then predict the test split.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

In [13]:
# Random Forest Classifier: 100 trees with a fixed seed, fitted on the
# training split (fit() returns the estimator itself), then used to predict
# the test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [17]:
# Function to print accuracy, confusion matrix, and classification report
def evaluate_model(model_name, y_test, y_pred):
    """Print a plain-text evaluation summary for one set of predictions.

    Parameters
    ----------
    model_name : str
        Label shown in the report header.
    y_test : array-like
        Reference labels, passed as the first argument to each metric.
    y_pred : array-like
        Predicted labels, passed as the second argument to each metric.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    print(f"Evaluation for {model_name}")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}")

    report = classification_report(y_test, y_pred)
    print(f"Classification Report:\n{report}")

    # Visual separator between consecutive model reports
    print("-" * 60)

In [18]:
# Evaluate Logistic Regression: report test-split metrics for its predictions
evaluate_model(
    model_name="Logistic Regression",
    y_test=y_test,
    y_pred=y_pred_logreg,
)

Evaluation for Logistic Regression
Accuracy: 0.9858574610244989
Confusion Matrix:
[[4663   70]
 [  57 4190]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.98      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

------------------------------------------------------------


In [19]:
# Evaluate Random Forest: report test-split metrics for its predictions
evaluate_model(
    model_name="Random Forest",
    y_test=y_test,
    y_pred=y_pred_rf,
)

Evaluation for Random Forest
Accuracy: 0.9907572383073496
Confusion Matrix:
[[4689   44]
 [  39 4208]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

------------------------------------------------------------
