In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed data
df_fake = pd.read_csv("preprocessed_Fake.csv")
df_true = pd.read_csv("preprocessed_True.csv")

# Label the data
df_fake['label'] = 0  # fake
df_true['label'] = 1  # true

# Combine the datasets
df = pd.concat([df_fake, df_true], ignore_index=True)

# One text field per article: title followed by body, missing values treated as empty.
df['combined'] = df['processed_title'].fillna('') + ' ' + df['processed_text'].fillna('')

# Fit TF-IDF on the training rows only. Fitting on the full corpus would let the
# vocabulary selection and IDF weights be learned from the held-out rows that the
# classifier cells later evaluate on (data leakage).
#
# The classifier cells split (X, y) with test_size=0.2 and random_state=42. Without
# `stratify`, train_test_split picks rows from a permutation that depends only on the
# number of samples and the seed, so splitting the row positions with the same
# settings identifies exactly the rows that end up in their training sets.
# Keep TEST_SIZE / SPLIT_SEED in sync with those cells.
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2
SPLIT_SEED = 42
train_rows, _ = train_test_split(
    df.index.to_numpy(),  # 0..n-1 because of ignore_index=True above
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Learn vocabulary / IDF from the training rows, then encode every row with it
vectorizer.fit(df['combined'].iloc[train_rows])
X = vectorizer.transform(df['combined'])  # Feature matrix (all rows, same order as df)
y = df['label']

print("TF-IDF matrix shape:", X.shape)
print("Sample features:", vectorizer.get_feature_names_out()[:20])


TF-IDF matrix shape: (44898, 5000)
Sample features: ['00' '000' '10' '100' '11' '12' '120' '13' '14' '15' '150' '16' '17' '18'
 '19' '1980' '1980s' '1990s' '1991' '1992']


In [None]:
# Classifier 1: Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier with default settings
model = LogisticRegression()
model = model.fit(X_train, y_train)  # fit() returns the estimator itself

# Score the held-out rows and print per-class precision / recall / F1
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
# Classifier 2 - Random Forest classifier 
from sklearn.ensemble import RandomForestClassifier

# Same 80/20 split as the other classifier cells (identical seed), repeated here
# so this cell can run on its own once X and y exist.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest: 100 trees, fixed seed, trained on all available CPU cores
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model = model.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict the held-out rows and print the per-class metrics
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4733
           1       1.00      1.00      1.00      4247

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [None]:
# Classifier 3 - Linear SVM Classifier 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Same 80/20 split and seed as the other classifier cells
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sparse feature matrices (anything exposing `tocsr`) are scaled without mean
# centering; dense input gets the default StandardScaler.
if hasattr(X, "tocsr"):
    scaler = StandardScaler(with_mean=False)
else:
    scaler = StandardScaler()

# Linear SVM on the scaled features
svm = LinearSVC(dual="auto", C=1.0, tol=1e-3, max_iter=5000, random_state=42)
clf = make_pipeline(scaler, svm)
clf.fit(X_train, y_train)

# Predict the held-out rows and print the per-class metrics
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

