In [1]:
# 1. Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# 2. Load the Datasets
# Load the Kaggle dataset (Data/train.csv)
kaggle_df = pd.read_csv('Data/Data.csv')

# Load the ISOT datasets (Data/True.csv and Data/Fake.csv)
true_df = pd.read_csv('Data/True.csv')
fake_df = pd.read_csv('Data/Fake.csv')

# For ISOT, assign labels
true_df['label'] = 1
fake_df['label'] = 0

In [None]:
# 3. Combine the Datasets
# Combine the two ISOT datasets
isot_df = pd.concat([true_df, fake_df], ignore_index=True)

print("Kaggle dataset shape:", kaggle_df.shape)
print("ISOT dataset shape:", isot_df.shape)

# Combine the Kaggle and ISOT datasets into one DataFrame
df = pd.concat([kaggle_df, isot_df], ignore_index=True)

# Shuffle the combined dataset for a better mix
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print("Combined dataset shape:", df.shape)

Kaggle dataset shape: (20800, 5)
ISOT dataset shape: (44898, 5)
Combined dataset shape: (65698, 7)


In [None]:
# 4. Handle Missing Values
# Drop rows where 'text' is missing
df = df.dropna(subset=['text'])

# Fill missing 'title' values with a placeholder
df['title'] = df['title'].fillna("No Title Provided")

if 'author' in df.columns:
    df['author'] = df['author'].fillna("Unknown")

In [5]:
# 5. Create the 'content' Column
# Create a combined text field by concatenating 'title' and 'text'
if 'content' not in df.columns:
    df['content'] = df['title'] + " " + df['text']

In [None]:
# 6. Split into Training and Testing Sets
# Define features and target variable
X = df['content']
y = df['label']

# Split the combined dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# 7. Vectorize Text using TF-IDF
# Initialize and fit the TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [8]:
# 8. Train the SVM Model
# Initialize and train the SVM classifier with a linear kernel
svm_model = SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [9]:
# Compute test predictions
y_test_pred = svm_model.predict(X_test_tfidf)

# Calculate the metrics
acc = accuracy_score(y_test, y_test_pred)
prec = precision_score(y_test, y_test_pred, average='weighted')
rec = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

# Print the metrics
print("Test Accuracy:", acc)
print("Test Precision:", prec)
print("Test Recall:", rec)
print("Test F1-Score:", f1)

Test Accuracy: 0.9430399025281755
Test Precision: 0.943230204127446
Test Recall: 0.9430399025281755
Test F1-Score: 0.9430083011111855
