In [1]:
# 1. Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# 2. Load the Datasets
# Kaggle dataset, read from Data/Data.csv
kaggle_df = pd.read_csv('Data/Data.csv')

# ISOT dataset: one file of true articles (Data/True.csv) and one of fake
# articles (Data/Fake.csv). The class is attached here at load time:
# 1 = true news, 0 = fake news.
# NOTE(review): this assumes the Kaggle 'label' column follows the same
# 1 = true / 0 = fake convention -- confirm before the sources are merged.
true_df = pd.read_csv('Data/True.csv').assign(label=1)
fake_df = pd.read_csv('Data/Fake.csv').assign(label=0)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/train.csv'

In [None]:
# 3. Combine the Datasets
# Stack the true and fake ISOT articles into a single frame
isot_df = pd.concat([true_df, fake_df], ignore_index=True)

# Optional sanity check: size of each source before merging
print("Kaggle dataset shape:", kaggle_df.shape)
print("ISOT dataset shape:", isot_df.shape)

# Merge both sources into one DataFrame, shuffle every row with a fixed seed
# so the sources are interleaved reproducibly, then renumber the index.
df = (
    pd.concat([kaggle_df, isot_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Combined dataset shape:", df.shape)

In [None]:
# 4. Handle Missing Values
# 'text' is the field the classifier depends on, so rows without it are discarded
df = df.dropna(subset=['text'])

# A missing 'title' is replaced by a fixed placeholder string
df = df.assign(title=df['title'].fillna("No Title Provided"))

# 'author' is optional: only when the column is present are its gaps set to "Unknown"
if 'author' in df.columns:
    df = df.assign(author=df['author'].fillna("Unknown"))

In [None]:
# 5. Create the 'content' Column
# 'content' is the combined text field: 'title' and 'text' joined by one space.
if 'content' not in df.columns:
    df['content'] = df['title'] + " " + df['text']
elif df['content'].isna().any():
    # The column already exists but is only partly populated (for example when
    # just one of the concatenated sources provided it). Fill only the gaps so
    # existing values are kept and no NaN document is left in the column.
    df['content'] = df['content'].fillna(df['title'] + " " + df['text'])

In [None]:
# 6. Split into Training and Testing Sets
# Features are the combined text; the target is the 0/1 label column
X = df['content']
y = df['label']

# Hold out 20% of the rows for testing. The seed is fixed for reproducibility,
# and stratify=y keeps the class proportions the same in the train and test
# splits so the reported test metrics are not skewed by an uneven draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 7. Vectorize Text using TF-IDF
# English stop words are removed and the vocabulary is capped at 5000 terms
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# 8. Train the SVM Model
# Linear-kernel support vector classifier, C=1, fixed seed
# NOTE(review): scikit-learn's SVC documentation warns that fit time grows at
# least quadratically with the number of samples and suggests LinearSVC for
# large datasets -- consider it if this cell is too slow on the combined data.
svm_model = SVC(
    C=1,
    kernel='linear',
    random_state=42,
)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# 9. Make Predictions and Print Accuracy

# Training split: predict on the data the model was fitted on and score it
y_train_pred = svm_model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Train Accuracy:", train_accuracy)

# Test split: predict on the held-out data and score it
y_test_pred = svm_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test Accuracy:", test_accuracy)

# Per-class breakdown for the held-out split
print(
    "Test Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_test_pred),
)