In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# 0. Shared Preprocessing Helper
# ----------------------------
def _prepare_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with an added 'content' column.

    Rows whose 'text' is missing are dropped, missing 'title' / 'author'
    values are replaced with placeholders, and 'content' is built as
    ``title + " " + text``. The input frame is not modified.
    """
    # Explicit copy: assigning columns on the direct result of dropna() can
    # raise SettingWithCopyWarning on pandas versions without copy-on-write.
    prepared = df.dropna(subset=['text']).copy()

    # Placeholders keep the string concatenation below from producing NaN.
    prepared['title'] = prepared['title'].fillna("No Title Provided")
    # 'author' is not used by the features built in this section; it is
    # filled only so the cleaned frame carries no missing authors.
    prepared['author'] = prepared['author'].fillna("Unknown")

    # Combine 'title' and 'text' for a richer text representation.
    prepared['content'] = prepared['title'] + " " + prepared['text']
    return prepared


# ----------------------------
# 1. Load the Data
# ----------------------------
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')

# ----------------------------
# 2. Handle Missing Values / 3. Combine Text Columns
# ----------------------------
# The same cleaning is applied to both splits so they cannot drift apart.
# NOTE: test rows with missing 'text' are dropped as well, so the
# predictions below cover only the retained test rows.
train_df = _prepare_frame(train_df)
test_df = _prepare_frame(test_df)

# ----------------------------
# 4. Define Features and Target
# ----------------------------
X_train = train_df['content']
y_train = train_df['label']

X_test = test_df['content']
# If test.csv has labels, extract them; otherwise, y_test will be None.
y_test = test_df['label'] if 'label' in test_df.columns else None

# ----------------------------
# 5. Vectorize Text using TF-IDF
# ----------------------------
# The vocabulary is fitted on the training text only and then reused to
# transform the test text, so no test-set statistics leak into the model.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ----------------------------
# 6. Train the SVM Model
# ----------------------------
svm_model = SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# ----------------------------
# 7. Make Predictions and Evaluate
# ----------------------------
y_pred = svm_model.predict(X_test_tfidf)

if y_test is not None:
    # Labels are available: report accuracy and per-class metrics.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print("SVM Accuracy:", accuracy)
    print("SVM Classification Report:\n", report)
else:
    # No labels in the test file: just show the raw predictions.
    print("Predictions on test set:")
    print(y_pred)