In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
# The CSV path is relative to the notebook's working directory; adjust it as
# needed. Later statements in this cell read the 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer="first_10000_rows.csv")

# Basic text preprocessing
def clean_text(text):
    """Normalize a raw text string before vectorization.

    The string is lowercased first so that the URL pattern also catches
    upper- or mixed-case links (e.g. ``HTTP://...``), which would otherwise
    survive and be squashed into a junk token by the punctuation step.
    URLs, @mentions, #hashtags and punctuation are then removed, and the
    whitespace left behind by those removals is collapsed.

    Args:
        text: The raw string to clean. Must be a ``str``.

    Returns:
        The cleaned, lowercase string with single spaces between tokens and
        no leading or trailing whitespace. May be the empty string.
    """
    text = text.lower()
    # Remove URLs: anything starting with "http" (covers https) or "www."
    text = re.sub(r"(?:http|www\.)\S+", "", text)
    text = re.sub(r"@\w+", "", text)     # Remove mentions
    text = re.sub(r"#\w+", "", text)     # Remove hashtags (marker and tag word)
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    # split()/join collapses whitespace runs and trims both ends in one step.
    return " ".join(text.split())

# Drop rows with a missing text or label BEFORE the string conversion below.
# Once astype(str) has run, a missing text is typically the literal string
# "nan" rather than a null, so a dropna on 'clean_text' afterwards would
# never remove it and "nan" would be fed to the model as a real document.
df = df.dropna(subset=['text', 'label'])

# Clean every remaining text. assign() builds the new column on a fresh frame
# rather than writing into the filtered result.
df = df.assign(clean_text=df['text'].astype(str).apply(clean_text))

# Train-test split
# 20% of the rows are held out for testing. Stratifying on the label column
# and fixing random_state keeps the split class-balanced and repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# TF-IDF vectorization
# max_features=5000 caps the vocabulary size. The vectorizer is fitted on the
# training texts only; the test texts are transformed with that already-fitted
# vocabulary, so nothing is learned from the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Evaluate multiple SVM kernels
# Each kernel is trained on the same TF-IDF training matrix and scored on the
# same test matrix, with all other SVC settings left at their defaults.
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
results = {}  # kernel name -> accuracy on the test split

for kernel in kernels:
    print(f"\nTraining with SVM kernel: {kernel}")

    # fit() returns the fitted estimator, so construction and training chain.
    model = SVC(kernel=kernel).fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    results[kernel] = acc

# Display best-performing kernel
# NOTE(review): the winner is picked using test-split accuracy, so the
# reported figure is an optimistic estimate for the chosen kernel.
best_kernel = max(results, key=lambda name: results[name])
print(f"\nBest Kernel: {best_kernel} — Accuracy: {results[best_kernel]:.4f}")



Training with SVM kernel: linear
Accuracy: 0.8655
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       587
           1       0.84      0.94      0.89       679
           2       0.87      0.58      0.69       158
           3       0.90      0.84      0.87       273
           4       0.81      0.82      0.82       230
           5       0.89      0.58      0.70        73

    accuracy                           0.87      2000
   macro avg       0.87      0.78      0.81      2000
weighted avg       0.87      0.87      0.86      2000


Training with SVM kernel: rbf
Accuracy: 0.7950
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       587
           1       0.72      0.96      0.82       679
           2       0.91      0.39      0.55       158
           3       0.92      0.67      0.78       273
           4       0.85      0.59      0.70       230
           5       0.95      0.26