In [4]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Text vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 📌 Step 2: Load Preprocessed Data
df = pd.read_csv('../Data/preprocessed_data.csv')
print("✅ Preprocessed Data Loaded. Shape:", df.shape)

# 📌 Step 3: Split Features and Target
X = df.drop('Appeal_Category', axis=1)
y = df['Appeal_Category']

# 📌 Step 4: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 📌 Step 5: Define Numerical & Text Columns
text_column = 'Appeal_Text'
numerical_cols = [col for col in X.columns if col != text_column]

# 📌 Step 6: Define Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=300), text_column),
        ('num', 'passthrough', numerical_cols)
    ]
)

# 📌 Step 7: Define Models with Pipelines
models = {
    'Logistic Regression': Pipeline([
        ('preprocess', preprocessor),
        ('model', LogisticRegression(max_iter=1000))
    ]),
    'SVM': Pipeline([
        ('preprocess', preprocessor),
        ('model', SVC())
    ]),
    'Random Forest': Pipeline([
        ('preprocess', preprocessor),
        ('model', RandomForestClassifier())
    ]),
    'AdaBoost': Pipeline([
        ('preprocess', preprocessor),
        ('model', AdaBoostClassifier())
    ]),
    'Gradient Boosting': Pipeline([
        ('preprocess', preprocessor),
        ('model', GradientBoostingClassifier())
    ])
}

results = {}

# 📌 Step 8: Train and Evaluate
for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"\n🔹 {name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))

# 📌 Step 9: Display Best Model
best_model_name = max(results, key=results.get)
print(f"\n✅ Best Model Based on Accuracy: {best_model_name} ({results[best_model_name]:.4f})")


✅ Preprocessed Data Loaded. Shape: (2500, 7)

🔹 Logistic Regression Accuracy: 0.1820
              precision    recall  f1-score   support

           0       0.17      0.56      0.27        89
           1       0.19      0.40      0.26        98
           2       0.00      0.00      0.00       106
           3       0.00      0.00      0.00       107
           4       0.15      0.02      0.04       100

    accuracy                           0.18       500
   macro avg       0.10      0.20      0.11       500
weighted avg       0.10      0.18      0.11       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



🔹 SVM Accuracy: 0.1780
              precision    recall  f1-score   support

           0       0.18      1.00      0.30        89
           1       0.00      0.00      0.00        98
           2       0.00      0.00      0.00       106
           3       0.00      0.00      0.00       107
           4       0.00      0.00      0.00       100

    accuracy                           0.18       500
   macro avg       0.04      0.20      0.06       500
weighted avg       0.03      0.18      0.05       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



🔹 Random Forest Accuracy: 0.2060
              precision    recall  f1-score   support

           0       0.19      0.25      0.22        89
           1       0.17      0.18      0.18        98
           2       0.23      0.18      0.20       106
           3       0.25      0.26      0.26       107
           4       0.18      0.16      0.17       100

    accuracy                           0.21       500
   macro avg       0.21      0.21      0.20       500
weighted avg       0.21      0.21      0.21       500


🔹 AdaBoost Accuracy: 0.1720
              precision    recall  f1-score   support

           0       0.18      0.90      0.29        89
           1       0.10      0.04      0.06        98
           2       0.00      0.00      0.00       106
           3       0.00      0.00      0.00       107
           4       0.33      0.02      0.04       100

    accuracy                           0.17       500
   macro avg       0.12      0.19      0.08       500
weighted avg  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



🔹 Gradient Boosting Accuracy: 0.1680
              precision    recall  f1-score   support

           0       0.12      0.13      0.12        89
           1       0.10      0.10      0.10        98
           2       0.25      0.21      0.23       106
           3       0.17      0.18      0.18       107
           4       0.21      0.21      0.21       100

    accuracy                           0.17       500
   macro avg       0.17      0.17      0.17       500
weighted avg       0.17      0.17      0.17       500


✅ Best Model Based on Accuracy: Random Forest (0.2060)
