In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import Dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [None]:
# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')

In [None]:
# Load the dataset from the saved arrow file.
# The location can be overridden with the NLP_DATASET_ARROW environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original absolute path is used.
DATASET_ARROW_PATH = os.environ.get(
    'NLP_DATASET_ARROW',
    '/Users/icon1c/Documents/Semester 2/Natural Language Processing/NLP-Polimi-Project/Practice Models/Rishabh/dataset/train/data-00000-of-00001.arrow',
)
dataset = Dataset.from_file(DATASET_ARROW_PATH)

# Convert to Pandas DataFrame for analysis
df = dataset.to_pandas()

In [None]:
# Model inputs come from the 'input' column, targets from the 'output' column
X, y = df['input'], df['output']

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF over unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 features
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training texts only, then reuse the fitted vectorizer for the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Fit a baseline logistic regression and evaluate it on the testing data.
# The model is created and fitted in this cell so the cell runs on a fresh
# kernel (Restart & Run All) instead of depending on leftover kernel state.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Hyperparameter search spaces, keyed by the model name used in the
# training loop; each value is a grid accepted by GridSearchCV
param_grids = dict(
    logistic_regression=dict(
        C=[0.1, 1, 10],
        solver=['liblinear', 'saga'],
        penalty=['l1', 'l2'],
    ),
    svm=dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf'],
        gamma=['scale', 'auto'],
    ),
    random_forest=dict(
        n_estimators=[10, 50, 100],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5, 10],
    ),
)

In [None]:
# Train and evaluate models
# Each grid in `param_grids` is tuned with 5-fold cross-validation on the
# training features (macro-F1 as the selection metric) and the refitted best
# estimator is then scored on the held-out test features.

# Seed for the estimators that use randomness (random forest, and logistic
# regression with the liblinear/saga solvers) so reruns give the same numbers.
RANDOM_STATE = 42

# Base estimator for every grid name; GridSearchCV clones the estimator for
# each fit, so these instances are never fitted themselves.
estimators = {
    'logistic_regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'svm': SVC(),
    'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
}

results = {}
for model_name, param_grid in param_grids.items():
    # Fail loudly on a grid without a matching estimator instead of silently
    # reusing the estimator from the previous iteration.
    if model_name not in estimators:
        raise KeyError(f"No estimator registered for parameter grid {model_name!r}")
    model = estimators[model_name]

    clf = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    clf.fit(X_train_tfidf, y_train)

    # predict() uses the best estimator refitted on the full training set
    y_pred = clf.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    results[model_name] = {'accuracy': accuracy, 'f1': f1, 'best_params': clf.best_params_, 'y_pred': y_pred}

In [None]:
# Print results
# For every tuned model: headline metrics, the per-class report, and a
# confusion-matrix heatmap whose axes carry the actual class labels.
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {result['accuracy']:.2f}")
    print(f"F1-score: {result['f1']:.2f}")
    print(f"Best hyperparameters: {result['best_params']}")
    print()

    # Detailed evaluation
    print("Classification Report:")
    print(classification_report(y_test, result['y_pred']))

    # Confusion Matrix
    # Fix the class order explicitly so the matrix rows/columns and the tick
    # labels are guaranteed to refer to the same classes.
    class_labels = sorted(set(y_test) | set(result['y_pred']))
    conf_matrix = confusion_matrix(y_test, result['y_pred'], labels=class_labels)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)