In [2]:
import pandas as pd
train_df = pd.read_csv('train_processed.csv')

In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Used to pass into N-gram and Vector Space
X = train_df['Text_Processed'].tolist()
y = [score - 1 for score in train_df['Score'].tolist()]

# Splitting the dataset to 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)

# You can also limit the subset of the test data if needed
X_test_subset, _, y_test_subset, _ = train_test_split(X_test, y_test, train_size=0.1, random_state=42)

classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Naïve Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier()
}

param_grids = {
    "Logistic Regression": {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear']
    },
    "Decision Tree": {
        'classifier__max_depth': [5, 10, 20],  # Depth of the tree
        'classifier__min_samples_split': [2, 5, 10],  # Minimum samples to split an internal node
        'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required at each leaf node
        'classifier__criterion': ['gini', 'entropy']  # The function to measure quality of a split
    },
    "Naïve Bayes": {
        'classifier__alpha': [0.1, 1.0, 10.0],  # for Laplace smoothing
        'classifier__fit_prior': [True, False]  # whether to learn class prior probabilities
    },
}

results = []

In [4]:
vector_space_models = {
    "Binary Representation": CountVectorizer(binary=True),
    "Frequency Count": CountVectorizer(max_features=10000, max_df=.15),
    "TF-IDF": TfidfVectorizer(max_features=10000, max_df=.15)
}

In [8]:
for clf_name, clf in classifiers.items():
    print(f"\n=== {clf_name} ===")

    for ngram_name, vectorizer in vector_space_models.items():
        print(f"-- {ngram_name} --")

        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', clf)
        ])

        param_grid = param_grids[clf_name]

        if param_grid:
            # Perform RandomizedSearchCV
            search = RandomizedSearchCV(
                pipeline,
                param_distributions=param_grid,
                n_iter=5,
                cv=3,
                scoring='accuracy',
                n_jobs=-1,
                verbose=0
            )
            search.fit(X_train_subset, y_train_subset)
            best_model = search.best_estimator_

            # Print best hyperparameters
            print(f"Best Hyperparameters: {search.best_params_}")
        else:
            pipeline.fit(X_train_subset, y_train_subset)
            best_model = pipeline

        # Evaluate the model
        y_pred = best_model.predict(X_test_subset)

        accuracy = accuracy_score(y_test_subset, y_pred)
        precision = precision_score(y_test_subset, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test_subset, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test_subset, y_pred, average='weighted', zero_division=1)

        # Print evaluation results
        print(classification_report(y_test_subset, y_pred, zero_division=1))

        # Store results
        results.append(["Vector Space", clf_name, ngram_name, accuracy, precision, recall, f1])



=== Logistic Regression ===
-- Binary Representation --




Best Hyperparameters: {'classifier__solver': 'liblinear', 'classifier__C': 0.1}
              precision    recall  f1-score   support

           0       0.59      0.49      0.53       534
           1       0.33      0.08      0.13       323
           2       0.39      0.14      0.20       463
           3       0.38      0.17      0.24       870
           4       0.75      0.96      0.84      3992

    accuracy                           0.70      6182
   macro avg       0.49      0.37      0.39      6182
weighted avg       0.63      0.70      0.64      6182

-- Frequency Count --




Best Hyperparameters: {'classifier__solver': 'liblinear', 'classifier__C': 0.1}
              precision    recall  f1-score   support

           0       0.57      0.43      0.49       534
           1       0.21      0.04      0.07       323
           2       0.35      0.12      0.18       463
           3       0.37      0.16      0.22       870
           4       0.74      0.95      0.83      3992

    accuracy                           0.69      6182
   macro avg       0.45      0.34      0.36      6182
weighted avg       0.61      0.69      0.63      6182

-- TF-IDF --




Best Hyperparameters: {'classifier__solver': 'liblinear', 'classifier__C': 1}
              precision    recall  f1-score   support

           0       0.57      0.38      0.46       534
           1       0.42      0.03      0.06       323
           2       0.37      0.06      0.11       463
           3       0.35      0.13      0.19       870
           4       0.72      0.97      0.83      3992

    accuracy                           0.69      6182
   macro avg       0.49      0.32      0.33      6182
weighted avg       0.61      0.69      0.61      6182


=== Naïve Bayes ===
-- Binary Representation --
Best Hyperparameters: {'classifier__fit_prior': False, 'classifier__alpha': 1.0}
              precision    recall  f1-score   support

           0       0.61      0.36      0.45       534
           1       0.22      0.01      0.01       323
           2       0.22      0.02      0.04       463
           3       0.28      0.15      0.20       870
           4       0.71      0.9

In [9]:
clf = LogisticRegression(solver='liblinear', C=0.1, max_iter=500)

# Create a pipeline with the classifier
pipeline_full = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('classifier', LogisticRegression(solver='liblinear', C=0.1, max_iter=500))
])

# Train the model on the full training dataset
pipeline_full.fit(X_train, y_train)

# Predict on the test set
y_pred_full = pipeline_full.predict(X_test)

# Evaluate the model on the test set
accuracy_full = accuracy_score(y_test, y_pred_full)
print(f"Accuracy on Test Set: {accuracy_full:.4f}")

# Print the classification report for more detailed evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred_full, zero_division=1))


Accuracy on Test Set: 0.7154

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.62      5644
           1       0.38      0.09      0.14      3214
           2       0.42      0.19      0.26      4679
           3       0.44      0.19      0.27      8688
           4       0.76      0.96      0.85     39602

    accuracy                           0.72     61827
   macro avg       0.53      0.40      0.43     61827
weighted avg       0.66      0.72      0.67     61827

