In [2]:
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [3]:
# Read the pickled train/test features and labels from ../data/processed.
# NOTE(review): pickle.load can run arbitrary code while deserialising, so these
# files should only ever come from a trusted preprocessing step.
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


X_train = _load_pickle("../data/processed/X_train.pkl")
X_test = _load_pickle("../data/processed/X_test.pkl")
y_train = _load_pickle("../data/processed/y_train.pkl")
y_test = _load_pickle("../data/processed/y_test.pkl")

print("Data loaded successfully")


Data loaded successfully


In [4]:
# Logistic regression: fit on the training split, then predict the test split.
# max_iter=1000 caps the number of solver iterations.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Score the test-set predictions; every metric is called as metric(y_test, lr_pred)
# with its default settings, and keys are inserted in the order listed below.
lr_results = {
    "Model": "Logistic Regression",
    **{
        metric_name: score_fn(y_test, lr_pred)
        for metric_name, score_fn in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1-Score", f1_score),
        )
    },
}

lr_results


{'Model': 'Logistic Regression',
 'Accuracy': 0.8454113800887366,
 'Precision': 0.8708646454176384,
 'Recall': 0.9535609571550175,
 'F1-Score': 0.9103386004514673}

In [5]:
# Multinomial naive Bayes with default hyperparameters: fit on the training
# split, then predict labels for the test split.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Build the result record one metric at a time (insertion order is the column
# order seen when the records are later turned into a DataFrame).
nb_results = {"Model": "Naive Bayes"}
nb_results["Accuracy"] = accuracy_score(y_test, nb_pred)
nb_results["Precision"] = precision_score(y_test, nb_pred)
nb_results["Recall"] = recall_score(y_test, nb_pred)
nb_results["F1-Score"] = f1_score(y_test, nb_pred)

nb_results


{'Model': 'Naive Bayes',
 'Accuracy': 0.819179574998054,
 'Precision': 0.8922594142259415,
 'Recall': 0.8874491629622624,
 'F1-Score': 0.8898477879463227}

In [6]:
# Linear support vector classifier.
# random_state is pinned because LinearSVC's dual coordinate-descent solver
# shuffles the data with a pseudo-random generator; without a seed, reruns can
# produce slightly different coefficients and metrics. The value 42 matches the
# seed already used for the random forest in this notebook.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
svm_pred = svm_model.predict(X_test)

# Test-set metrics, each computed with scikit-learn's default settings.
svm_results = {
    "Model": "Support Vector Machine",
    "Accuracy": accuracy_score(y_test, svm_pred),
    "Precision": precision_score(y_test, svm_pred),
    "Recall": recall_score(y_test, svm_pred),
    "F1-Score": f1_score(y_test, svm_pred)
}

svm_results


{'Model': 'Support Vector Machine',
 'Accuracy': 0.8409745465867517,
 'Precision': 0.8756385414831778,
 'Recall': 0.9403196822094013,
 'F1-Score': 0.9068271993432754}

In [7]:
# Random forest of 100 trees, seeded (random_state=42) so reruns give the same
# forest. Fit on the training split, then predict the test split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Pair each column label with its value; the order here is the key order of the
# resulting dict.
rf_results = dict(
    zip(
        ("Model", "Accuracy", "Precision", "Recall", "F1-Score"),
        (
            "Random Forest",
            accuracy_score(y_test, rf_pred),
            precision_score(y_test, rf_pred),
            recall_score(y_test, rf_pred),
            f1_score(y_test, rf_pred),
        ),
    )
)

rf_results


{'Model': 'Random Forest',
 'Accuracy': 0.8563088658830855,
 'Precision': 0.8612467919529763,
 'Recall': 0.983921308994609,
 'F1-Score': 0.9185060921772912}

In [8]:
# Stack the four per-model result dicts into one table: one row per model, one
# column per dict key (Model, Accuracy, Precision, Recall, F1-Score).
model_records = [lr_results, nb_results, svm_results, rf_results]
comparison_df = pd.DataFrame(model_records)

comparison_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.845411,0.870865,0.953561,0.910339
1,Naive Bayes,0.81918,0.892259,0.887449,0.889848
2,Support Vector Machine,0.840975,0.875639,0.94032,0.906827
3,Random Forest,0.856309,0.861247,0.983921,0.918506


In [9]:
# Select the row of the comparison table with the highest F1-Score.
# NOTE(review): the scores in comparison_df were computed against y_test, so the
# winner is chosen on the same data used to report its performance — consider a
# validation split or cross-validation for model selection.
top_f1_index = comparison_df["F1-Score"].idxmax()
best_model = comparison_df.loc[top_f1_index]
best_model


Model        Random Forest
Accuracy          0.856309
Precision         0.861247
Recall            0.983921
F1-Score          0.918506
Name: 3, dtype: object