Project: Introvert or Extrovert Prediction

Task: Model Training

Candidate: Himantha Weerasingha

In [1]:
#Import libaries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
import os
# import numpy as np
import joblib


In [2]:
# Define the path for cleaned dataset
pwd = os.getcwd()
dataset_path = f"{pwd}\\Cleaned_dataset.csv"

In [3]:
# Load the dataset
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,1
1,9.0,1,0.0,0.0,1,0.0,3.0,0
2,9.0,1,1.0,2.0,1,5.0,2.0,0
3,0.0,0,6.0,7.0,0,14.0,8.0,1
4,3.0,0,9.0,4.0,0,8.0,5.0,1


In [4]:
# Split features and target
x_dataset = df.drop('Personality', axis=1)
y_dataset = df['Personality']

In [5]:
# Define models to compare
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=20),
}

In [6]:
# Set up K-Fold Cross Validation
k = 5 
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [7]:
# Define some parameters for further uses
best_avg_acc = 0.0
best_model_name = ""
Overall_best_model = None
best_log_model = None
best_rand_model = None

In [11]:
# compare each model and find the best trained model and best method
for name, model in models.items():
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    best_f1 = 0.0
    
    
    # Use Kfold method and find best trained model for each method
    for train_index, test_index in kf.split(x_dataset):
        
        # Separete training and testing dataset
        X_train, X_test = x_dataset.iloc[train_index], x_dataset.iloc[test_index]
        y_train, y_test = y_dataset.iloc[train_index], y_dataset.iloc[test_index]

        # Create a fresh model instance
        model_instance = model.__class__()
        
        # Train and predict the accuracies
        model_instance.fit(X_train, y_train)

        y_pred = model_instance.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        accuracies.append(acc)
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        
        # Save the best fold model for each method
        if acc > best_f1:
            best_f1 = acc
            if name == "LogisticRegression":               
                best_log_model = model_instance
            else:
                best_rand_model = model_instance
                
        
    
    avg_acc = sum(accuracies) / len(accuracies)
    avg_prec = sum(precisions) / len(precisions)
    avg_rec = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)
    
    
    # Find the best method and extract best trained model from that method
    if avg_acc > best_avg_acc:
        best_avg_acc = avg_acc
        best_model_name = name
        if name == "LogisticRegression":
            Overall_best_model = best_log_model
        else:
            Overall_best_model = best_rand_model
    
    print(f"Model : {name}")
    print(f"K-Fold ({k}) Cross-Validation Accuracies: {accuracies}")
    # print(f"{name}: Average accuracy: {avg_acc:.4f}")
    print(f"K-Fold ({k}) Cross-Validation Precision:  {precisions}")
    print(f"K-Fold ({k}) Cross-Validation Recall:     {recalls}")
    print(f"K-Fold ({k}) Cross-Validation F1:         {f1s}")
    print(f"{name}: Average Accuracy: {avg_acc:.4f}, Precision: {avg_prec:.4f}, Recall: {avg_rec:.4f}, F1: {avg_f1:.4f}")
    print("-"*30)



Model : LogisticRegression
K-Fold (5) Cross-Validation Accuracies: [0.9274193548387096, 0.9415322580645161, 0.9212121212121213, 0.9151515151515152, 0.9212121212121213]
K-Fold (5) Cross-Validation Precision:  [0.9280922740463473, 0.942004734420006, 0.9216165837558028, 0.9167804996779432, 0.9219022881880025]
K-Fold (5) Cross-Validation Recall:     [0.9274193548387096, 0.9415322580645161, 0.9212121212121213, 0.9151515151515152, 0.9212121212121213]
K-Fold (5) Cross-Validation F1:         [0.9273780185344116, 0.9415806681863186, 0.9212301327372706, 0.9152340283060734, 0.921252668993337]
LogisticRegression: Average Accuracy: 0.9253, Precision: 0.9261, Recall: 0.9253, F1: 0.9253
------------------------------
Model : RandomForest
K-Fold (5) Cross-Validation Accuracies: [0.9032258064516129, 0.9254032258064516, 0.907070707070707, 0.8888888888888888, 0.8929292929292929]
K-Fold (5) Cross-Validation Precision:  [0.9032457421537274, 0.9253882226383101, 0.9085335004569166, 0.8899666862629825, 0.8936

In [9]:
print(f"Best_model: {best_model_name}")
print(f"\nOverall Best Model: {best_model_name} with Avg Accuracy: {best_avg_acc:.4f}")

Best_model: LogisticRegression

Overall Best Model: LogisticRegression with Avg Accuracy: 0.9253


In [10]:
# Save best model
joblib.dump(Overall_best_model, "best_model.pkl")
print("Saved as 'best_model.pkl'")

Saved as 'best_model.pkl'
