Project: Introvert or Extrovert Prediction

Task: Model Training

Candidate: Himantha Weerasingha

In [None]:
#Import libaries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import os
# import numpy as np
import joblib


In [18]:
# Define the path for cleaned dataset
pwd = os.getcwd()
dataset_path = f"{pwd}\\Cleaned_dataset.csv"

In [19]:
# Load the dataset
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,1
1,9.0,1,0.0,0.0,1,0.0,3.0,0
2,9.0,1,1.0,2.0,1,5.0,2.0,0
3,0.0,0,6.0,7.0,0,14.0,8.0,1
4,3.0,0,9.0,4.0,0,8.0,5.0,1


In [None]:
# # Check for unwanted data (e.g. typos, unexpected values)
# def mising_unwanted(data_frame):
#     print('Sage_fear unique data: ',data_frame['Stage_fear'].unique())
#     print('Drained_after_socializing unique data: ', data_frame['Drained_after_socializing'].unique())
#     print('Personality unique data: ', data_frame['Personality'].unique())
#     print("\nMissing Values")
#     print(data_frame.isnull().sum())
    
# mising_unwanted(df)

Sage_fear unique data:  [0 1]
Drained_after_socializing unique data:  [0 1]
Personality unique data:  [1 0]

Missing Values
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64


In [20]:
# Split features and target
x_dataset = df.drop('Personality', axis=1)
y_dataset = df['Personality']

In [None]:
# split the dataset into train and test datasets
# X_train, X_test, y_train, y_test = train_test_split(x_dataset, y_dataset, test_size=0.1, random_state=40)

In [None]:
# #Train the model
# model = LogisticRegression()
# model.fit(X_train, y_train)

In [None]:
# # Evaluate the model
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"✅ Model trained. Accuracy: {accuracy:.2f}")

✅ Model trained. Accuracy: 0.93


In [21]:
# Define models to compare
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=20),
}

In [22]:
# Set up K-Fold Cross Validation
k = 5 
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [23]:
# Define some parameters for further uses
best_avg_acc = 0.0
best_model_name = ""
Overall_best_model = None
best_log_model = None
best_rand_model = None

In [24]:
# compare each model and find the best trained model and best method
for name, model in models.items():
    accuracies = []
    best_accuracy = 0.0
    
    # Use Kfold method and find best trained model for each method
    for train_index, test_index in kf.split(x_dataset):
        
        # Separete training and testing dataset
        X_train, X_test = x_dataset.iloc[train_index], x_dataset.iloc[test_index]
        y_train, y_test = y_dataset.iloc[train_index], y_dataset.iloc[test_index]

        # Create a fresh model instance
        model_instance = model.__class__()
        
        # Train and predict the accuracies
        model_instance.fit(X_train, y_train)

        y_pred = model_instance.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        
        accuracies.append(acc)
        
        # Save the best fold model for each method
        if acc > best_accuracy:
            best_accuracy = acc
            if name == "LogisticRegression":               
                best_log_model = model_instance
            else:
                best_rand_model = model_instance
                
        
    
    avg_acc = sum(accuracies) / len(accuracies)
    
    # Find the best method and extract best trained model from that method
    if avg_acc > best_avg_acc:
        best_avg_acc = avg_acc
        best_model_name = name
        if name == "LogisticRegression":
            Overall_best_model = best_log_model
        else:
            Overall_best_model = best_rand_model
    
    print(f"Model : {name}")
    print(f"K-Fold ({k}) Cross-Validation Accuracies: {accuracies}")
    print(f"{name}: Average accuracy: {avg_acc:.4f}")
    print("-"*30)



Model : LogisticRegression
K-Fold (5) Cross-Validation Accuracies: [0.9274193548387096, 0.9415322580645161, 0.9212121212121213, 0.9151515151515152, 0.9212121212121213]
LogisticRegression: Average accuracy: 0.9253
------------------------------
Model : RandomForest
K-Fold (5) Cross-Validation Accuracies: [0.9092741935483871, 0.9213709677419355, 0.9050505050505051, 0.8888888888888888, 0.896969696969697]
RandomForest: Average accuracy: 0.9043
------------------------------


In [158]:
print(f"Best_model: {best_model_name}")
print(f"\nOverall Best Model: {best_model_name} with Avg Accuracy: {best_avg_acc:.4f}")

Best_model: LogisticRegression

Overall Best Model: LogisticRegression with Avg Accuracy: 0.9253


In [159]:
# Save best model
joblib.dump(Overall_best_model, "best_model.pkl")
print("Saved as 'best_model.pkl'")

Saved as 'best_model.pkl'
