In [1]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read dataset.csv (resolved relative to the current working directory) into a DataFrame.
data = pd.read_csv("dataset.csv")
# Last expression of the cell: displays the first rows as a quick look at the loaded columns.
data.head()

Unnamed: 0,Question,Answer,Age,Gender,Personality
0,I prefer quiet evenings at home rather than go...,1,45,0,Introversion
1,I enjoy spending time alone with my thoughts.,1,26,1,Introversion
2,I find large social gatherings overwhelming.,1,36,1,Introversion
3,I feel drained after spending too much time in...,1,21,0,Introversion
4,I prefer deep conversations with a few close f...,1,29,1,Introversion


In [3]:
# The 'Personality' column is the label; every other column is a candidate feature.
y = data['Personality']
X = data.drop('Personality', axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

In [4]:
# Turn the free-text "Question" column into integer codes and keep only the
# columns the model is trained on ("Age" and "Gender" are discarded).
label_encoder = LabelEncoder()
columns_to_drop = ["Question", "Age", "Gender"]

# Fit the encoder on the training split only: this defines the
# question -> integer mapping that the model will learn from.
question_train = pd.DataFrame(
    {"Question": label_encoder.fit_transform(X_train["Question"])}
)
# reset_index aligns the remaining columns with the fresh 0..n-1 index of
# question_train so that concat pairs rows by position.
X_train = pd.concat(
    [question_train, X_train.drop(columns=columns_to_drop).reset_index(drop=True)],
    axis=1,
)

# Reuse the mapping learned above. Re-fitting on the test split would derive the
# codes from a different vocabulary, so the same question could receive a
# different integer than the one seen during training. transform() raises
# ValueError if the test split contains a question absent from the training split.
question_test = pd.DataFrame(
    {"Question": label_encoder.transform(X_test["Question"])}
)
X_test = pd.concat(
    [question_test, X_test.drop(columns=columns_to_drop).reset_index(drop=True)],
    axis=1,
)

In [5]:
# Sanity check: the training features and labels must have the same number of rows.
print(X_train.shape, y_train.shape, sep="\n")

(8400, 2)
(8400,)


In [6]:
# Sanity check: the test features and labels must have the same number of rows.
print(X_test.shape, y_test.shape, sep="\n")

(3600, 2)
(3600,)


In [7]:
# Reuse a previously trained model when one is cached on disk; otherwise train
# a small Random Forest and cache it for later runs.
random_forest_path = "random_forest.pkl"
if os.path.exists(random_forest_path):
    # SECURITY: pickle.load can execute arbitrary code, so only load a file that
    # this notebook (or another trusted source) produced.
    # The cached model is used as-is: delete the file to force a retrain after
    # changing the data, the split, or the feature encoding.
    with open(random_forest_path, 'rb') as file:
        rf_model = pickle.load(file)
    print(f"Loaded cached Random Forest model from {random_forest_path}.")
else:
    # A fixed random_state makes the bootstrap sampling and feature selection
    # repeatable, so retraining on the same data yields the same forest.
    rf_model = RandomForestClassifier(n_estimators=5, random_state=109)
    rf_model.fit(X_train, y_train)
    with open(random_forest_path, 'wb') as file:
        pickle.dump(rf_model, file)
    print(f"New Random Forest model trained and saved at {random_forest_path}.")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Get SVM predictions and evaluate the SVM model
rf_predictions_train = rf_model.predict(X_train)
rf_predictions_test = rf_model.predict(X_test)
print("Random Forest Model Evaluation.")
print(f'Training Accuracy: {accuracy_score(y_train, rf_predictions_train) * 100:.5f}%')
print(f'Validation Accuracy: {accuracy_score(y_test, rf_predictions_test) * 100:.5f}%')

Random Forest Model Evaluation.
Training Accuracy: 100.00000%
Validation Accuracy: 100.00000%


In [9]:
# Print the labels predicted for the test split.
print(rf_predictions_test)

['Intuition' 'Introversion' 'Intuition' ... 'Feeling' 'Judging'
 'Intuition']
<class 'numpy.ndarray'>


In [10]:
# Print the true test labels for comparison with the predictions.
print(y_test)

8940        Intuition
3213     Introversion
7845        Intuition
8166          Feeling
1607     Extroversion
             ...     
3256          Feeling
10545       Intuition
4257          Feeling
2592          Judging
5528        Intuition
Name: Personality, Length: 3600, dtype: object
