### 🛠️ Model Pipeline

The strongest relationship is between extro and extra,
so we will predict extro (Big5)!

### 🥸 Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Columns dropped before modelling: the row id, the two accuracy columns,
# and the columns Q1 through Q10.
unused_index = ['id', 'big5Accuracy', 'mbtiAccuracy'] + [f'Q{i}' for i in range(1, 11)]

# Read the zipped CSV and discard the unused columns in one chained step.
data = pd.read_csv(
    '../data/big-five-mbti-and-tipi-test-results.zip',
    compression="zip",
).drop(columns=unused_index)

In [3]:
# Korean display names, assigned positionally to the remaining nine columns.
# NOTE(review): this relies on the column order of the loaded file — confirm.
# openness, conscientiousness, extraversion, agreeableness, neuroticism
big5_labels = ['개방성', '성실성', '외향성', '우호성', '신경성']
# extraversion, intuition, thinking, judging
mbti_labels = ['외향', '직관', '사고', '판단']

kor_index = [*big5_labels, *mbti_labels]
data.columns = kor_index

In [4]:
# Predictors are limited to the '외향' and '판단' columns: per the author's
# note, adding other columns degrades performance because of the influence
# of extroversion.
feature_columns = ['외향', '판단']
x = data[feature_columns]

# Regression target: the '외향성' column.
y = data['외향성']

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
print("Train Size: ", x_train.shape)
print("Test Size: ", x_test.shape)

# Expand the features into polynomial terms up to degree 5.
# The transformer is fitted on the training split only and then applied
# unchanged to the test split with transform(), so nothing is ever fitted
# on held-out data.
# NOTE(review): the model cells visible after this one fit on x_train / x_test,
# not on x_train_poly / x_test_poly — confirm whether the polynomial features
# are meant to be used.
poly = PolynomialFeatures(degree=5)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)
print("After Poly Train Size: ", x_train_poly.shape)

Train Size:  (112, 2)
Test Size:  (28, 2)
After Poly Train Size:  (112, 21)


### 🤖 Run Model

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [7]:
# Baseline: ordinary least squares on the two raw features.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(x_train, y_train)

# score() reports the R^2 coefficient of determination on the held-out split.
square = lr.score(x_test, y_test)
print(square)

0.383248555409454


In [8]:
# Shared seed (same value as the train/test split) so the stochastic
# estimators produce the same numbers on every Restart & Run All.
RANDOM_STATE = 1

# Candidate regressors, keyed by display name; all are fitted on the same
# training split and compared on the same test split.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest Regressor': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'XGBoost Regressor': XGBRegressor(random_state=RANDOM_STATE)
}

# Test-set mean squared error for each model (lower is better).
results = {}
for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    results[model_name] = mse

for model_name, mse in results.items():
    print(f'{model_name}: Mean Squared Error = {mse:.2f}')


Linear Regression: Mean Squared Error = 67.67
Decision Tree Regressor: Mean Squared Error = 134.89
Random Forest Regressor: Mean Squared Error = 86.30
Gradient Boosting Regressor: Mean Squared Error = 87.65
XGBoost Regressor: Mean Squared Error = 108.22
