### 1. Import all the libraries and read the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import lightgbm as lgb
import xgboost as xgb
from Utils.calculate_metrics import *

from joblib import dump, load
import warnings

warnings.filterwarnings('ignore')

In [4]:
train_data = pd.read_csv('./Data/train.csv', index_col='id')
test_data = pd.read_csv('./Data/test.csv', index_col='id')
train_data.shape, test_data.shape

((7200, 139), (800, 139))

### 2. Data preprocessing and Feature Engineering

In [26]:
def preprocess_data_base(data: pd.DataFrame):
    data = data.copy()
    data = data[data["happiness"] > 0]
    
    data.loc[:, "survey_month"] = data["survey_time"].apply(lambda line: line.split(" ")[0].split("/")[1]).astype("int64")
    data.loc[:, "survey_day"] = data["survey_time"].apply(lambda line: line.split(" ")[0].split("/")[2]).astype("int64")
    data.loc[:, "survey_hour"] = data["survey_time"].apply(lambda line: line.split(" ")[1].split(":")[0]).astype("int64")
    data = data.drop(columns='survey_time')
    
    data = data.drop(columns=['edu_other', 'property_other', 'invest_other', 'join_party'])

    
    label = data.pop('happiness')
    return data, label

def preprocess_data_dim60(train_data: pd.DataFrame, selected_features: list = None) -> pd.DataFrame:
    data = train_data.copy()
    data = data[data["happiness"] > 0]
    
    # Feature engineering
    data.loc[:, "survey_month"] = data["survey_time"].apply(lambda line: line.split(" ")[0].split("/")[1]).astype("int64")
    data.loc[:, "survey_day"] = data["survey_time"].apply(lambda line: line.split(" ")[0].split("/")[2]).astype("int64")
    data.loc[:, "survey_hour"] = data["survey_time"].apply(lambda line: line.split(" ")[1].split(":")[0]).astype("int64")
    data.loc[:, "age"] = 2015 - data["birth"]

    data = data.drop(columns='survey_time')
    data = data.drop(columns=['edu_other', 'property_other', 'invest_other', 'join_party'])

    # Replace NaN in specific columns with 0
    for column in ['work_status', 'work_yr', 'work_type', 'work_manage', 's_work_status', 's_work_type']:
        data.loc[data[column].isna(), column] = 0
    
    # Replace remaining NaN values with mode
    data = data.fillna(data.mode().iloc[0])

    # Replace negative values with the mode for integer columns
    for column in data.columns:
        if data[column].dtype == np.int64:
            data.loc[data[column] < 0, column] = data[column].mode().iloc[0]
    
    if selected_features is None:
        corr_matrix = data.corr(method='pearson', numeric_only=True)['happiness'][abs(data.corr(method='pearson', numeric_only=True)['happiness'])>0.05]
        features = corr_matrix.index.values.tolist()
        features.extend(['age'])  # Adding age explicitly as it is not necessarily part of correlation
        data_selected = data[features]
    else:
        selected_features.append('happiness')
        data_selected = data[selected_features]  # Ensure the selected features match

    label = data_selected.pop('happiness')
    return data_selected, label


train_df, train_label = preprocess_data_base(train_data)
test_df, test_label = preprocess_data_base(test_data)
train_df.shape, test_df.shape, train_label.shape, test_label.shape

TypeError: preprocess_data_base() got an unexpected keyword argument 'selected_features'

### 3. Basic Modeling:

#### 3.1 Support Vector Machine (SVM)

**Our observations:**

- SVM can't handle such complex cases. It tends to predict the majority class (Happiness = 4).
- PCA can't improve the performance of SVM.

**Conclusion:** SVM is not a good choice for this dataset.

In [15]:
from sklearn import svm
from sklearn import metrics

svc_rbf = svm.SVC(kernel='rbf', C=1)
svc_rbf.fit(train_df, train_label)

predict = svc_rbf.predict(test_df)
mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

print(np.where(predict!=4))


MSE: 0.7882205513784462 Accuracy: 0.5739348370927319
(array([], dtype=int64),)


In [16]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 4)
reduced_train = pca.fit_transform(train_df)
reduced_test = pca.fit_transform(test_df)

print(reduced_train.shape, reduced_test.shape)

svc_rbf = svm.SVC(kernel='rbf')
svc_rbf.fit(reduced_train, train_label)

predict = svc_rbf.predict(reduced_test)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

np.where(predict!=4)

(7190, 4) (798, 4)
MSE: 0.7882205513784462 Accuracy: 0.5739348370927319


(array([], dtype=int64),)

### 3.2 Ridge Regression

**Our observations:**
- Betther than SVM when it comes to the mse. 
- Perform poorly in terms of the accuracy.
- It seems underfit or the model is too simple for this dataset. We try to evaluate the model with training data but the model is not able to capture the complexity of the data.

**Conclusion:**
Ridge Regression is not a good choice for this dataset.

In [17]:
from sklearn import linear_model
from sklearn import metrics

ridge = linear_model.Ridge()
ridge.fit(train_df, train_label)

predict = ridge.predict(test_df)
predict = np.round(predict)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)



MSE: 0.5827067669172933 Accuracy: 0.5764411027568922


### 3.3 Logistic Regression

**Our observations:**
- Similar to SVM, Logistic Regression can't handle such complex cases. It tends to predict the majority class (Happiness = 4).
- PCA will even make the performance worse.

**Conclusion:** 
Logistic Regression is not a good choice for this dataset.

In [18]:
from sklearn import linear_model    
from sklearn import metrics

logistic = linear_model.LogisticRegression()
logistic.fit(train_df, train_label)

predict = logistic.predict(test_df)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)


print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

np.where(predict!=4)

MSE: 0.7882205513784462 Accuracy: 0.5739348370927319


(array([], dtype=int64),)

In [19]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 8)
reduced_train = pca.fit_transform(train_df)
reduced_test = pca.fit_transform(test_df)

logistic = linear_model.LogisticRegression()
logistic.fit(reduced_train, train_label)

predict = logistic.predict(reduced_test)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)



MSE: 1.3922305764411027 Accuracy: 0.4523809523809524


### 4. Advanced Modeling:

### 4.1 LightGBM

**Our observations:**


**Conclusion:**

In [20]:
import lightgbm as lgb
from sklearn import metrics
import os

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

lgbR = lgb.LGBMRegressor()
lgbR.fit(train_df, train_label)

predict = lgbR.predict(test_df)
predict = np.round(predict)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)


print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 7190, number of used features: 75
[LightGBM] [Info] Start training from score 3.873435
MSE: 0.5726817042606517 Accuracy: 0.600250626566416


In [21]:
lgbC = lgb.LGBMClassifier()
lgbC.fit(train_df, train_label)

predict = lgbC.predict(test_df)


mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 7190, number of used features: 75
[LightGBM] [Info] Start training from score -4.380637
[LightGBM] [Info] Start training from score -2.798228
[LightGBM] [Info] Start training from score -1.940224
[LightGBM] [Info] Start training from score -0.500219
[LightGBM] [Info] Start training from score -1.734462
MSE: 0.656641604010025 Accuracy: 0.6102756892230576


In [22]:
import lightgbm as lgb
from sklearn import metrics

clf_lightgbm_modified=lgb.LGBMRegressor(metric='l2',colsample_bytree=0.8,learning_rate=0.1,
                                             max_depth=7,min_child_weight=0,min_split_gain=0.1,
                                             reg_alpha=1,reg_lambda=0.0001,subsample=0.5)   

clf_lightgbm_modified.fit(train_df, train_label)

predict = clf_lightgbm_modified.predict(test_df)
predict = np.round(predict)
mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 7190, number of used features: 75
[LightGBM] [Info] Start training from score 3.873435
MSE: 0.575187969924812 Accuracy: 0.5902255639097744


#### 4.2 XGBoost

**Our observations:**



**Conclusion:**

In [23]:
import xgboost as xgb
from sklearn import metrics


xgboost = xgb.XGBRegressor()
xgboost.fit(train_df, train_label)

predict = xgboost.predict(test_df)

predict = np.round(predict)


mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

MSE: 0.6290726817042607 Accuracy: 0.5651629072681704


In [24]:
xgboost_modified=xgb.XGBRegressor(max_depth=4,min_child_weight=0.5,gamma=0.4,subsample=0.7,colsample_bytree=0.8,reg_alpha=1,reg_lambda=0.001)   

xgboost_modified.fit(train_df, train_label)

predict = xgboost_modified.predict(test_df)
predict = np.round(predict)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

MSE: 0.656641604010025 Accuracy: 0.5701754385964912


#### 4.3 LightGBM Essambled with XGBoost

In [25]:
from sklearn import metrics
xgboost = xgb.XGBRegressor(max_depth=4,min_child_weight=0.5,gamma=0.4,subsample=0.7,colsample_bytree=0.8,reg_alpha=1,reg_lambda=0.001)   
lgbm = lgb.LGBMRegressor(metric='l2',colsample_bytree=0.8,learning_rate=0.1,
                                             max_depth=7,min_child_weight=0,
                                             reg_alpha=1,reg_lambda=0.0001,subsample=0.5)   

xgboost.fit(train_df, train_label)
lgbm.fit(train_df, train_label)

predict_xgboost = xgboost.predict(test_df)
predict_lgbm = lgbm.predict(test_df)

predict = (predict_xgboost + predict_lgbm) / 2

predict = np.round(predict)
mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
    
print('Accuracy:', accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 7190, number of used features: 75
[LightGBM] [Info] Start training from score 3.873435
MSE: 0.5852130325814536 Accuracy: 0.5877192982456141


In [13]:
np.where(predict==5)

(array([ 41,  58,  64,  73, 112, 114, 124, 178, 183, 212, 214, 242, 277,
        298, 309, 313, 341, 356, 366, 382, 392, 505, 549, 568, 584, 600,
        603, 605, 612, 642, 661, 695, 720]),)