### 1. Import all the libraries and read the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import lightgbm as lgb
import xgboost as xgb


from joblib import dump, load
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('./Data/train.csv', index_col='id')
test_data = pd.read_csv('./Data/test.csv', index_col='id')
train_data.shape, test_data.shape

((7200, 139), (800, 139))

### 2. Data preprocessing and Feature Engineering

In [3]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    
    data = data[data["happiness"]>0]
    
    data["survey_month"] = data["survey_time"].transform(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")   #返回调查月：用空格来切分日期和时间，日期中第1项为月
    data["survey_day"] = data["survey_time"].transform(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")   #返回调查日
    data["survey_hour"] = data["survey_time"].transform(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")   #返回调查小时
    data=data.drop(columns='survey_time')
    
    data.drop(columns=['edu_other','property_other','invest_other', 'join_party'],inplace=True)  
    data.fillna(data.mean(),inplace=True)
    label = data.pop('happiness')

    
    return data, label



def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    pass
    
    return data

train_df, train_label = preprocess_data(train_data)
test_df, test_label = preprocess_data(test_data)
train_df.shape, test_df.shape, train_label.shape, test_label.shape

((7190, 136), (798, 136), (7190,), (798,))

### 3. Basic Modeling:

#### 3.1 Support Vector Machine (SVM)

**Our observations:**

- SVM can't handle such complex cases. It tends to predict the majority class (Happiness = 4).
- PCA can't improve the performance of SVM.

**Conclusion:** SVM is not a good choice for this dataset.

In [4]:
from sklearn import svm
from sklearn import metrics

svc_rbf = svm.SVC(kernel='rbf', C=1)
svc_rbf.fit(train_df, train_label)

predict = svc_rbf.predict(test_df)
mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

print(np.where(predict!=4))


MSE: 0.7894736842105263 Accuracy: 0.5726817042606517
(array([666]),)


In [5]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 4)
reduced_train = pca.fit_transform(train_df)
reduced_test = pca.fit_transform(test_df)

print(reduced_train.shape, reduced_test.shape)

svc_rbf = svm.SVC(kernel='rbf')
svc_rbf.fit(reduced_train, train_label)

predict = svc_rbf.predict(reduced_test)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

np.where(predict!=4)

(7190, 4) (798, 4)
MSE: 0.7894736842105263 Accuracy: 0.5726817042606517


(array([666]),)

### 3.2 Ridge Regression

**Our observations:**
- Betther than SVM when it comes to the mse. 
- Perform poorly in terms of the accuracy.
- It seems underfit or the model is too simple for this dataset. We try to evaluate the model with training data but the model is not able to capture the complexity of the data.

**Conclusion:**
Ridge Regression is not a good choice for this dataset.

In [6]:
from sklearn import linear_model
from sklearn import metrics

ridge = linear_model.Ridge()
ridge.fit(train_df, train_label)

predict = ridge.predict(test_df)
predict = np.round(predict)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)



MSE: 0.6303258145363408 Accuracy: 0.5726817042606517


### 3.3 Logistic Regression

**Our observations:**
- Similar to SVM, Logistic Regression can't handle such complex cases. It tends to predict the majority class (Happiness = 4).
- PCA will even make the performance worse.

**Conclusion:** 
Logistic Regression is not a good choice for this dataset.

In [11]:
from sklearn import linear_model    
from sklearn import metrics

logistic = linear_model.LogisticRegression()
logistic.fit(train_df, train_label)

predict = logistic.predict(test_df)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)


print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)

np.where(predict!=4)

MSE: 0.7894736842105263 Accuracy: 0.5726817042606517


(array([718]),)

In [21]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 8)
reduced_train = pca.fit_transform(train_df)
reduced_test = pca.fit_transform(test_df)

logistic = linear_model.LogisticRegression()
logistic.fit(reduced_train, train_label)

predict = logistic.predict(reduced_test)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)



MSE: 0.8922305764411027 Accuracy: 0.43107769423558895


### 4. Advanced Modeling:

### 4.1 LightGBM

**Our observations:**


**Conclusion:**

In [25]:
import lightgbm as lgb
from sklearn import metrics
import os

os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

lgbm = lgb.LGBMRegressor()
lgbm.fit(train_df, train_label)

predict = lgbm.predict(test_df)
predict = np.round(predict)

mse = metrics.mean_squared_error(test_label, predict)
accuracy = metrics.accuracy_score(test_label, predict)

print('MSE:', mse, end=' ')
print('Accuracy:', accuracy)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3098
[LightGBM] [Info] Number of data points in the train set: 7190, number of used features: 132
[LightGBM] [Info] Start training from score 3.873435
MSE: 0.5952380952380952 Accuracy: 0.5864661654135338
