In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [6]:
from imblearn.over_sampling import SMOTE

In [2]:
#List of all movie genres
target_list=['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary']
#read data from csv file
data=pd.read_csv(r'./data/data_clean_v2.csv')
data

Unnamed: 0,Extraversion,Observant,Thinking,Judging,Age,way of watching movies,Action,Adventure,Animated,Comedy,...,with_With friends,with_alone,with_with boyfriend/girlfriend,with_with family,Actor/actress,director,advertisements,genres,previews,comment
0,71,64,58,93,23.0,0,5,5,3,4,...,1,0,0,0,1.0,1.0,0.0,1.0,0.0,0.0
1,51,78,76,58,21.0,1,4,2,2,3,...,1,0,0,0,1.0,1.0,0.0,0.0,1.0,0.0
2,57,63,33,39,21.0,0,2,4,5,4,...,0,0,1,0,1.0,0.0,0.0,0.0,1.0,1.0
3,83,16,65,51,23.0,0,2,2,5,5,...,0,0,1,0,1.0,0.0,0.0,1.0,0.0,1.0
4,25,59,68,32,22.0,0,4,4,2,2,...,0,0,1,0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,60,46,70,68,20.0,0,4,3,2,4,...,0,0,0,1,1.0,0.0,0.0,1.0,1.0,0.0
529,90,35,47,32,26.0,1,2,3,4,2,...,0,1,0,0,0.0,1.0,0.0,1.0,0.0,1.0
530,75,58,60,80,22.0,0,3,2,3,4,...,1,0,0,0,1.0,0.0,0.0,0.0,1.0,1.0
531,82,90,61,70,21.0,0,4,4,4,4,...,0,0,1,0,0.0,0.0,1.0,0.0,1.0,1.0


In [3]:
#create input and output dataframe
data_x=data.drop(columns=['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary'],axis=1)
data_x_without=data_x.drop(['Extraversion','Observant','Thinking','Judging'],axis=1)
data_y=data[['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary']]

In [4]:
#input standardization
MM=MinMaxScaler()
data_x[['Extraversion','Observant','Thinking','Judging','Age']]=MM.fit_transform(data_x[['Extraversion','Observant','Thinking','Judging','Age']])
data_x_without['Age']=(data_x_without['Age']-min(data_x_without['Age']))/(max(data_x_without['Age'])-min(data_x_without['Age']))

In [5]:
#function of MAPE
def MAPE(true, pred):
    nonzero = np.flatnonzero(true)
    true = true.ravel()[nonzero]
    pred = pred.ravel()[nonzero]
    diff = np.abs(np.array(true) - np.array(pred))
    mape = np.nanmean(diff / true)
    return mape

In [77]:
#training and testing in random forest
error_sum_rfr=0
error_rfr=[]
error_sum_without_rfr=0
error_without_rfr=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    rfr = RandomForestRegressor()
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    rfr.fit(x_train, y_train)
    rfr_y_predict = rfr.predict(x_test)
    error_sum_rfr+=MAPE(y_test,rfr_y_predict)
    error_rfr.append(MAPE(y_test,rfr_y_predict))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    rfr_without = RandomForestRegressor()
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    rfr_without.fit(x_train_wt, y_train_wt)
    rfr_y_without_predict = rfr_without.predict(x_test_wt)
    error_sum_without_rfr+=MAPE(y_test_wt,rfr_y_without_predict)
    error_without_rfr.append(MAPE(y_test_wt,rfr_y_without_predict))

In [78]:
#overall error in random forest
print(error_sum_rfr/9)
print(error_sum_without_rfr/9)

0.4252888542748356
0.42691921739273764


In [95]:
#training and testing in AdaBoost
error_sum_ad=0
error_ad=[]
error_sum_without_ad=0
error_without_ad=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    ad = AdaBoostRegressor()
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    ad.fit(x_train, y_train)
    ad_y_predict = ad.predict(x_test)
    error_sum_ad+=MAPE(y_test,ad_y_predict)
    error_ad.append(MAPE(y_test,ad_y_predict))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    ad_without = AdaBoostRegressor()
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    ad_without.fit(x_train_wt, y_train_wt)
    ad_y_without_predict = ad_without.predict(x_test_wt)
    error_sum_without_ad+=MAPE(y_test_wt,ad_y_without_predict)
    error_without_ad.append(MAPE(y_test_wt,ad_y_without_predict))

In [96]:
#overall error in AdaBoost
print(error_sum_ad/9)
print(error_sum_without_ad/9)

0.4085303346495243
0.41303510402751914


In [115]:
#training and testing in LightGBM
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'mape',  # 目标函数
    'metric': {'mape'},  # 评估函数
    'num_leaves': 50,  # 叶子节点数
    'learning_rate': 0.02,  # 学习速率
    'feature_fraction': 1,  # 建树的特征选择比例
    'bagging_fraction': 1,  # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}
error_sum_lgb=0
error_lgb=[]
error_sum_without_lgb=0
error_without_lgb=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    lgb_train = lgb.Dataset(x_train, y_train)
    gbm = lgb.train(params, lgb_train, num_boost_round=100)
    lgb_y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
    error_sum_lgb+=MAPE(y_test,lgb_y_pred)
    error_lgb.append(MAPE(y_test,lgb_y_pred))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=30,shuffle=True)
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    lgb_train_wt = lgb.Dataset(x_train_wt, y_train_wt)
    gbm_wt = lgb.train(params, lgb_train_wt, num_boost_round=100)
    lgb_y_pred_wt = gbm_wt.predict(x_test_wt, num_iteration=gbm_wt.best_iteration)
    error_sum_without_lgb+=MAPE(y_test_wt,lgb_y_pred_wt)
    error_without_lgb.append(MAPE(y_test_wt,lgb_y_pred_wt))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data points in the train set: 1020, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 910
[LightGBM] [Info] Number of data points in the train set: 1020, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1668
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 640
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1689
[LightGBM] [Info] Number of data points in the train set: 815, number of used features: 38
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 676
[LightGBM] [Info] Number of data points in the train set: 815, number of used features: 34
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1829
[LightGBM] [Info] Number of data points in the train set: 980, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 782
[LightGBM] [Info] Number of data points in the train set: 980, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1775
[LightGBM] [Info] Number of data points in the train set: 905, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 779
[LightGBM] [Info] Number of data points in the train set: 905, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1700
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 672
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 654
[LightGBM] [Info] Number of data points in the train set: 580, number of used features: 38
[LightGBM] [Info] Start training from score 2.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 580, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1387
[LightGBM] [Info] Number of data points in the train set: 745, number of used features: 38
[LightGBM] [Info] Start training from score 2.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 745, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1763
[LightGBM] [Info] Number of data points in the train set: 885, number of used features: 39
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 885, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


In [116]:
#overall error in LightGBM
print(error_sum_lgb/9)
print(error_sum_without_lgb/9)

0.38436977351423157
0.3905575132393722


In [117]:
print(error_rfr)
print(error_without_rfr)
print(error_ad)
print(error_without_ad)
print(error_lgb)
print(error_without_lgb)

[0.38073364485981304, 0.42346417445482876, 0.35609345794392533, 0.38637071651090343, 0.4035389408099689, 0.3754314641744549, 0.659322429906542, 0.42591433021806857, 0.41673052959501555]
[0.36821028037383174, 0.4068971962616822, 0.34843001780151317, 0.3748569574247145, 0.43276181204569053, 0.40565784008307376, 0.6512357217030116, 0.4336839304257529, 0.4205392004153687]
[0.37110128914961255, 0.393519661716194, 0.33628793824298037, 0.3706467069906128, 0.37395971888785023, 0.36977877795916464, 0.651927226633767, 0.4080587384576591, 0.4014929538078781]
[0.374216853613993, 0.4039781278566059, 0.34191681696053905, 0.3673733381988201, 0.3878934315085435, 0.395812171205055, 0.6640535950439689, 0.40537580762778663, 0.3766957942323604]
[0.3413276798451388, 0.4048108867333136, 0.33771159781617455, 0.3353841340446816, 0.3618162759083948, 0.3719335127318345, 0.5199569697404371, 0.3862278327729853, 0.4001590720351239]
[0.322837203346455, 0.38320290831907067, 0.3614975417676167, 0.34146462125960714, 0

In [118]:
#Create dataset for paires-sample t-test
res_rfr=[error_rfr,error_without_rfr]
res_rfr=pd.DataFrame(np.array(res_rfr).T)
res_ad=[error_ad,error_without_ad]
res_ad=pd.DataFrame(np.array(res_ad).T)
res_lgb=[error_lgb,error_without_lgb]
res_lgb=pd.DataFrame(np.array(res_lgb).T)

In [119]:
res_rfr.columns=['with','without']
res_ad.columns=['with','without']
res_lgb.columns=['with','without']
res_rfr.index=target_list
res_ad.index=target_list
res_lgb.index=target_list

In [120]:
res_rfr.to_csv(r'.\data\res_rfr_SMOTE.csv',index=True)
res_ad.to_csv(r'.\data\res_ad_SMOTE.csv',index=True)
res_lgb.to_csv(r'.\data\res_lgb_SMOTE.csv',index=True)