In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
#List of all movie genres
target_list=['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary']

In [3]:
#read data from csv file
data=pd.read_csv(r'./data/data_clean_v2.csv')

In [5]:
#create input and output dataframe
data_x=data.drop(columns=['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary'],axis=1)
data_x_ex=data_x.drop(['Observant','Thinking','Judging'],axis=1)
data_x_without=data_x.drop(['Extraversion','Observant','Thinking','Judging'],axis=1)
data_y=data[['Action','Adventure','Animated','Comedy','Historical','Fantasy','Horror','Science fiction','Documentary']]

In [7]:
#input standardization
MM=MinMaxScaler()
data_x_ex[['Extraversion','Age']]=MM.fit_transform(data_x_ex[['Extraversion','Age']])
data_x_without['Age']=(data_x_without['Age']-min(data_x_without['Age']))/(max(data_x_without['Age'])-min(data_x_without['Age']))

In [11]:
#function of MAPE
def MAPE(true, pred):
    nonzero = np.flatnonzero(true)
    true = true.ravel()[nonzero]
    pred = pred.ravel()[nonzero]
    diff = np.abs(np.array(true) - np.array(pred))
    mape = np.nanmean(diff / true)
    return mape

In [38]:
#training and testing in random forest
error_sum_rfr=0
error_rfr=[]
error_sum_without_rfr=0
error_without_rfr=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x_ex,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    rfr = RandomForestRegressor()
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    rfr.fit(x_train, y_train)
    rfr_y_predict = rfr.predict(x_test)
    error_sum_rfr+=MAPE(y_test,rfr_y_predict)
    error_rfr.append(MAPE(y_test,rfr_y_predict))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    rfr_without = RandomForestRegressor()
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    rfr_without.fit(x_train_wt, y_train_wt)
    rfr_y_without_predict = rfr_without.predict(x_test_wt)
    error_sum_without_rfr+=MAPE(y_test_wt,rfr_y_without_predict)
    error_without_rfr.append(MAPE(y_test_wt,rfr_y_without_predict))

In [39]:
#overall error in random forest
print(error_sum_rfr/9)
print(error_sum_without_rfr/9)

0.39142540671512643
0.38903019210799583


In [40]:
#training and testing in AdaBoost
error_sum_ad=0
error_ad=[]
error_sum_without_ad=0
error_without_ad=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x_ex,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    ad = AdaBoostRegressor()
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    ad.fit(x_train, y_train)
    ad_y_predict = ad.predict(x_test)
    error_sum_ad+=MAPE(y_test,ad_y_predict)
    error_ad.append(MAPE(y_test,ad_y_predict))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    ad_without = AdaBoostRegressor()
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    ad_without.fit(x_train_wt, y_train_wt)
    ad_y_without_predict = ad_without.predict(x_test_wt)
    error_sum_without_ad+=MAPE(y_test_wt,ad_y_without_predict)
    error_without_ad.append(MAPE(y_test_wt,ad_y_without_predict))

In [41]:
#overall error in AdaBoost
print(error_sum_ad/9)
print(error_sum_without_ad/9)

0.3976049572707298
0.3916997614060008


In [42]:
#training and testing in LightGBM
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  
    'objective': 'mape',  
    'metric': {'mape'},  
    'num_leaves': 50,  
    'learning_rate': 0.02, 
    'feature_fraction': 1,  
    'bagging_fraction': 1,  
    'bagging_freq': 5, 
    'verbose': 1  
}
error_sum_lgb=0
error_lgb=[]
error_sum_without_lgb=0
error_without_lgb=[]
for i in target_list:
    x_train, x_test ,y_train,y_test=train_test_split(data_x_ex,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    x_train, y_train = SMOTE().fit_resample(x_train,y_train)
    lgb_train = lgb.Dataset(x_train, y_train)
    gbm = lgb.train(params, lgb_train, num_boost_round=100)
    lgb_y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
    error_sum_lgb+=MAPE(y_test,lgb_y_pred)
    error_lgb.append(MAPE(y_test,lgb_y_pred))
    x_train_wt, x_test_wt ,y_train_wt,y_test_wt=train_test_split(data_x_without,data_y[i],test_size=0.2, random_state=50,shuffle=True)
    x_train_wt, y_train_wt = SMOTE().fit_resample(x_train_wt,y_train_wt)
    lgb_train_wt = lgb.Dataset(x_train_wt, y_train_wt)
    gbm_wt = lgb.train(params, lgb_train_wt, num_boost_round=100)
    lgb_y_pred_wt = gbm_wt.predict(x_test_wt, num_iteration=gbm_wt.best_iteration)
    error_sum_without_lgb+=MAPE(y_test_wt,lgb_y_pred_wt)
    error_without_lgb.append(MAPE(y_test_wt,lgb_y_pred_wt))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 775, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 775, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 930
[LightGBM] [Info] Number of data points in the train set: 810, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 810, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1102
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 898
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1011
[LightGBM] [Info] Number of data points in the train set: 895, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 772
[LightGBM] [Info] Number of data points in the train set: 895, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 942
[LightGBM] [Info] Number of data points in the train set: 860, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 708
[LightGBM] [Info] Number of data points in the train set: 860, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 555, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231
[LightGBM] [Info] Number of data points in the train set: 555, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 644
[LightGBM] [Info] Number of data points in the train set: 715, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 414
[LightGBM] [Info] Number of data points in the train set: 715, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1000
[LightGBM] [Info] Number of data points in the train set: 865, number of used features: 36
[LightGBM] [Info] Start training from score 2.000000


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 726
[LightGBM] [Info] Number of data points in the train set: 865, number of used features: 35
[LightGBM] [Info] Start training from score 2.000000


In [43]:
#overall error in LightGBM
print(error_sum_lgb/9)
print(error_sum_without_lgb/9)

0.3588080627798317
0.3579698860100006


In [44]:
print(error_rfr)
print(error_without_rfr)
print(error_ad)
print(error_without_ad)
print(error_lgb)
print(error_without_lgb)
res_rfr=[error_rfr,error_without_rfr]
res_rfr=pd.DataFrame(np.array(res_rfr).T)
res_ad=[error_ad,error_without_ad]
res_ad=pd.DataFrame(np.array(res_ad).T)
res_lgb=[error_lgb,error_without_lgb]
res_lgb=pd.DataFrame(np.array(res_lgb).T)
res_rfr.columns=['with','without']
res_ad.columns=['with','without']
res_lgb.columns=['with','without']
res_rfr.index=target_list
res_ad.index=target_list
res_lgb.index=target_list

[0.26304517133956384, 0.307733644859813, 0.39357710280373837, 0.3356542056074766, 0.3973909657320873, 0.34477336448598134, 0.723753894080997, 0.3614345794392524, 0.39546573208722746]
[0.2473929127725857, 0.3068596832814123, 0.3774022585669782, 0.340052310488058, 0.39221209761163023, 0.35792048286604367, 0.7181656282450674, 0.3606448598130841, 0.40062149532710295]
[0.28059179384828536, 0.33263018512689707, 0.3660804759316871, 0.3335311490125255, 0.41409406303405627, 0.3688053098509279, 0.7484236496699662, 0.3434745246963558, 0.39081346426586666]
[0.30046399036935195, 0.31282576338133733, 0.3818817581587407, 0.3267256933919783, 0.39894789686290905, 0.35052342925006663, 0.7433758349736306, 0.343915651287371, 0.3666378349786222]
[0.2627674736214837, 0.3089358255485913, 0.32525846346165854, 0.3351262624818811, 0.43351400825697145, 0.3292009601667204, 0.5411230643746142, 0.32854033159264134, 0.364806175513923]
[0.2631360481945253, 0.2875005363095978, 0.35086962658137055, 0.3262021164248981, 

In [45]:
res_rfr.to_csv(r'.\data\res_rfr_ex.csv',index=True)
res_ad.to_csv(r'.\data\res_ad_ex.csv',index=True)
res_lgb.to_csv(r'.\data\res_lgb_ex.csv',index=True)