Kaggle Competition으로 2017년말에 개최되었던 "WSDM - KKBox's Music Recommendation Challenge" 데이터

- class 변수인 'target' 필드에서 1의 값이 전체 데이터의 0.1%, 1%, 10%가 되도록 축소하여 3가지의 data imbalance 상황을 만든다.

- 평가척도로는 Accuracy, ROC-AUC, Recall, Precision, F1-score 등 5가지를 사용하고, "Survey of resampling techniques for improving classification performance in unbalanced datasets" 논문(첨부파일)에서 테스트한 기법 중 imbalanced-learn 패키지에서 지원하는 것은 모두 테스트한다.

In [58]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
!pip install -U imbalanced-learn
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



## 데이터 처리

In [59]:
# Read the EUC-KR encoded training feature CSV and keep a seeded 5% row sample.
data = pd.read_csv(r'WSDM_train_full_features.csv', encoding='euc-kr').sample(frac=0.05, random_state=0)
data.shape

(368871, 30)

In [60]:
# Preview the first five rows of the sampled frame.
data.head(n=5)

Unnamed: 0,target,user_idx,song_idx,ssn_idx,sst_idx,st_idx,song_length,genr_idx,art_idx,com_idx,...,lyricists_count,composer_count,is_featured,artist_count,artist_composer,artist_composer_lyricist,song_lang_boolean,smaller_song,count_song_played,count_artist_played
4536199,1,9481,8945,1,1,2,233116,0,146,2,...,0,1,0,0,0,0,0,1,202,2771
2588100,0,15666,20,8,3,0,189846,4,3339,8365,...,0,1,0,0,0,0,0,1,3496,6250
1650329,1,9919,7269,5,1,4,287137,0,166,12577,...,0,1,0,0,0,0,0,0,133,33035
2732406,0,19109,118170,5,3,4,192783,25,278,188,...,0,1,0,0,1,0,0,1,7,540
613088,1,818,6790,1,1,1,221309,1,11967,2,...,0,1,0,0,0,0,0,1,11,26


In [21]:
# Positional split: column 0 is taken as the label, all remaining columns as features.
data_y = data.iloc[:, 0]
data_x = data.iloc[:, 1:]

In [22]:
# Row counts of the two classes in the sampled frame.
len_0 = len(data.query("target==0"))
len_1 = len(data.query("target==1"))
# Requested positive fraction -> number of target==0 rows per target==1 row.
imbalacned_list_p = {0.001: 999, 0.01: 99, 0.1: 9}


def making_imbalanced(x):
    """Return all target==0 rows plus a down-sampled set of target==1 rows.

    ``x`` must be one of the keys of ``imbalacned_list_p`` (any other value
    raises ``KeyError``). The target==1 rows are drawn with
    ``random_state=42``. The achieved positive fraction is printed before
    the combined frame is returned.
    """
    negatives_per_positive = imbalacned_list_p[x]
    n_positive = round(
        (len_0 / negatives_per_positive * (negatives_per_positive + 1)) - len_0
    )
    negatives = data.query("target==0")
    positives = data.query("target==1").sample(n=n_positive, random_state=42)
    combined = pd.concat([negatives, positives])
    print(len(positives) / len(combined))
    return combined

In [23]:
# 10% positives (9:1).
data_1 = making_imbalanced(0.1)
data_y1 = data_1.iloc[:, 0]
data_x1 = data_1.iloc[:, 1:]

# 1% positives (99:1).
data_01 = making_imbalanced(0.01)
data_y01 = data_01.iloc[:, 0]
data_x01 = data_01.iloc[:, 1:]

# 0.1% positives (999:1).
data_001 = making_imbalanced(0.001)
data_y001 = data_001.iloc[:, 0]
data_x001 = data_001.iloc[:, 1:]


0.10000196631698996
0.009998269634243937
0.0009985485578339682


In [24]:
# One [accuracy, roc_auc, f1, precision, recall] row per imbalanced_process call.
score_list=[]
def imbalanced_process(x,x_1,y,y_1,z):
    """Optionally resample the training split, fit a RandomForest, and score it.

    Parameters
    ----------
    x, y : training features and labels; only these are resampled.
    x_1, y_1 : evaluation features and labels, used as given.
    z : object exposing ``fit_resample(X, y)``, or ``0`` / ``None`` to train
        on the training split unchanged.

    Returns
    -------
    None. The five metrics (rounded to 5 decimals) are printed and appended
    to the module-level ``score_list``.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.ensemble import RandomForestClassifier

    if z is None or z == 0:
        X1, y1 = x, y
    else:
        X1, y1 = z.fit_resample(x, y)

    RF = RandomForestClassifier(random_state=0)
    RF.fit(X1, y1)
    y1_pred = RF.predict(x_1)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    y2_pred = RF.predict_proba(x_1)[:, 1]

    # Compute every metric once and reuse it for both the printout and the log.
    acc = round(accuracy_score(y_1, y1_pred), 5)
    auc = round(roc_auc_score(y_1, y2_pred), 5)
    f1 = round(f1_score(y_1, y1_pred), 5)
    prec = round(precision_score(y_1, y1_pred), 5)
    rec = round(recall_score(y_1, y1_pred), 5)

    print(f'accuracy_score : {acc}, roc_auc_score : {auc}, f1_score : {f1}, precision_score : {prec}, recall_score : {rec}')
    score_list.append([acc, auc, f1, prec, rec])

# 9:1 imbalanced

In [25]:
# Split the 9:1 data into training and evaluation sets (seeded, default test size).
X1_train, X1_test, y1_train, y1_test = train_test_split(
    data_x1,
    data_y1,
    random_state=0,
)
(X1_train.shape, X1_test.shape)

((152569, 29), (50857, 29))

### original

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: model performance on the 9:1 data without any resampling (z=0).
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=0,
)

accuracy_score : 0.89936, roc_auc_score : 0.67379, f1_score : 0.02663, precision_score : 0.30837, recall_score : 0.01391


### Under_Sampling

- Random Under-Sampler

In [None]:
# Evaluate seeded RandomUnderSampler on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=RandomUnderSampler(random_state=0),
)

accuracy_score : 0.6303, roc_auc_score : 0.68405, f1_score : 0.25748, precision_score : 0.16066, recall_score : 0.64798


- NearMiss-1

In [None]:
# Evaluate NearMiss (version 1) on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=NearMiss(version=1, n_jobs=-1),
)

accuracy_score : 0.23932, roc_auc_score : 0.43446, f1_score : 0.15874, precision_score : 0.08912, recall_score : 0.7255


- NearMiss-3

In [None]:
# Evaluate NearMiss (version 3) on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=NearMiss(version=3, n_jobs=-1),
)

accuracy_score : 0.56677, roc_auc_score : 0.65308, f1_score : 0.23023, precision_score : 0.13967, recall_score : 0.65494


- Condensed Nearest Neighbour

In [None]:
# Evaluate CondensedNearestNeighbour on the 9:1 split.
# random_state is fixed so the run is repeatable and configured the same way
# as the 99:1 CondensedNearestNeighbour cell.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X1_train, X1_test, y1_train, y1_test, CondensedNearestNeighbour(random_state=0,n_jobs=-1))

-  TomekLinks

In [None]:
# Evaluate TomekLinks (sampling_strategy='all') on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=TomekLinks(sampling_strategy='all', n_jobs=-1),
)

accuracy_score : 0.9009, roc_auc_score : 0.65114, f1_score : 0.01254, precision_score : 0.43836, recall_score : 0.00636


- Edited Nearest Neighbours

In [None]:
# Evaluate EditedNearestNeighbours (kind_sel="all", 5 neighbours) on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=EditedNearestNeighbours(kind_sel="all", n_neighbors=5, n_jobs=-1),
)

accuracy_score : 0.87333, roc_auc_score : 0.67781, f1_score : 0.17516, precision_score : 0.24613, recall_score : 0.13596


- RepeatedEditedNearestNeighbours

In [10]:
# Evaluate RepeatedEditedNearestNeighbours (library defaults) on the 9:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=RepeatedEditedNearestNeighbours(n_jobs=-1),
)



accuracy_score : 0.8679, roc_auc_score : 0.67461, f1_score : 0.18332, precision_score : 0.23599, recall_score : 0.14987


### Over_Sampling

- RandomOverSampler

In [None]:
# Evaluate seeded RandomOverSampler on the 9:1 split.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=RandomOverSampler(random_state=0),
)

accuracy_score : 0.89345, roc_auc_score : 0.67428, f1_score : 0.07066, precision_score : 0.2575, recall_score : 0.04095


- SMOTE

In [None]:
# Evaluate seeded SMOTE (5 nearest neighbours) on the 9:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on SMOTE
# in 0.10 and it appears to be absent from newer releases (confirm against the
# installed version). It only controlled neighbour-search parallelism, not the
# generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X1_train, X1_test, y1_train, y1_test, SMOTE(random_state=0,k_neighbors=5))

accuracy_score : 0.86507, roc_auc_score : 0.6619, f1_score : 0.16885, precision_score : 0.21612, recall_score : 0.13854


- BorderlineSMOTE-1

In [None]:
# Evaluate seeded BorderlineSMOTE (kind='borderline-1') on the 9:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on the
# SMOTE family in 0.10 and it appears to be absent from newer releases (confirm
# against the installed version). It only controlled neighbour-search
# parallelism, not the generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X1_train, X1_test, y1_train, y1_test, BorderlineSMOTE(kind = 'borderline-1',random_state=0))

accuracy_score : 0.86979, roc_auc_score : 0.66468, f1_score : 0.16851, precision_score : 0.22878, recall_score : 0.13337


- BorderlineSMOTE-2

In [None]:
# Evaluate seeded BorderlineSMOTE (kind='borderline-2') on the 9:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on the
# SMOTE family in 0.10 and it appears to be absent from newer releases (confirm
# against the installed version). It only controlled neighbour-search
# parallelism, not the generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X1_train, X1_test, y1_train, y1_test, BorderlineSMOTE(kind = 'borderline-2',random_state=0))

accuracy_score : 0.86562, roc_auc_score : 0.65901, f1_score : 0.16085, precision_score : 0.21041, recall_score : 0.13019


### Combine

- SMOTETomek

In [None]:
# Evaluate seeded SMOTETomek (sampling_strategy='all') on the 9:1 split.
from imblearn.combine import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=SMOTETomek(random_state=0, sampling_strategy='all', n_jobs=-1),
)

accuracy_score : 0.86385, roc_auc_score : 0.66451, f1_score : 0.17355, precision_score : 0.21721, recall_score : 0.1445


- SMOTE+ENN

In [None]:
# Evaluate seeded SMOTEENN on the 9:1 split.
from imblearn.combine import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X1_train,
    x_1=X1_test,
    y=y1_train,
    y_1=y1_test,
    z=SMOTEENN(random_state=0, n_jobs=-1),
)

accuracy_score : 0.82997, roc_auc_score : 0.67192, f1_score : 0.22898, precision_score : 0.20763, recall_score : 0.25522


### Ensemble

- BalanceCascade (run here with BalancedRandomForestClassifier)

In [11]:
# Fit a seeded BalancedRandomForestClassifier (100 trees) on the raw 9:1
# training split and report the same five metrics as imbalanced_process.
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
brf.fit(X1_train, y1_train)
y1_pred = brf.predict(X1_test)               # hard labels
y2_pred = brf.predict_proba(X1_test)[:, 1]   # column 1 scores, used for ROC-AUC
print(', '.join(
    f'{label} : {round(metric(y1_test, scores), 5)}'
    for label, metric, scores in (
        ('accuracy_score', accuracy_score, y1_pred),
        ('roc_auc_score', roc_auc_score, y2_pred),
        ('f1_score', f1_score, y1_pred),
        ('precision_score', precision_score, y1_pred),
        ('recall_score', recall_score, y1_pred),
    )
))

accuracy_score : 0.63124, roc_auc_score : 0.69057, f1_score : 0.26096, precision_score : 0.16274, recall_score : 0.65812


- EasyEnsemble

In [11]:
# Fit a seeded EasyEnsembleClassifier on the raw 9:1 training split and report
# the same five metrics as imbalanced_process.
from imblearn.ensemble import *
eec = EasyEnsembleClassifier(random_state=0)
eec.fit(X1_train, y1_train)
y1_pred = eec.predict(X1_test)               # hard labels
y2_pred = eec.predict_proba(X1_test)[:, 1]   # column 1 scores, used for ROC-AUC
print(', '.join(
    f'{label} : {round(metric(y1_test, scores), 5)}'
    for label, metric, scores in (
        ('accuracy_score', accuracy_score, y1_pred),
        ('roc_auc_score', roc_auc_score, y2_pred),
        ('f1_score', f1_score, y1_pred),
        ('precision_score', precision_score, y1_pred),
        ('recall_score', recall_score, y1_pred),
    )
))

accuracy_score : 0.62056, roc_auc_score : 0.69608, f1_score : 0.26023, precision_score : 0.1612, recall_score : 0.67462


# 99:1 imbalanced

In [15]:
# Split the 99:1 data into training and evaluation sets (seeded, default test size).
X01_train, X01_test, y01_train, y01_test = train_test_split(
    data_x01,
    data_y01,
    random_state=0,
)
(X01_train.shape, X01_test.shape)

((138699, 29), (46233, 29))

### original

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: model performance on the 99:1 data without any resampling (z=0).
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=0,
)

accuracy_score : 0.98942, roc_auc_score : 0.57057, f1_score : 0.0, precision_score : 0.0, recall_score : 0.0


### Under_Sampling

- Random Under-Sampler

In [None]:
# Evaluate seeded RandomUnderSampler on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=RandomUnderSampler(random_state=0),
)

accuracy_score : 0.60902, roc_auc_score : 0.66592, f1_score : 0.03378, precision_score : 0.01734, recall_score : 0.64754


- NearMiss-1

In [None]:
# Evaluate NearMiss (version 1) on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=NearMiss(version=1, n_jobs=-1),
)

accuracy_score : 0.07568, roc_auc_score : 0.47198, f1_score : 0.02045, precision_score : 0.01034, recall_score : 0.91393


- NearMiss-2

In [None]:
# Evaluate NearMiss (version 2) on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=NearMiss(version=2, n_jobs=-1),
)

accuracy_score : 0.02293, roc_auc_score : 0.57996, f1_score : 0.02102, precision_score : 0.01062, recall_score : 0.99385


- NearMiss-3

In [None]:
# Evaluate NearMiss (version 3) on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=NearMiss(version=3, n_jobs=-1),
)

accuracy_score : 0.47406, roc_auc_score : 0.61636, f1_score : 0.02689, precision_score : 0.01371, recall_score : 0.68852


- Condensed Nearest Neighbour

In [None]:
# Evaluate seeded CondensedNearestNeighbour on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=CondensedNearestNeighbour(random_state=0, n_jobs=-1),
)

- TomekLinks

In [None]:
# Evaluate TomekLinks (sampling_strategy='all') on the 99:1 split.
# The evaluation features are X01_test; the previous `X0p1_test` was an
# undefined name and raised NameError.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X01_train, X01_test, y01_train, y01_test, TomekLinks(sampling_strategy='all',n_jobs=-1))

accuracy_score : 0.98944, roc_auc_score : 0.54706, f1_score : 0.0, precision_score : 0.0, recall_score : 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- Edited Nearest Neighbours

In [None]:
# Evaluate EditedNearestNeighbours (kind_sel="all", 5 neighbours) on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=EditedNearestNeighbours(kind_sel="all", n_neighbors=5, n_jobs=-1),
)

accuracy_score : 0.98942, roc_auc_score : 0.56003, f1_score : 0.0, precision_score : 0.0, recall_score : 0.0


- RepeatedEditedNearestNeighbours

In [None]:
# Evaluate RepeatedEditedNearestNeighbours (library defaults) on the 99:1 split.
from imblearn.under_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=RepeatedEditedNearestNeighbours(n_jobs=-1),
)

accuracy_score : 0.98938, roc_auc_score : 0.57241, f1_score : 0.0, precision_score : 0.0, recall_score : 0.0


### Over_Sampling

- RandomOverSampler

In [None]:
# Evaluate seeded RandomOverSampler on the 99:1 split.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=RandomOverSampler(random_state=0),
)

accuracy_score : 0.98925, roc_auc_score : 0.59561, f1_score : 0.0, precision_score : 0.0, recall_score : 0.0


- SMOTE

In [None]:
# Evaluate seeded SMOTE (5 nearest neighbours) on the 99:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on SMOTE
# in 0.10 and it appears to be absent from newer releases (confirm against the
# installed version). It only controlled neighbour-search parallelism, not the
# generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X01_train, X01_test, y01_train, y01_test, SMOTE(random_state=0,k_neighbors=5))

accuracy_score : 0.98521, roc_auc_score : 0.6054, f1_score : 0.01724, precision_score : 0.02885, recall_score : 0.0123


- BorderlineSMOTE-1

In [None]:
# Evaluate seeded BorderlineSMOTE (kind='borderline-1') on the 99:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on the
# SMOTE family in 0.10 and it appears to be absent from newer releases (confirm
# against the installed version). It only controlled neighbour-search
# parallelism, not the generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X01_train, X01_test, y01_train, y01_test, BorderlineSMOTE(random_state=0,kind = 'borderline-1'))

accuracy_score : 0.98914, roc_auc_score : 0.59913, f1_score : 0.00791, precision_score : 0.11111, recall_score : 0.0041


- BorderlineSMOTE-2

In [None]:
# Evaluate seeded BorderlineSMOTE (kind='borderline-2') on the 99:1 split.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated it on the
# SMOTE family in 0.10 and it appears to be absent from newer releases (confirm
# against the installed version). It only controlled neighbour-search
# parallelism, not the generated samples.
from imblearn.over_sampling import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(X01_train, X01_test, y01_train, y01_test, BorderlineSMOTE(random_state=0,kind = 'borderline-2'))

accuracy_score : 0.98899, roc_auc_score : 0.6028, f1_score : 0.0078, precision_score : 0.08, recall_score : 0.0041


### Combine

- SMOTETomek

In [None]:
# Evaluate seeded SMOTETomek (sampling_strategy='all') on the 99:1 split.
from imblearn.combine import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=SMOTETomek(random_state=0, sampling_strategy='all', n_jobs=-1),
)

accuracy_score : 0.98508, roc_auc_score : 0.6096, f1_score : 0.01429, precision_score : 0.02358, recall_score : 0.01025


- SMOTE+ENN

In [None]:
# Evaluate seeded SMOTEENN on the 99:1 split.
from imblearn.combine import *
from sklearn.ensemble import RandomForestClassifier
imbalanced_process(
    x=X01_train,
    x_1=X01_test,
    y=y01_train,
    y_1=y01_test,
    z=SMOTEENN(random_state=0, n_jobs=-1),
)

accuracy_score : 0.98352, roc_auc_score : 0.61494, f1_score : 0.0155, precision_score : 0.02098, recall_score : 0.0123


### Ensemble

- BalanceCascade (run here with BalancedRandomForestClassifier)

In [None]:
# Fit a seeded BalancedRandomForestClassifier (100 trees) on the raw 99:1
# training split and report the same five metrics as imbalanced_process.
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
brf.fit(X01_train, y01_train)
y1_pred = brf.predict(X01_test)               # hard labels
y2_pred = brf.predict_proba(X01_test)[:, 1]   # column 1 scores, used for ROC-AUC
print(', '.join(
    f'{label} : {round(metric(y01_test, scores), 5)}'
    for label, metric, scores in (
        ('accuracy_score', accuracy_score, y1_pred),
        ('roc_auc_score', roc_auc_score, y2_pred),
        ('f1_score', f1_score, y1_pred),
        ('precision_score', precision_score, y1_pred),
        ('recall_score', recall_score, y1_pred),
    )
))

accuracy_score : 0.61887, roc_auc_score : 0.66543, f1_score : 0.0341, precision_score : 0.01752, recall_score : 0.6373


- EasyEnsemble

In [16]:
# Fit a seeded EasyEnsembleClassifier on the raw 99:1 training split and report
# the same five metrics as imbalanced_process.
from imblearn.ensemble import *
from sklearn.ensemble import RandomForestClassifier
eec = EasyEnsembleClassifier(random_state=0)
eec.fit(X01_train, y01_train)
y1_pred = eec.predict(X01_test)               # hard labels
y2_pred = eec.predict_proba(X01_test)[:, 1]   # column 1 scores, used for ROC-AUC
print(', '.join(
    f'{label} : {round(metric(y01_test, scores), 5)}'
    for label, metric, scores in (
        ('accuracy_score', accuracy_score, y1_pred),
        ('roc_auc_score', roc_auc_score, y2_pred),
        ('f1_score', f1_score, y1_pred),
        ('precision_score', precision_score, y1_pred),
        ('recall_score', recall_score, y1_pred),
    )
))

accuracy_score : 0.60418, roc_auc_score : 0.66624, f1_score : 0.03348, precision_score : 0.01719, recall_score : 0.64959
