In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change into the contest folder on the mounted Drive. The path is relative, so the
# cell must be run from the directory that contains `drive`; the data path used
# by the loading cell is resolved against this working directory.
%cd drive/My\ Drive/PRML-2020/Data\ Contest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from tqdm.notebook import tqdm
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

In [None]:
# Read the training table and discard the saved index column and the host
# identifier in one chained expression, then show the resulting frame.
data = (
    pd.read_csv('./Required data/train_complete.csv')
    .drop(columns=['Unnamed: 0', 'host_id'])
)
display(data)

# Scoring and cross-validation functions

In [None]:
def MAP_Score(y_ord, y_pred_ord):
    """Mean average precision (MAP) over a collection of ranked lists.

    Parameters
    ----------
    y_ord : sequence of sequences
        For each query, the identifiers of the relevant items.
    y_pred_ord : sequence of sequences
        For each query, the predicted identifiers ordered from best to worst.

    Returns
    -------
    float
        Mean of the per-query average precisions. A query without any relevant
        item contributes 0.0 (instead of raising ZeroDivisionError), and 0.0 is
        returned when there are no queries at all.
    """
    MAP = []
    for y_, y_pred in zip(y_ord, y_pred_ord):
        # Set lookup keeps the membership test O(1) per ranked item.
        relevant = set(y_)
        if not relevant:
            # Nothing can be retrieved correctly for this query.
            MAP.append(0.0)
            continue

        cor = 0
        score = 0.0
        for rank, item in enumerate(y_pred, start=1):
            if item in relevant:
                cor += 1
                # Precision at this rank, accumulated only at relevant hits.
                score += cor / rank
        MAP.append(score / len(y_))

    return np.mean(MAP) if MAP else 0.0

# Sanity check: the relevant items 2 and 4 are ranked 1st and 4th in `b`,
# so the average precision is (1/1 + 2/4) / 2 = 0.75.
a = [[2,4]]
b = [[2,3,5,4,1]]
MAP_Score(a, b)

In [None]:
def cross_validation(model, data, cls_wgt, n_splits=5, base_samples=9000, random_state=None):
    """Estimate the ranking quality of `model` with biker-grouped K-fold CV.

    Bikers (not rows) are split into folds, so all rows of one biker fall on the
    same side of every split. In each fold the training rows are oversampled
    with BorderlineSMOTE, standardised, and the model is fitted on the `like`
    label. Each validation biker's rows are then ranked by predicted
    like-probability and the ranking is scored with `MAP_Score`.

    Parameters
    ----------
    model : estimator
        Classifier exposing `fit` and `predict_proba`; it is re-fitted on every fold.
    data : pandas.DataFrame
        Must contain `biker_id`, `tour_id`, `like` and `dislike`. Every other
        column is used as a feature. The frame is not modified.
    cls_wgt : float
        Ratio of class-1 to class-0 samples requested from the oversampler.
    n_splits : int, default 5
        Number of folds over the unique bikers.
    base_samples : int, default 9000
        Number of class-0 samples requested from the oversampler; class 1 is
        asked for `int(base_samples * cls_wgt)`.
        NOTE(review): imbalanced-learn over-samplers are expected to reject a
        target below the current class count - confirm against the fold sizes.
    random_state : int or None, default None
        Seed forwarded to the biker shuffle and to BorderlineSMOTE. `None`
        keeps the previous non-seeded behaviour.

    Returns
    -------
    float
        Mean of the per-fold validation MAP scores.
    """
    all_ids = shuffle(data['biker_id'].unique(), random_state=random_state)
    biker_col = data['biker_id']
    non_feature_cols = ['biker_id', 'tour_id', 'like', 'dislike']
    MAP_score = []

    kfold = KFold(n_splits=n_splits)
    for _, v_indx in kfold.split(all_ids):
        val_ids = all_ids[v_indx]

        # One vectorised membership test replaces a per-biker scan of the frame.
        # Rows keep their original relative order inside each biker.
        is_val = biker_col.isin(val_ids).to_numpy()
        train_data = data.loc[~is_val]
        val_data = data.loc[is_val]

        X_train = train_data.drop(columns=non_feature_cols).to_numpy()
        y_train = train_data['like'].to_numpy()
        X_val = val_data.drop(columns=non_feature_cols).to_numpy()
        y_val = val_data['like'].to_numpy()
        val_bikers = val_data['biker_id'].to_numpy()

        oversample = BorderlineSMOTE(
            sampling_strategy={0: base_samples, 1: int(base_samples * cls_wgt)},
            random_state=random_state,
        )
        X_train, y_train = oversample.fit_resample(X_train, y_train)

        # The scaler is fitted on the resampled training fold only and then
        # applied to the untouched validation fold.
        sclr = StandardScaler()
        X_train_scld = sclr.fit_transform(X_train)
        X_val_scld = sclr.transform(X_val)

        model.fit(X_train_scld, y_train)
        # Column 1 holds the probability of the positive (`like`) class.
        like_prob = model.predict_proba(X_val_scld)[:, 1]

        y_val_ord = []
        y_val_pred_ord = []
        for biker in val_ids:
            rows = np.where(val_bikers == biker)[0]
            # Positions (within this biker's rows) that were actually liked.
            y_val_ord.append(np.where(y_val[rows])[0].tolist())
            # Same local positions, ordered by descending predicted probability.
            y_val_pred_ord.append(np.flip(np.argsort(like_prob[rows])))

        MAP_score.append(MAP_Score(y_val_ord, y_val_pred_ord))

    return np.mean(MAP_score)

# XGBoost with 50 trees

In [None]:
# Exhaustive grid search for XGBoost with a fixed number of trees. Every
# combination is scored with `cross_validation`; the score goes into
# `MAP_score` and the configuration into `features`, keyed by str(score).
MAP_score = []
features = {}

n_est = 50
for cls_wgt in tqdm(np.linspace(0.5, 1.5, 5)):
    for lr in np.linspace(0.1, 0.5, 3):
        for max_features in np.linspace(0.5, 0.9, 5):
            for max_samples in np.linspace(0.5, 0.9, 5):
                for max_depth in np.linspace(10, 50, 5, dtype=np.int32):

                    print('===========================================================================')
                    clf = XGBClassifier(n_estimators=n_est, colsample_bytree=max_features,
                                        subsample=max_samples, max_depth=max_depth, 
                                        learning_rate=lr)
                    score = cross_validation(clf, data, cls_wgt)
                    MAP_score.append(score)
                    # The learning rate is part of the searched grid, so it is stored as
                    # well (appended last to keep the existing positions unchanged).
                    features[str(score)] = [cls_wgt, n_est, max_depth, max_features, max_samples, lr]
                    print('Class weight : {0:.1f}, Num estimators : {1}, Max depth : {2}, Max features : {3:.2f}, Max samples : {4:.2f}, Learning rate : {5:.1f}, MAP score : {6:.3f}'
                        .format(cls_wgt, n_est, max_depth, max_features, max_samples, lr, score))

In [None]:
# Pick the highest MAP recorded in `MAP_score` and fetch the configuration stored
# under its string key in `features`; the final tuple is shown as the cell output.
# Configurations with an identical score share one key, so only the last one is kept.
max_map = max(MAP_score)
feat = features[str(max_map)]
max_map, feat

# LightGBM with 50 trees

In [None]:
# Exhaustive grid search for LightGBM with a fixed number of trees. Every
# combination is scored with `cross_validation`; the score goes into
# `MAP_score` and the configuration into `features`, keyed by str(score).
# NOTE(review): `num_leaves` stays at its default, which limits tree size
# independently of `max_depth` - confirm the depth grid has the intended effect.
MAP_score = []
features = {}

n_est = 50
for cls_wgt in tqdm(np.linspace(0.5, 1.5, 5)):
    for lr in np.linspace(0.1, 0.5, 3):
        for max_features in np.linspace(0.5, 0.9, 5):
            for max_samples in np.linspace(0.5, 0.9, 5):
                for max_depth in np.linspace(10, 50, 5, dtype=np.int32):

                    print('===========================================================================')
                    # Scikit-learn API names are used instead of the native aliases
                    # (`num_iterations`, `feature_fraction`). `subsample_freq=1` turns
                    # bagging on; with the default of 0 the `subsample` value is ignored
                    # and the `max_samples` loop would repeat identical fits.
                    clf = LGBMClassifier(n_estimators=n_est, colsample_bytree=max_features,
                                         subsample=max_samples, subsample_freq=1,
                                         max_depth=max_depth, learning_rate=lr)
                    score = cross_validation(clf, data, cls_wgt)
                    MAP_score.append(score)
                    # The learning rate is part of the searched grid, so it is stored as
                    # well (appended last to keep the existing positions unchanged).
                    features[str(score)] = [cls_wgt, n_est, max_depth, max_features, max_samples, lr]
                    print('Class weight : {0:.1f}, Num estimators : {1}, Max depth : {2}, Max features : {3:.2f}, Max samples : {4:.2f}, Learning rate : {5:.1f}, MAP score : {6:.3f}'
                        .format(cls_wgt, n_est, max_depth, max_features, max_samples, lr, score))

In [None]:
# Pick the highest MAP recorded in `MAP_score` and fetch the configuration stored
# under its string key in `features`; the final tuple is shown as the cell output.
# Configurations with an identical score share one key, so only the last one is kept.
max_map = max(MAP_score)
feat = features[str(max_map)]
max_map, feat

# CatBoost with 50 trees

In [None]:
# Exhaustive grid search for CatBoost with a fixed number of trees. Every
# combination is scored with `cross_validation`; the score goes into
# `MAP_score` and the configuration into `features`, keyed by str(score).
MAP_score = []
features = {}

n_est = 50
for cls_wgt in tqdm(np.linspace(0.5, 1.5, 5)):
    for lr in np.linspace(0.1, 0.5, 3):
        for max_features in np.linspace(0.5, 0.9, 5):
            for max_samples in np.linspace(0.5, 0.9, 5):
                # CatBoost limits tree depth to 16, so the 10..50 grid used for the
                # other libraries is replaced by 8, 10, 12, 14, 16.
                for max_depth in np.linspace(8, 16, 5, dtype=np.int32):

                    print('===========================================================================')
                    # verbose=0 silences the per-iteration training log, which would
                    # otherwise be printed for every fold of every configuration.
                    clf = CatBoostClassifier(iterations=n_est, rsm=max_features,
                                             subsample=max_samples, depth=max_depth, 
                                             learning_rate=lr, verbose=0)
                    score = cross_validation(clf, data, cls_wgt)
                    MAP_score.append(score)
                    # The learning rate is part of the searched grid, so it is stored as
                    # well (appended last to keep the existing positions unchanged).
                    features[str(score)] = [cls_wgt, n_est, max_depth, max_features, max_samples, lr]
                    print('Class weight : {0:.1f}, Num estimators : {1}, Max depth : {2}, Max features : {3:.2f}, Max samples : {4:.2f}, Learning rate : {5:.1f}, MAP score : {6:.3f}'
                        .format(cls_wgt, n_est, max_depth, max_features, max_samples, lr, score))

In [None]:
# Pick the highest MAP recorded in `MAP_score` and fetch the configuration stored
# under its string key in `features`; the final tuple is shown as the cell output.
# Configurations with an identical score share one key, so only the last one is kept.
max_map = max(MAP_score)
feat = features[str(max_map)]
max_map, feat