<a href="https://www.kaggle.com/mikayil/ai4digigov-hackaton?scriptVersionId=88504023" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
import numpy as np
np.random.seed(420)
import pandas as pd
import matplotlib
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
import optuna
import re
from sklearn.utils import resample

# Default size (width, height) for every matplotlib figure in this notebook.
matplotlib.rcParams['figure.figsize'] = [20, 10]

# Development switch: when True, the loading cell trains on a random
# 500,000-row sample of the training data instead of the full file.
DEV = False

# Data stuff

## Loading

In [None]:
%%time

# Read the raw training data; in DEV mode keep only a random subset of rows.
train_df = pd.read_csv('queue_dataset_train.csv')
if DEV:
    train_df = train_df.sample(n=500000)

# Discard incomplete rows and renumber the remaining ones from zero.
train_df = train_df.dropna().reset_index(drop=True)

# The test data is loaded as-is (rows with missing values are kept).
test_df = pd.read_csv('queue_dataset_test.csv')

## Basic data stats

In [None]:
# Summary statistics of the training frame (shown as the cell output).
train_df.describe()

In [None]:
# Summary statistics of the test frame (shown as the cell output).
test_df.describe()

## Data cleaning

In [None]:
def one_hot_encode(df, columns):
    """Prepend one-hot indicator columns for ``columns`` to ``df``.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to expand with ``pd.get_dummies``.
            Each indicator column is prefixed with its source column name.

    Returns:
        A new DataFrame holding the indicator columns (in the order of
        ``columns``) followed by all original columns of ``df``. Every
        column label is truncated to at most 40 characters.
    """
    indicator_frames = [pd.get_dummies(df[name], prefix=name) for name in columns]

    # Indicators first, original columns last.
    combined = pd.concat(indicator_frames + [df], axis=1)

    return combined.rename(columns=lambda label: label[:40])

In [None]:
# if DEV:
#     train_df.head()

In [None]:
# Categorical columns that are expanded into one-hot indicator columns.
disc_cols = ['branch_name', 'customer_gender', 'customer_city', 'service_name_organization', 'service_name', 'service_name_2']

# Train and test are encoded independently, so the two frames can end up with
# different indicator columns when a category occurs in only one of them.
train_df = one_hot_encode(train_df, disc_cols)
test_df_clean = one_hot_encode(test_df, disc_cols)

In [None]:
# if DEV:
#     train_df.head()

In [None]:
def heatmap(df):
    """Draw a heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose ``corr()`` matrix is plotted.

    Returns:
        The correlation matrix that was plotted.
    """
    correlations = df.corr()

    sns.heatmap(correlations, cmap='RdYlGn', linewidths=0.2)

    # The current figure is fetched as in the original code; the handle is
    # not used afterwards.
    current_figure = plt.gcf()
    plt.show()

    return correlations

In [None]:
# %%time
# if DEV:
#     corr = heatmap(train_df)

In [None]:
# if DEV:
#     sorted_ids = np.argsort(np.abs(corr['service_canceled']))
#     sorted_ids.sort_values()[-10:]

### Clean age

In [None]:
def fix_ages(ages_raw, mean=None, std=None):
    """Clean raw age entries and standardise them to z-scores.

    Each entry is either a number or a string of integers joined by ``-``
    (for example ``"20-30"``), which is replaced by the average of its
    parts. Missing entries are filled with ``mean`` when it is given and
    with 30 otherwise.

    Args:
        ages_raw: Iterable of raw age entries.
        mean: Value used for filling missing entries and for centring. When
            ``None`` it is computed from the cleaned ages (after filling).
        std: Value used for scaling. When ``None`` it is computed from the
            cleaned ages.

    Returns:
        A tuple ``(scaled, mean, std)`` where ``scaled`` is a list with one
        ``(age - mean) / std`` value per input entry, and ``mean`` / ``std``
        are the statistics that were used (so they can be reused on another
        data set).
    """
    fill_value = 30 if mean is None else mean

    ages = []
    for row in tqdm(ages_raw):
        if isinstance(row, str):
            # Average the parts rather than "sum / 2", so that a string with
            # a single number ("25") keeps its value instead of being halved.
            age = float(np.mean([int(part) for part in row.split('-')]))
        else:
            age = row

        # pd.isna also recognises None, which np.isnan would reject.
        if pd.isna(age):
            age = fill_value

        ages.append(age)

    ages = np.array(ages, dtype=float)

    print(ages)

    if mean is None:
        mean = np.mean(ages)

    if std is None:
        std = np.std(ages)

    print(f"mean: {mean}, std: {std}")

    # Standardise all ages in one vectorised step.
    res = ((ages - mean) / std).tolist()

    return res, mean, std

In [None]:
# Standardise the age column. The first call computes mean/std from the
# training ages; the second call passes those same statistics in for the
# test ages.
train_df['customer_age_appl'], mean, std = fix_ages(train_df['customer_age_appl'])
test_df_clean['customer_age_appl'], mean, std = fix_ages(test_df_clean['customer_age_appl'], mean, std)

In [None]:
# if DEV:
#     corr = heatmap(train_df)

### Clean time

In [None]:
def fix_time(times_raw):
    """Convert ``HH:MM:SS[.fraction]`` strings to a fraction of a day.

    Non-string NaN entries are treated as ``"12:00:00.0"``. Anything after
    the decimal point of the seconds field is discarded.

    Args:
        times_raw: Iterable of time strings (or NaN for missing values).

    Returns:
        A NumPy array with, for each entry, the number of seconds since
        midnight divided by 86400.
    """
    seconds_since_midnight = []

    for stamp in tqdm(times_raw):
        is_text = isinstance(stamp, str)
        if not is_text and np.isnan(stamp):
            stamp = "12:00:00.0"

        fields = stamp.split(":")
        hours = int(fields[0])
        minutes = int(fields[1])
        # Keep only the whole seconds, dropping any fractional part.
        seconds = int(fields[2].split('.')[0])

        seconds_since_midnight.append(hours * 3600 + minutes * 60 + seconds)

    return np.array(seconds_since_midnight) / 86400

In [None]:
# Replace the start-time strings with their fraction-of-day value in both frames.
train_df['time_start_process'] = fix_time(train_df['time_start_process'])
test_df_clean['time_start_process'] = fix_time(test_df_clean['time_start_process'])

In [None]:
# if DEV:
#     corr = heatmap(train_df)

In [None]:
# train_df = train_df.dropna()

In [None]:
# Columns that exist in the cleaned test frame but not in the training frame.
cols_to_add = [col for col in test_df_clean.columns if col not in train_df.columns]

In [None]:
# Append every test-only column to the training frame, filled with zeros.
train_df = pd.concat([train_df, pd.DataFrame({col: np.zeros(len(train_df)) for col in cols_to_add})], axis=1)

### Clean date

In [None]:
def fix_date(df):
    """Parse the ``date`` column and add calendar feature columns.

    The frame is modified in place: ``date`` is converted from
    ``YYYY-MM-DD`` strings to datetimes, and the columns ``weekday``
    (0 = Monday), ``day``, ``year`` and ``month`` are added in that order.

    Args:
        df: DataFrame with a ``date`` column.

    Returns:
        The same ``df`` object, for convenience.
    """
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    stamps = list(df['date'])

    df['weekday'] = [stamp.weekday() for stamp in stamps]
    for part in ('day', 'year', 'month'):
        df[part] = [getattr(stamp, part) for stamp in stamps]

    return df

In [None]:
# Parse dates and add the weekday/day/year/month columns to both frames.
train_df = fix_date(train_df)
test_df_clean = fix_date(test_df_clean)

In [None]:
# One-hot encode the weekday number; the original 'weekday' column is kept
# alongside the new indicator columns.
train_df = one_hot_encode(train_df, ['weekday'])
test_df_clean = one_hot_encode(test_df_clean, ['weekday'])

### Organization name + service name + service name 2 

In [None]:
def get_column_coefs(df, column):
    """Return the relative frequency of each distinct value in ``df[column]``.

    Args:
        df: DataFrame to analyse.
        column: Name of the column whose values are counted.

    Returns:
        A dict mapping every unique value of the column to the share of
        rows of ``df`` that hold it (count / ``len(df)``). A missing value
        (NaN) never compares equal to itself, so it maps to 0.0. An empty
        frame yields an empty dict.
    """
    values = df[column]
    total = len(df)

    # The mask is built from ``df`` itself, so the function works for any
    # frame passed in and does not rely on a notebook-level variable.
    return {val: int((values == val).sum()) / total for val in values.unique()}

In [None]:
def service_popularity(df, service_name_organization_coefs=None, service_name_coefs=None, service_name_2_coefs=None):
    """Replace the three service columns by frequencies and add their product.

    For each of ``service_name_organization``, ``service_name`` and
    ``service_name_2`` the category values are replaced by the number found
    in the matching coefficient mapping. A mapping that is not supplied is
    computed from ``df`` itself with ``get_column_coefs`` (previously it was
    always computed from the notebook-level ``test_df``, regardless of the
    frame passed in). Values that have no entry in a mapping are left
    unchanged by ``DataFrame.replace``.

    Args:
        df: DataFrame containing the three service columns.
        service_name_organization_coefs: Optional ``{value: coefficient}``
            mapping for ``service_name_organization``.
        service_name_coefs: Optional mapping for ``service_name``.
        service_name_2_coefs: Optional mapping for ``service_name_2``.

    Returns:
        A tuple ``(df, service_name_organization_coefs, service_name_coefs,
        service_name_2_coefs)``: the transformed frame, which additionally
        has a ``popularity_coeff`` column (product of the three replaced
        columns), and the mappings that were applied so they can be reused
        on another frame.
    """
    # Each mapping is fitted on the still-untouched column before that
    # column is replaced.
    if service_name_organization_coefs is None:
        service_name_organization_coefs = get_column_coefs(df, 'service_name_organization')
    df = df.replace({'service_name_organization': service_name_organization_coefs})

    if service_name_coefs is None:
        service_name_coefs = get_column_coefs(df, 'service_name')
    df = df.replace({'service_name': service_name_coefs})

    if service_name_2_coefs is None:
        service_name_2_coefs = get_column_coefs(df, 'service_name_2')
    df = df.replace({'service_name_2': service_name_2_coefs})

    df['popularity_coeff'] = df.service_name * df.service_name_2 * df.service_name_organization

    return df, service_name_organization_coefs, service_name_coefs, service_name_2_coefs

In [None]:
# Add the popularity feature. The mappings returned by the first call are
# passed into the second call so the test frame uses the same coefficients.
train_df, service_name_organization_norm, service_name_norm, service_name_2_norm = service_popularity(train_df)
test_df_clean, service_name_organization_norm, service_name_norm, service_name_2_norm = service_popularity(test_df_clean, service_name_organization_norm, service_name_norm, service_name_2_norm)

# Model

In [None]:
# Columns excluded from the model inputs: the row id, the raw categorical and
# date columns, and the intermediate calendar columns.
drop_cols = ['id', 'branch_name', 'customer_gender', 'customer_city', 'service_name_organization', 'service_name', 'service_name_2', 'date', 'weekday', 'day', 'month', 'year']

# Random seed used for the train/validation split.
seed = 420

# Training feature matrix: everything except the target and the dropped columns.
train_df_ = train_df.drop(['service_canceled'] + drop_cols, axis=1)

In [None]:
# Remove every character other than letters, digits and underscores from the
# column names of both frames.
train_df_ = train_df_.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train_df = train_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# Hold out 20% of the rows as a validation set (used by the Optuna objective).
x_train, x_test, y_train, y_test = train_test_split(train_df_, train_df['service_canceled'],
                                                    test_size=0.2,
                                                    random_state=seed)

## Parameter Tuning with Optuna

In [None]:
def LightGBM(params, X_train_adv, X_valid_adv, y_train_adv, y_valid_adv):
    """Train a LightGBM booster and return its validation ROC AUC.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        X_train_adv: Training features.
        X_valid_adv: Validation features.
        y_train_adv: Training labels.
        y_valid_adv: Validation labels.

    Returns:
        The ROC AUC of the booster's predictions on the validation set
        (also printed).
    """
    train_set = lgb.Dataset(X_train_adv, y_train_adv)
    valid_set = lgb.Dataset(X_valid_adv, y_valid_adv, reference=train_set)

    # Both sets are monitored during training.
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=100,
        early_stopping_rounds=100,
    )

    # Score the validation rows with the best iteration found.
    valid_scores = booster.predict(X_valid_adv, num_iteration=booster.best_iteration)

    auc = roc_auc_score(y_valid_adv, valid_scores)
    print('ROC AUC Score of LightGBM =', auc)
    return auc

In [None]:
def objective(trial):
    """Optuna objective: sample LightGBM parameters and return validation AUC.

    Args:
        trial: Optuna trial used to draw the tunable hyper-parameters.

    Returns:
        The ROC AUC obtained by ``LightGBM`` on the notebook-level
        ``x_train`` / ``x_test`` / ``y_train`` / ``y_test`` split.
    """
    # Settings shared by every trial.
    fixed_settings = {
        'task': 'train',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'seed': 420,
        'metric': 'AUC',
        'is_unbalance': True,
    }

    # Hyper-parameters drawn by the trial (same order as before).
    sampled_settings = {
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.1, 0.9),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 0.9),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
    }

    # Run on the first GPU of the first platform.
    gpu_settings = {
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }

    trial_params = {**fixed_settings, **sampled_settings, **gpu_settings}

    return LightGBM(trial_params, x_train, x_test, y_train, y_test)

In [None]:
%%time
#from optuna.samplers import TPESampler
#study = optuna.create_study(direction = 'maximize', sampler = TPESampler(seed=420))
#study.optimize(objective, n_trials = 50)

In [None]:
# LightGBM parameters used by the final cross-validated training run below.
# NOTE(review): the tuned values presumably come from an earlier Optuna study
# (the study cell above is commented out) — confirm before re-tuning.
params = {'lambda_l1': 0.12316592750974795, 'lambda_l2': 0.8462662628054513, 'num_leaves': 171, 
          'feature_fraction': 0.5759943289441473, 'bagging_fraction': 0.703925954466332,
          'bagging_freq': 5, 'min_child_samples': 25, 'learning_rate': 0.0965962567977748, 
         'task': 'train',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'seed': 420,
        'metric': 'AUC',
        'is_unbalance':True,'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0}

## **Install GPU-enabled LightGBM**

In [None]:
%%time
# Remove the LightGBM package directory from the conda site-packages and
# fetch the LightGBM sources (including submodules) for a rebuild.
# NOTE(review): the path is hard-coded to python3.6 — confirm it matches the
# interpreter this notebook runs on.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%time
# Install the Boost development libraries non-interactively and quietly.
!apt-get install -y -qq libboost-all-dev

In [None]:
%%time
%%bash
# Build LightGBM from the cloned sources in a fresh build directory.
cd LightGBM
# Remove any previous build directory (reports an error if none exists).
rm -r build
mkdir build
cd build
# Configure with GPU support enabled, pointing CMake at the OpenCL library
# and headers under /usr/local/cuda.
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one job per available processor.
make -j$(nproc)

In [None]:
%%time
# Install the Python package from the cloned sources.
# NOTE(review): --precompile presumably makes setup.py reuse the library built
# in the previous cell instead of compiling again — confirm against the
# LightGBM version being cloned.
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
%%time
# Write the NVIDIA OpenCL library name into an ICD file under
# /etc/OpenCL/vendors (creating the directory if needed), then delete the
# cloned source tree.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

## Train

In [None]:
%%time

# model = get_model(study.best_params, 30)

# Build the test feature matrix. ``drop`` accepts an empty list, so
# ``test_df_`` is defined even when ``drop_cols`` is empty.
test_df_ = test_df_clean.drop(drop_cols, axis=1)
# Give the test features the same treatment as the training features: strip
# special characters from the column names, then match the training column
# set and order. Indicator columns that exist only in training are filled
# with 0 (category absent).
test_df_ = test_df_.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
test_df_ = test_df_.reindex(columns=train_df_.columns, fill_value=0)

# Accumulator for the fold-averaged test predictions.
preds = np.zeros(test_df_.shape[0])
kf = StratifiedKFold(n_splits=10, shuffle=True)
# Unused placeholders, kept so the notebook-level names still exist.
rmse = []
n = 0
for trn_idx, test_idx in kf.split(train_df_, train_df['service_canceled']):
    # Split features and target into this fold's training and validation parts.
    X_tr, X_val = train_df_.iloc[trn_idx], train_df_.iloc[test_idx]
    y_tr, y_val = train_df['service_canceled'].iloc[trn_idx], train_df['service_canceled'].iloc[test_idx]
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_valid = lgb.Dataset(X_val, y_val, reference=lgb_train)
    # Training
    model3 = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=200,
        early_stopping_rounds=100
    )
    # Each fold contributes an equal share to the final prediction.
    preds += model3.predict(test_df_) / kf.n_splits

## Prediction

In [None]:
# Show the test-set predictions averaged over the cross-validation folds.
preds

In [None]:
#y_pred_test = model2.predict_proba(test_df_)
#y_scores_test = y_pred_test[:, 1]

#test_df['service_canceled'] = preds
#test_df[["id", "service_canceled"]].to_csv("/kaggle/working/submission.csv", index=False) == submission(10)

# Comparative method: blending previous submissions

In [None]:
# Earlier submissions to blend: an identifier, the score recorded for each
# one, and the path of its CSV file.
dfk = pd.DataFrame({
    'Kernel ID': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
    'Score': [0.72308, 0.71875, 0.72206, 0.72289, 0.72339, 0.72342, 0.72289, 0.72316],
    'File Path': ['../input/comparative/submission (7).csv',
                  '../input/comparative/submission (6).csv',
                  '../input/comparative2/submission (5).csv',
                  '../input/comparative/submission (8).csv',
                  '../input/comparative/submissionEnsenbled.csv',
                  '../input/comparative/submissionEnsenbled2.csv',
                  '../input/comparative/submission (9).csv',
                  '../input/comparative/submission (10).csv'],
})

# Order from lowest to highest score and renumber the rows. The result of
# reset_index is now assigned; previously it was computed and thrown away.
dfk = dfk.sort_values('Score').reset_index(drop=True)


def generate(main, support, coeff):
    """Blend two submission frames column by column.

    Every column of ``main`` except the first is replaced by
    ``main * coeff + support * (1 - coeff)``, combining rows by position
    (index labels are ignored). The first column is copied from ``main``
    unchanged.

    Args:
        main: DataFrame whose values get weight ``coeff``.
        support: DataFrame whose values get weight ``1 - coeff``. It must
            contain the blended columns and at least as many rows as
            ``main``; extra rows are ignored.
        coeff: Weight given to ``main``.

    Returns:
        A new DataFrame shaped like ``main``; neither input is modified.

    Raises:
        IndexError: If ``support`` has fewer rows than ``main``.
    """
    n_rows = len(main)
    if len(support) < n_rows:
        raise IndexError("support has fewer rows than main")

    blended = main.copy()
    for column in main.columns[1:]:
        main_values = main[column].to_numpy()
        # Positional pairing: only the first n_rows support rows take part.
        support_values = support[column].to_numpy()[:n_rows]
        blended[column] = (main_values * coeff) + (support_values * (1. - coeff))

    return blended

# Start from the first submission listed in the table.
support = pd.read_csv(dfk.iloc[0, 2])

# Fold in every remaining submission in table order: each newly read file
# gets weight 0.60 and the running blend gets 0.40. Iterating over the table
# rows (instead of a fixed range) keeps this correct if entries are added
# or removed.
for path in dfk.iloc[1:, 2]:
    main = pd.read_csv(path)
    support = generate(main, support, 0.60)

sub = support


# Write the final blend without the index column.
sub.to_csv("/kaggle/working/submissionEnsenbled5.csv", index=False)