# 1) Importing libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features
import warnings  # np.warnings was only an alias of this module and is absent from recent NumPy releases

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
import missingno as msno

In [None]:
from pandasql import sqldf

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
from sklearn.model_selection import KFold

In [None]:
import dask.dataframe as dd

In [None]:
# Seed NumPy's global random generator with a fixed value.
np.random.seed(51)

In [None]:
# Lift pandas' display limits so frames are shown with all columns and all rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# 2) Reading train_metadata_kaggle & test_metadata_kaggle

In [None]:
%%time
# Read each metadata CSV through dask and immediately compute it into memory.
train_metadata_kaggle = dd.read_csv('klm_train.csv').compute()
test_metadata_kaggle = dd.read_csv('klm_test.csv').compute()
print(train_metadata_kaggle.shape, test_metadata_kaggle.shape)

In [None]:
# Keep the test object ids aside; they are concatenated back onto the predictions
# when the submission frame is assembled.
test_id = test_metadata_kaggle['object_id']

In [None]:
# Sanity check: shapes of both metadata frames.
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

In [None]:
# Preview the first rows of the test metadata.
test_metadata_kaggle.head()

# 3) Reading train_metadata_final & test_metadata_final

In [None]:
"""%%time
train_metadata = dd.read_csv('train_metadata_final.csv')
test_metadata = dd.read_csv('test_metadata_final.csv')
train_metadata = train_metadata.compute()
test_metadata = test_metadata.compute()
print(train_metadata.shape,test_metadata.shape)"""

In [None]:
"""y = train_metadata['target']
del train_metadata['target']
gc.collect()"""

# 4) Reading created_column

In [None]:
# Read with header=None, so the columns carry integer labels until they are renamed below.
created_column = pd.read_csv('hostgal_specz.csv',header=None)

In [None]:
created_column.head()

In [None]:
# True when created_column has as many rows as the train and test metadata combined.
created_column.shape[0] == train_metadata_kaggle.shape[0] + test_metadata_kaggle.shape[0]

In [None]:
# Name the two columns so the frame can be merged on 'object_id'.
created_column.columns = ['object_id','hostgal_specz']

In [None]:
print(train_metadata_kaggle.shape)

# 5) Removing duplicated_columns from train_metadata & test_metadata

In [None]:
# Column names to drop from train_metadata / test_metadata.
# NOTE(review): in this notebook the only consumer is the drop cell that is
# currently disabled (wrapped in a string literal).
will_be_deleted_columns = [
    # per-object flux statistics
    'NG_min_flux',
    'NG_max_flux',
    'NG_std_flux',
    'NG_mean_flux',
    'NG_median_flux',
    # per-object flux-error statistics
    'NG_min_flux_err',
    'NG_max_flux_err',
    'NG_std_flux_err',
    'NG_mean_flux_err',
    'NG_median_flux_err',
    # remaining columns
    'NG_mean_detected',
    'hostgal_photoz',
    'hostgal_photoz_err',
    'mwebv',
]

In [None]:
"""%%time
train_metadata.drop(will_be_deleted_columns,inplace=True , axis=1)
test_metadata.drop(will_be_deleted_columns,inplace=True , axis=1)
gc.collect()"""

In [None]:
# Sanity check: shapes of both metadata frames.
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

In [None]:
# Preview the first rows of the training metadata.
train_metadata_kaggle.head()

In [None]:
#for i,j in enumerate(list(train_metadata.columns.values)):
#    print(i,j)

# 6) Adding hostgal_specz to train_metadata_kaggle

In [None]:
# Disabled cell: the merge is kept for reference inside a string literal and does not run.
# The literal is now closed; it was previously left open, which is a SyntaxError.
"""%%time
train_metadata_kaggle = train_metadata_kaggle.merge(right = created_column ,how='left',on = 'object_id')"""

# 7) Adding hostgal_specz to test_metadata_kaggle

In [None]:
"""%%time
test_metadata_kaggle = test_metadata_kaggle.merge(right = created_column ,how='left',on = 'object_id')"""

In [None]:
# Sanity check: shapes of both metadata frames.
print(train_metadata_kaggle.shape,test_metadata_kaggle.shape)

# 8) Creating train and test datasets

In [None]:
# Column names to leave out when merging in the extra metadata.
# NOTE(review): the merge lines that use this list are commented out in this notebook.
ignored_columns = [
    'ra',
    'decl',
    'gal_l',
    'gal_b',
    'ddf',
    'hostgal_specz',
    'distmod',
]

In [None]:
#temp = train_metadata_kaggle.merge(right = train_metadata[[x for x in train_metadata.columns if x not in ignored_columns ]] ,how='left',on = 'object_id')

In [None]:
"""#train
train_metadata_kaggle['flux_diff_btw_mean_and_min'] = train_metadata_kaggle['flux_mean'] - train_metadata_kaggle['flux_min']
train_metadata_kaggle['flux_diff_btw_mean_and_max'] = train_metadata_kaggle['flux_max'] - train_metadata_kaggle['flux_mean']
train_metadata_kaggle['flux_std_divided_by_skew'] = train_metadata_kaggle['flux_std'] / train_metadata_kaggle['flux_skew']

#test
test_metadata_kaggle['flux_diff_btw_mean_and_min'] = test_metadata_kaggle['flux_mean'] - test_metadata_kaggle['flux_min']
test_metadata_kaggle['flux_diff_btw_mean_and_max'] = test_metadata_kaggle['flux_max'] - test_metadata_kaggle['flux_mean']
test_metadata_kaggle['flux_std_divided_by_skew'] = test_metadata_kaggle['flux_std'] / test_metadata_kaggle['flux_skew']"""

In [None]:
# Work on a copy so train_metadata_kaggle itself is left untouched.
temp = train_metadata_kaggle.copy()

In [None]:
# Target labels used for training.
y = temp['target']

In [None]:
# Remove the label column from the feature frame.
del temp['target']

In [None]:
print(temp.shape)

In [None]:
temp.head()

# 9) Writing helpful functions

In [None]:
# Distinct target labels in ascending order.
classes = sorted(y.unique())

# Per-class weights: 1 everywhere except classes 64 and 15, which get 2.
# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weight = dict.fromkeys(classes, 1)
class_weight.update({64: 2, 15: 2})

print('Unique classes : ', classes)

# Set the object ids aside and remove them from the feature frame in one step.
train_id = temp.pop('object_id')

In [None]:
# Per-class sample weight: total number of rows divided by the number of rows in that class.
w = y.value_counts()
weights = dict(zip(w.index, np.sum(w) / w.values))

def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. When more than 14 distinct labels are present, a 15th
        class (99, weight 2) is added to the weight table.
    y_preds : array-like of shape (n_samples, n_classes)
        Predicted probabilities, one column per class in ascending class order.

    Returns
    -------
    float
        Weighted multi-class log loss, averaged over the classes that actually
        occur in ``y_true``.

    Raises
    ------
    ValueError
        If the one-hot encoded labels and ``y_preds`` do not have the same shape.
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    labels = np.unique(y_true)
    if len(labels) > 14:
        class_weight[99] = 2
    classes = sorted(class_weight)
    class_arr = np.array([class_weight[k] for k in classes], dtype=float)

    # Transform y_true in dummies
    y_ohe = pd.get_dummies(y_true).astype(float)
    # When the labels are class ids from the weight table, add all-zero columns for
    # the classes missing from y_true (e.g. a fold without a rare class) so the
    # columns line up with y_preds. Other label sets (such as 0..k-1 encodings)
    # keep the original positional matching.
    if np.isin(labels, classes).all():
        y_ohe = y_ohe.reindex(columns=classes, fill_value=0.0)

    # Limit y_preds to [1e-15, 1 - 1e-15] so the log stays finite
    y_p = np.clip(a=np.asarray(y_preds, dtype=float), a_min=1e-15, a_max=1 - 1e-15)
    if y_p.shape != y_ohe.shape:
        raise ValueError(
            'y_preds has shape %s but the one-hot encoded labels have shape %s'
            % (y_p.shape, y_ohe.shape)
        )
    # Transform to log
    y_p_log = np.log(y_p)
    # Sum the log-probability of the true class, per class;
    # .values is used to drop the index of DataFrames
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    # Get the number of positives for each class
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Classes without any positive sample are left out of both the weighted sum
    # and the normalisation, which avoids a division by zero.
    present = nb_pos > 0
    # Weight average and divide by the number of positives
    y_w = y_log_ones[present] * class_arr[present] / nb_pos[present]

    loss = - np.sum(y_w) / np.sum(class_arr[present])
    return loss


def lgb_multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Evaluation-metric variant: ``y_preds`` arrives as a flat array that is
    reshaped column-major to (n_samples, n_classes), and the result is the
    tuple ``('wloss', loss, False)``.

    The class weights are applied by position (ascending class-id order), so
    the labels in ``y_true`` only need to sort into that same order.
    NOTE(review): presumably LightGBM passes encoded labels here - confirm.
    """
    # class_weights taken from Giba's topic : https://www.kaggle.com/titericz
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
    # with Kyle Boone's post https://www.kaggle.com/kyleboone
    weight_by_class = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    n_classes = 14
    if len(np.unique(y_true)) > 14:
        # A 15th label is present: account for class 99 with weight 2.
        weight_by_class[99] = 2
        n_classes = 15

    # Rebuild the probability matrix (column-major) and keep it inside
    # [1e-15, 1 - 1e-15] so the log stays finite.
    proba = y_preds.reshape(y_true.shape[0], n_classes, order='F')
    proba = np.clip(a=proba, a_min=1e-15, a_max=1 - 1e-15)

    # One-hot encode the labels; .values drops the DataFrame index.
    one_hot = pd.get_dummies(y_true)
    # Per class: summed log-probability of the true class, and number of positives.
    log_sum_per_class = (one_hot.values * np.log(proba)).sum(axis=0)
    positives = one_hot.sum(axis=0).values.astype(float)

    # Weighted per-class mean log-probability, normalised by the total weight.
    weight_vec = np.array([weight_by_class[k] for k in sorted(weight_by_class)])
    weighted = log_sum_per_class * weight_vec / positives
    return 'wloss', - np.sum(weighted) / np.sum(weight_vec), False

# 10) Defining hyperparameters

In [None]:
%%time
# 5-fold stratified, shuffled split with a fixed seed.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
# Filled by the training loop: fitted models and per-fold feature importances.
clfs = []
importances = pd.DataFrame()
# Keyword arguments handed to lgb.LGBMClassifier.
# NOTE(review): drop_rate / max_drop / skip_drop / uniform_drop / xgboost_dart_mode
# look like DART-only settings while boosting_type is 'gbdt' - presumably ignored; confirm.
lgb_params = dict(
    random_state=51,
    device='cpu',
    objective='multiclass',
    num_class=14,
    boosting_type='gbdt',
    n_jobs=-1,
    max_depth=7,
    n_estimators=500,
    subsample_freq=2,
    subsample_for_bin=5000,
    min_data_per_group=100,
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0267,
    max_drop=5,
    min_child_samples=10,
    min_child_weight=100.0,
    min_split_gain=0.1,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
)

In [None]:
# Per-class sample weight: total number of rows divided by the number of rows in that class.
w = y.value_counts()
weights = dict(zip(w.index, np.sum(w) / w.values))

# Out-of-fold probabilities: one row per training sample, one column per distinct label.
oof_preds = np.zeros((temp.shape[0], len(np.unique(y))))

# 11) Training

In [None]:
%%time
# Train one LightGBM classifier per fold, store its out-of-fold predictions and
# feature importances, and keep the fitted model for test-time scoring.
# NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments belong to older
# LightGBM releases; newer ones presumably need callbacks instead - confirm the pinned version.
for fold_, (trn_, val_) in enumerate(folds.split(temp, y)):
    trn_x, trn_y = temp.iloc[trn_], y.iloc[trn_]
    val_x, val_y = temp.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(**lgb_params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=100,
        early_stopping_rounds=50,
        # Per-row weight looked up from the per-class `weights` table.
        sample_weight=trn_y.map(weights)
    )
    oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    print(multi_weighted_logloss(val_y, oof_preds[val_, :]))

    imp_df = pd.DataFrame()
    imp_df['feature'] = temp.columns
    # Ask the booster for gain explicitly: `feature_importances_` follows the estimator's
    # importance_type (split counts unless configured otherwise), which would not
    # match the column name.
    imp_df['gain'] = clf.booster_.feature_importance(importance_type='gain')
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)

    clfs.append(clf)

print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))

In [None]:
# NOTE(review): this cell used to hold only the incomplete expression `msno.`, which is a
# SyntaxError. A nullity matrix of the training features is an assumed intent -
# replace with msno.bar / msno.heatmap if something else was meant.
msno.matrix(temp)

In [None]:
# First five training feature names.
print(list(temp.columns.values)[:5])

In [None]:
# First five test metadata column names, for comparison.
print(list(test_metadata_kaggle.columns.values)[:5])

In [None]:
# Top 20 features by the 'gain' column.
# NOTE(review): imp_df holds only the last fold's values; the cross-fold table is
# `importances` - confirm which one is intended here.
imp_df.sort_values(by ='gain',ascending=False).reset_index(drop=True).head(20)

# 12) Preparing test data for scoring

In [None]:
%%time
#temp_test = test_metadata_kaggle.merge(right = test_metadata[[x for x in test_metadata.columns if x not in ignored_columns ]] ,how='left',on = 'object_id')

In [None]:
%%time
# Work on a copy of the test metadata.
temp_test = test_metadata_kaggle.copy()

In [None]:
# Drop the id column, as was done for the training frame.
del temp_test['object_id']

In [None]:
print(temp.shape,temp_test.shape)

In [None]:
# True when train and test features have identical column names in identical order.
list(temp.columns) == list(temp_test.columns)

In [None]:
# The working copies (temp / temp_test) are used from here on, so drop the originals
# and run a garbage-collection pass.
gc.enable()
del train_metadata_kaggle,test_metadata_kaggle
gc.collect()

In [None]:
%%time
# Score the test set with every fold model, in row chunks.
# The chunks of one model are collected in a list and concatenated once, instead of
# growing a frame with pd.concat on every chunk (which re-copies everything so far).
# Result: list_of_df[i] is the probability frame of clfs[i], with a clean 0..n-1 index.
chunk_size = 500000
list_of_df = []

for num, c in enumerate(clfs):
    print(num)
    chunk_preds = []
    for k in range(0, len(temp_test), chunk_size):
        print(k)
        chunk_preds.append(pd.DataFrame(c.predict_proba(temp_test[k:k + chunk_size])))
    # pd.concat refuses an empty list, so fall back to an empty frame for an empty test set.
    if chunk_preds:
        list_of_df.append(pd.concat(chunk_preds, axis=0, ignore_index=True))
    else:
        list_of_df.append(pd.DataFrame())
    del chunk_preds

In [None]:
# Element-wise mean of the per-model probability frames. Summing left to right from
# the first frame works for any number of fold models, not only five.
test_pred2 = sum(list_of_df[1:], list_of_df[0]) / len(list_of_df)

In [None]:
test_pred2.shape

In [None]:
test_pred2.head()

In [None]:
# Add a column labelled 14 holding the constant 0.15; it is named 'class_99' when the
# columns are renamed further down.
test_pred2[14] = 0.15

In [None]:
# Rescale every column by 1/1.15 (the constant column becomes 0.15/1.15).
# NOTE(review): presumably meant to make each row sum to 1 again, which holds only if
# the 14 model columns summed to 1 beforehand - confirm.
test_pred2 = test_pred2 / 1.15

In [None]:
# Replace the index with a fresh 0..n-1 range.
test_pred2 = test_pred2.reset_index(drop=True)

In [None]:
# Submission column order: the id column followed by one 'class_<id>' column per class.
temp_columns = ['object_id'] + [
    'class_%d' % class_id
    for class_id in (6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99)
]

In [None]:
# Name the probability columns (everything in temp_columns except 'object_id').
test_pred2.columns = temp_columns[1:]

In [None]:
print(test_pred2.shape,test_id.shape)

In [None]:
test_id.tail()

In [None]:
# Give the ids a fresh 0..n-1 index so they line up with test_pred2 in the concat below.
test_id = test_id.reset_index(drop=True)

In [None]:
# Element-wise comparison of the two indexes.
test_id.index == test_pred2.index

In [None]:
%%time
# Put the ids and the probabilities side by side.
test_pred = pd.concat([test_id,test_pred2],axis=1)

In [None]:
# Select the columns in submission order.
test_pred = test_pred[temp_columns]

In [None]:
test_pred.head()

In [None]:
test_pred.shape

In [None]:
%%time
# Write the submission file without the index column.
test_pred.to_csv('test_pred_8.csv',index=False)