In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from math import sqrt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [None]:
GREEKS_PATH = Path('data_/greeks.csv')
SAMPLE_SUBMISSION_PATH = Path('data_/sample_submission.csv')
TEST_PATH = Path('data_/test.csv')
TRAIN_PATH = Path('data_/train.csv')

In [None]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
greeks = pd.read_csv(GREEKS_PATH)

In [None]:
FEATURES = [col for col in train.columns if col != 'Id' and col != 'Class']
RANDOM_STATE = 42

In [None]:
train['EJ'] = train['EJ'].map(dict((v, k) for k, v in enumerate(train['EJ'].unique())))
test['EJ'] = test['EJ'].map(dict((v, k) for k, v in enumerate(test['EJ'].unique())))

In [None]:

x = train.loc[:, FEATURES].values
x = StandardScaler().fit_transform(x)

x_ = test.loc[:, FEATURES].values
x_ = StandardScaler().fit_transform(x_)

train_normalized = pd.DataFrame(x, columns=FEATURES)
train_normalized.fillna(train_normalized.mean(), inplace=True)

test_normalized = pd.DataFrame(x_, columns=FEATURES)
test_normalized.fillna(test_normalized.mean(), inplace=True)

pca = PCA(n_components=2)

train_normalized_pca = pca.fit_transform(train_normalized)
test_normalized_pca = pca.fit_transform(test_normalized)

plt.scatter(train_normalized_pca[:, 0] ,train_normalized_pca[:, 1], c=['red' if cls == 1 else 'blue' for cls in train['Class']], label='Classes')

In [None]:
cor_matrix = train.drop(['Id', 'Class'], axis=1).corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7)]
train_new = train.drop(to_drop, axis=1)
test_new = test.drop(to_drop, axis=1)

In [None]:

    
FEATURES_NEW = [f for f in FEATURES if f not in to_drop]

for col in FEATURES_NEW:
    train_new.loc[:,col + '_squared'] = train_new[col]**2
    train_new.loc[:,col + '_cubed'] = train_new[col]**3
    train_new.loc[:,col + '_sqrt'] = train_new[col].apply(np.sqrt)
    test_new.loc[:,col + '_squared'] = test_new[col]**2
    test_new.loc[:,col + '_cubed'] = test_new[col]**3
    test_new.loc[:,col + '_sqrt'] = test_new[col].apply(np.sqrt)
    
    

In [None]:
train_new_dataset = lgb.Dataset(train_new.drop(['Id', 'Class'], axis=1), train_new['Class'], feature_name=train_new.drop(['Id', 'Class'], axis=1).columns.tolist())

In [None]:
params = {
    'objective': 'binary',
}
cv_results = lgb.cv(params, train_new_dataset, num_boost_round=1000, nfold=5, early_stopping_rounds=50, verbose_eval=50, feval=balanced_logarithmic_loss)

# print results
print('Best number of iterations:', len(cv_results['custom']))
print('Best CV score:', cv_results['custom'][-1])

In [None]:
def balanced_logarithmic_loss(y_true, y_pred):

    N = len(y_true)
    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)
    # wc prevalence
    prev_w_1 = N_1 / N
    prev_w_0 = N_0 / N

    # wc is equal to the inverse prevalence of c
    w_1 = 1 / prev_w_1
    w_0 = 1 / prev_w_0

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(y_pred, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (w_0/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (w_1/N_1) * np.sum(y_true * np.log(y_pred))
    loss_denominator = w_0 + w_1

    return loss_numerator / loss_denominator

In [None]:
print(balanced_logarithmic_loss(1, 1))

In [None]:
booster = lgb.LGBMClassifier(objective="binary", n_estimators=1000, random_state=42, metric="custom", verbose=100 )

In [None]:
booster.fit(train_new.drop(['Id', 'Class'], axis=1), 
            train_new['Class'], 
            eval_metric=balanced_logarithmic_loss,
           )


In [None]:
# preds = booster.predict_proba(test_normalized_pca)
# preds = pd.DataFrame(preds, columns=['class_0', 'class_1'])
preds = booster.predict_proba(test_new.drop('Id', axis=1))
preds
# preds = pd.DataFrame(preds, columns=['class_0', 'class_1'])

In [93]:
booster.fit(train_new.drop(['Id', 'Class'], axis=1), 
            train_new['Class'], 
            eval_metric=balanced_logarithmic_loss,
           )


[LightGBM] [Info] Number of positive: 108, number of negative: 509
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.709076
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.031514
[LightGBM] [Debug] init for col-wise cost 0.000599 seconds, init for row-wise cost 0.001679 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31844
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 180
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175041 -> initscore=-1.550317
[LightGBM] [Info] Start training from score -1.550317
[LightGBM] [Debug] Trained a tree with leaves = 14 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 19 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 23 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 26 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 24 and depth = 15
[LightGBM] [Debug] Trained a

[LightGBM] [Debug] Trained a tree with leaves = 30 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 27 and depth = 15
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 26 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 30 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 29 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 29 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 24 and depth = 16
[LightGBM] [De

[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with l

[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with leaves = 2 and depth = 1
[LightGBM] [Debug] Trained a tree with l

In [95]:
# preds = booster.predict_proba(test_normalized_pca)
# preds = pd.DataFrame(preds, columns=['class_0', 'class_1'])
preds = booster.predict_proba(test_new.drop('Id', axis=1))
preds
# preds = pd.DataFrame(preds, columns=['class_0', 'class_1'])

array([0, 0, 0, 0, 0])

In [79]:
preds

Unnamed: 0,class_0,class_1
0,0.999889,0.000111
1,0.999889,0.000111
2,0.999889,0.000111
3,0.999889,0.000111
4,0.999889,0.000111


In [110]:


gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X, y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Unnamed: 0,class_0,class_1
0,0.915763,0.084237
1,0.915763,0.084237
2,0.915763,0.084237
3,0.915763,0.084237
4,0.915763,0.084237


In [None]:
cv_results = lgb.cv(params, train_dataset, num_boost_round=1000, nfold=5,
                    verbose_eval=20, early_stopping_rounds=40)

In [26]:
booster.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'early_stopping_rounds': 10,
 'metric': <function __main__.balanced_logarithmic_loss(y_true, y_pred)>}

In [25]:
submission = pd.concat([test.Id, preds], axis=1)

In [28]:
submission.to_csv('submission.csv', index=False)

In [4]:
df = pd.read_csv('submission.csv')

In [5]:
df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.838392,0.161608
1,010ebe33f668,0.838392,0.161608
2,02fa521e1838,0.838392,0.161608
3,040e15f562a2,0.838392,0.161608
4,046e85c7cc7f,0.838392,0.161608
