In [11]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from math import sqrt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from lightgbm import early_stopping
from lightgbm import log_evaluation

In [12]:
GREEKS_PATH = Path('data_/greeks.csv')
SAMPLE_SUBMISSION_PATH = Path('data_/sample_submission.csv')
TEST_PATH = Path('data_/test.csv')
TRAIN_PATH = Path('data_/train.csv')

In [13]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
greeks = pd.read_csv(GREEKS_PATH)

In [14]:
FEATURES = [col for col in train.columns if col != 'Id' and col != 'Class']
RANDOM_STATE = 42

In [15]:
train['EJ'] = train['EJ'].map(dict((v, k) for k, v in enumerate(train['EJ'].unique())))
test['EJ'] = test['EJ'].map(dict((v, k) for k, v in enumerate(test['EJ'].unique())))

In [25]:
def balanced_logarithmic_loss(preds, eval_data):
    eval_data = eval_data.get_label()
    N = len(eval_data)
    # Nc is the number of observations
    N_1 = np.sum(eval_data == 1, axis=0)
    N_0 = np.sum(eval_data == 0, axis=0)
    # wc prevalence
    prev_w_1 = N_1 / N
    prev_w_0 = N_0 / N

    # wc is equal to the inverse prevalence of c
    w_1 = 1 / prev_w_1
    w_0 = 1 / prev_w_0

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(preds, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (w_0 / N_0) * np.sum((1 - eval_data) * np.log(1 - y_pred)) - (w_1 / N_1) * np.sum(
        eval_data * np.log(y_pred))
    loss_denominator = w_0 + w_1

    return ('balanced_logarithmic_loss', loss_numerator / loss_denominator, False)

In [None]:

x = train.loc[:, FEATURES].values
x = StandardScaler().fit_transform(x)

x_ = test.loc[:, FEATURES].values
x_ = StandardScaler().fit_transform(x_)

train_normalized = pd.DataFrame(x, columns=FEATURES)
train_normalized.fillna(train_normalized.mean(), inplace=True)

test_normalized = pd.DataFrame(x_, columns=FEATURES)
test_normalized.fillna(test_normalized.mean(), inplace=True)

pca = PCA(n_components=2)

train_normalized_pca = pca.fit_transform(train_normalized)
test_normalized_pca = pca.fit_transform(test_normalized)

plt.scatter(train_normalized_pca[:, 0], train_normalized_pca[:, 1],
            c=['red' if cls == 1 else 'blue' for cls in train['Class']], label='Classes')

In [17]:
cor_matrix = train.drop(['Id', 'Class'], axis=1).corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
train_new = train.drop(to_drop, axis=1)
test_new = test.drop(to_drop, axis=1)

In [18]:
FEATURES_NEW = [f for f in FEATURES if f not in to_drop]
for col in FEATURES_NEW:
    train_new.loc[:, col + '_squared'] = train_new[col] ** 2
    train_new.loc[:, col + '_cubed'] = train_new[col] ** 3
    test_new.loc[:, col + '_squared'] = test_new[col] ** 2
    test_new.loc[:, col + '_cubed'] = test_new[col] ** 3

  train_new.loc[:,col + '_squared'] = train_new[col]**2
  train_new.loc[:,col + '_cubed'] = train_new[col]**3
  test_new.loc[:,col + '_cubed'] = test_new[col]**3
  train_new.loc[:,col + '_squared'] = train_new[col]**2
  train_new.loc[:,col + '_cubed'] = train_new[col]**3
  test_new.loc[:,col + '_squared'] = test_new[col]**2
  test_new.loc[:,col + '_cubed'] = test_new[col]**3
  train_new.loc[:,col + '_squared'] = train_new[col]**2
  train_new.loc[:,col + '_cubed'] = train_new[col]**3
  test_new.loc[:,col + '_squared'] = test_new[col]**2
  test_new.loc[:,col + '_cubed'] = test_new[col]**3
  train_new.loc[:,col + '_squared'] = train_new[col]**2
  train_new.loc[:,col + '_cubed'] = train_new[col]**3
  test_new.loc[:,col + '_squared'] = test_new[col]**2
  test_new.loc[:,col + '_cubed'] = test_new[col]**3


In [19]:
train_new_dataset = lgb.Dataset(train_new.drop(['Id', 'Class'], axis=1), train_new['Class'],
                                feature_name=train_new.drop(['Id', 'Class'], axis=1).columns.tolist())

In [39]:
params = {'metric': 'binary_logloss', 'objective': 'binary'}
booster = lgb.LGBMClassifier(**params,
                             n_estimators=1000,
                             random_state=42,
                             verbose=100,
                             callbacks=[early_stopping(stopping_rounds=10)])

In [27]:
booster.fit(train_new.drop(['Id', 'Class'], axis=1), train_new['Class'], )

[LightGBM] [Info] Number of positive: 108, number of negative: 509
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.734198
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042358
[LightGBM] [Debug] init for col-wise cost 0.000927 seconds, init for row-wise cost 0.001778 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 156
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175041 -> initscore=-1.550317
[LightGBM] [Info] Start training from score -1.550317
[LightGBM] [Debug] Trained a tree with leaves = 14 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 19 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 23 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 26 and depth = 13
[LightGBM] [Debug] Trained a tree with leaves = 26 and depth = 16
[LightGBM] [Debug] Trained a

In [79]:
preds = booster.predict_proba(test_new.drop('Id', axis=1))
preds = pd.DataFrame(preds, columns=['class_0', 'class_1'])


Unnamed: 0,class_0,class_1
0,0.999935,6.5e-05
1,0.999935,6.5e-05
2,0.999935,6.5e-05
3,0.999935,6.5e-05
4,0.999935,6.5e-05


    class_0   class_1
0  0.999935  0.000065
1  0.999935  0.000065
2  0.999935  0.000065
3  0.999935  0.000065
4  0.999935  0.000065


In [37]:
params = {'objective': 'binary', }
cv_results = lgb.cv(params,
                    train_new_dataset,
                    num_boost_round=1000,
                    nfold=20,
                    feval=balanced_logarithmic_loss,
                    callbacks=[early_stopping(stopping_rounds=10), log_evaluation()]
                    )

print(cv_results['balanced_logarithmic_loss-mean'][-1])
# print results
# print('Best number of iterations:', len(cv_results['custom']))
# print('Best CV score:', cv_results['custom'][-1])

[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used featur

In [62]:
# booster.get_params()
#
# gridParams = {
#     'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
#     'n_estimators': [1000],
#     'num_leaves': [6, 8, 12, 16, 31, 62],  # large num_leaves helps improve accuracy but might lead to over-fitting
#     'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
#     'objective': ['binary'],
#     'max_bin': [255, 510],  # large max_bin helps improve accuracy but might slow down training progress
#     'random_state': [RANDOM_STATE],
#     'colsample_bytree': [0.64, 0.65, 0.66],
#     'subsample': [0.7, 0.75],
#     'reg_alpha': [1, 1.2],
#     'reg_lambda': [1, 1.2, 1.4],
# }

# num_leaves_choices = [2, 7, 15, 31, 62, 128]
# learning_rate_choices = [0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.4]
# learning_rate_choices = np.arange(0.001, 1, 0.001, dtype=float)


# cv_results = []
# for num_lv in num_leaves_choices:
#     for lr in learning_rate_choices:
#         hyperparams = {"objective": 'binary',
#                        "num_leaves": num_lv,
#                        "learning_rate": lr,
#                        'boosting_type': 'gbdt',
#                        'class_weight': 'balanced'
#                        }
#
#         cv_summary = lgb.cv(hyperparams,
#                             train_new_dataset,
#                             num_boost_round=1000,
#                             nfold=20,
#                             feval=balanced_logarithmic_loss,
#                             callbacks=[early_stopping(stopping_rounds=10), log_evaluation()],
#                             seed=RANDOM_STATE
#                             )
#
#         optimal_num_trees = len(cv_summary['balanced_logarithmic_loss-mean'])
#         hyperparams["optimal_number_of_trees"] = optimal_num_trees
#
#         cv_results.append((hyperparams, cv_summary['balanced_logarithmic_loss-mean'][-1]))
#
# print(cv_results)

# grid = GridSearchCV(booster, gridParams, verbose=1, cv=20, n_jobs=-1)
# # # Run the grid
# grid.fit(train_new.drop(['Id', 'Class'], axis=1), train_new['Class'])
# #
# # # Print the best parameters found
# print(grid.best_params_)
# print(grid.best_score_)

[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used features: 156
[LightGBM] [Info] Number of positive: 102, number of negative: 484
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 586, number of used featur

In [75]:
params = {'metric': 'binary_logloss',
          'objective': 'binary',
          'boosting_type': 'gbdt',
          'learning_rate': 0.03,
          'class_weight': 'balanced',
          'optimal_number_of_trees': 239,
          'num_leaves': 7,

          }
booster = lgb.LGBMClassifier(**params,
                             n_estimators=1000,
                             random_state=42,
                             verbose=100,
                             callbacks=[early_stopping(stopping_rounds=10)])

# cv_summary = lgb.cv(params,
#                     train_new_dataset,
#                     num_boost_round=239,
#                     nfold=20,
#                     feval=balanced_logarithmic_loss,
#                     callbacks=[early_stopping(stopping_rounds=10), log_evaluation()],
#                     seed=RANDOM_STATE
#                     )

In [76]:
booster.fit(train_new.drop(['Id', 'Class'], axis=1), train_new['Class'], )

[LightGBM] [Info] Number of positive: 108, number of negative: 509
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.734198
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.042358
[LightGBM] [Debug] init for col-wise cost 0.001326 seconds, init for row-wise cost 0.002052 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26919
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 156
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Trained a tree with 

In [81]:
submission = pd.concat([test.Id, preds], axis=1)

In [82]:
submission.to_csv('submission.csv', index=False)