In [4]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

from drive.MyDrive.Kaggle.Clustering_072022.src.functions import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
def set_seed(seed):
    """
    Seeds the global random number generators used by this notebook.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and written,
        as a string, to the ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    ``PYTHONHASHSEED`` is read by the interpreter at start-up, so assigning it
    here does not change hashing in the kernel that is already running; it is
    only seen by Python processes started afterwards that inherit this
    environment.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)       # stdlib generator
    np.random.seed(seed)    # NumPy legacy global generator

set_seed(150)

In [20]:
# Project folder on the mounted drive (relative to the current working directory).
PATH = 'drive/MyDrive/Kaggle/Clustering_072022/'

# Feature table, indexed by its 'id' column.
data = pd.read_csv(PATH + 'src/data_removed.csv', index_col='id')

# Labels from an earlier submission file, indexed by its 'Id' column.
# PATH already ends with '/', so this path contains '//'.
preds = pd.read_csv(
    PATH + '/submissions/soft_vote/BGM_7_PT_200seeds_bestcolumns.csv',
    index_col='Id',
)

# Per-row statistics table; the 'Unnamed: 0' column is discarded, so `probs`
# keeps pandas' default row index.
probs = pd.read_csv(PATH + 'src/BGM_7_RSPT_predarray.csv')
probs = probs.drop(columns='Unnamed: 0')

# Index-aligned assignment: each row of `data` receives the 'Predicted' value
# stored under the same index label in `preds`.
data['target'] = preds['Predicted']

In [40]:
# NOTE(review): every index collected below is a row label of `probs` (its
# default index). These labels are later used with `data.loc[...]`, so they
# are presumably equal to the 'id' values of `data` -- confirm.

# "Low confidence": rows whose 'max_stds' AND 'max_diff' are both below the
# 10% quantile of their respective column.
max_stds_cut = probs['max_stds'].quantile(q=0.1)
max_diff_cut = probs['max_diff'].quantile(q=0.1)
low_stds = probs.loc[probs['max_stds'] < max_stds_cut].index
low_diff = probs.loc[probs['max_diff'] < max_diff_cut].index
low_confidence = set(low_stds) & set(low_diff)
print(len(low_confidence))

# "High confidence": rows whose 'min_stds' AND 'min_diff' are both above the
# 20% quantile of their respective column.
min_stds_cut = probs['min_stds'].quantile(q=0.2)
min_diff_cut = probs['min_diff'].quantile(q=0.2)
high_stds = probs.loc[probs['min_stds'] > min_stds_cut].index
high_diff = probs.loc[probs['min_diff'] > min_diff_cut].index
high_confidence = set(high_stds) & set(high_diff)
print(len(high_confidence))

7712
72452


In [24]:
# Training rows are the high-confidence ids; the low-confidence ids are the
# rows whose label will be re-predicted.
#
# `.loc` no longer accepts a set as an indexer (deprecated in pandas 1.5,
# TypeError from pandas 2.0), so the sets are turned into lists first.
# list() keeps the set's own iteration order, so the row order of `test`
# matches any other iteration over the same, unmodified `low_confidence` set.
train = data.loc[list(high_confidence)]
X_train = train.drop('target', axis=1)
y_train = train['target']
test = data.loc[list(low_confidence)]

In [31]:
# LightGBM settings shared by every fold.
params_lgb = {
    'learning_rate': 0.08,
    'objective': 'multiclass',
    'boosting': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'num_classes': 7,
}

# Despite the name, `history` collects the trained booster of each fold.
history = []

# 5-fold stratified split of the training rows.
skf = StratifiedKFold(5)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    # Positional slices of the features / labels for this fold.
    fold_X_tr, fold_y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    fold_X_val, fold_y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    tr_dataset = lgb.Dataset(fold_X_tr, fold_y_tr)
    val_dataset = lgb.Dataset(fold_X_val, fold_y_val)

    # Up to 1000 boosting rounds; training stops early once the validation
    # score has not improved for 300 consecutive rounds.
    stop_callback = lgb.early_stopping(stopping_rounds=300, verbose=True)
    model = lgb.train(
        params=params_lgb,
        train_set=tr_dataset,
        valid_sets=val_dataset,
        num_boost_round=1000,
        callbacks=[stop_callback],
    )

    history.append(model)

[1]	valid_0's multi_logloss: 1.64602
Training until validation scores don't improve for 300 rounds.
[2]	valid_0's multi_logloss: 1.4959
[3]	valid_0's multi_logloss: 1.37278
[4]	valid_0's multi_logloss: 1.2679
[5]	valid_0's multi_logloss: 1.17688
[6]	valid_0's multi_logloss: 1.0965
[7]	valid_0's multi_logloss: 1.02379
[8]	valid_0's multi_logloss: 0.958542
[9]	valid_0's multi_logloss: 0.899163
[10]	valid_0's multi_logloss: 0.845047
[11]	valid_0's multi_logloss: 0.795801
[12]	valid_0's multi_logloss: 0.750033
[13]	valid_0's multi_logloss: 0.708409
[14]	valid_0's multi_logloss: 0.669746
[15]	valid_0's multi_logloss: 0.63408
[16]	valid_0's multi_logloss: 0.600556
[17]	valid_0's multi_logloss: 0.570412
[18]	valid_0's multi_logloss: 0.541676
[19]	valid_0's multi_logloss: 0.515049
[20]	valid_0's multi_logloss: 0.490473
[21]	valid_0's multi_logloss: 0.46703
[22]	valid_0's multi_logloss: 0.445256
[23]	valid_0's multi_logloss: 0.425311
[24]	valid_0's multi_logloss: 0.406559
[25]	valid_0's multi_l

In [34]:
# Features of the rows to re-label (everything except the current 'target').
test_features = test.drop('target', axis=1)

# Element-wise sum of the predictions of every fold model in `history`.
# Starts from the scalar 0, so the first addition yields the first prediction.
lgb_preds = 0
for model in history:
    lgb_preds = lgb_preds + model.predict(test_features)

In [37]:
# Submission template, indexed by 'Id'.
sub = pd.read_csv(PATH + 'submissions/sample_submission.csv', index_col='Id')

# Overwrite the label of the re-predicted rows with the column position of the
# largest summed fold prediction. `lgb_preds` was computed from `test`, so
# `test.index` is exactly the row order of the predictions. Indexing with it
# (rather than with the `low_confidence` set) keeps labels aligned and avoids
# the TypeError pandas >= 2.0 raises for set indexers in `.loc`.
data.loc[test.index, 'target'] = np.argmax(lgb_preds, axis=1)

# Index-aligned copy of the final labels into the submission frame.
sub['Predicted'] = data['target']
sub.to_csv('test.csv')