## Example of read csv

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import time

from sklearn.metrics import log_loss

path = Path('./')

submission = pd.read_csv(path / 'sample_submission.csv', index_col='id')
labels = pd.read_csv(path / 'train_labels.csv', index_col='id')

# the ids of the submission rows (useful later)
sub_ids = submission.index

# the ids of the labeled rows (useful later)
gt_ids = labels.index 

# list of files in the submission folder
subs = sorted(os.listdir(path / 'submission_files'))



In [2]:
s0 = pd.read_csv(path / 'submission_files' / subs[0], index_col='id')

score = log_loss(labels, s0.loc[gt_ids])

# Notice the score of the labeled rows matches the file name
print(subs[0],":", f'{score:.10f}')


0.6222863195.csv : 0.6222863195


In [3]:
s1 = pd.read_csv(path / 'submission_files' / subs[1], index_col='id')

score = log_loss(labels, s0.loc[gt_ids])

# Notice the score of the labeled rows matches the file name
print(subs[1],":", f'{score:.10f}')
len(s0.columns)

0.6223807245.csv : 0.6222863195


1

## Merge submission files

In [4]:
np_preds = np.zeros((40000,len(subs)))
df_preds = pd.DataFrame(np_preds)
scores = []
start = time.time()
for i,sub in enumerate(subs):
    pred = pd.read_csv(path / "submission_files" / sub, index_col = "id")
    # np_preds[:,i] = pred["pred"]
    df_preds.iloc[:,i] = pred["pred"]
end = time.time()

print(end-start)
np_preds = df_preds.to_numpy()

58.44338393211365


## Train test split

In [17]:
from sklearn.model_selection import train_test_split as tts

In [39]:
data_x = np_preds[:len(labels),:]
data_y = labels
test_x = np_preds[len(labels):,:]

In [41]:
train_x,val_x,train_y,val_y=tts(data_x,data_y,random_state=10,test_size=0.3)

## Inverse weighting

In [7]:
# scores = np.array(scores)
# df_inverse_weighting = df_preds * scores**2
# df_inverse_weighting.sum(axis=1)/sum(scores**2)

In [8]:
# np_preds = df_preds.to_numpy()
# labels.to_numpy()

## Light GBM

In [10]:
import lightgbm as lgb

In [44]:
params = {
    "objective" : "binary",
    "metric" : "binary_logloss",
    "random_state":10
}
num_round = 100

lgb_train = lgb.Dataset(train_x,train_y)
lgb_val = lgb.Dataset(val_x,val_y)

# train
model = lgb.train(params,lgb_train,num_boost_round=num_round, valid_names=['train', 'valid']
                  , valid_sets=[lgb_train, lgb_val], early_stopping_rounds=20)

[LightGBM] [Info] Number of positive: 6993, number of negative: 7007
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1261109
[LightGBM] [Info] Number of data points in the train set: 14000, number of used features: 4998
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499500 -> initscore=-0.002000
[LightGBM] [Info] Start training from score -0.002000
[1]	train's binary_logloss: 0.662744	valid's binary_logloss: 0.667045
Training until validation scores don't improve for 20 rounds
[2]	train's binary_logloss: 0.637524	valid's binary_logloss: 0.645224
[3]	train's binary_logloss: 0.616219	valid's binary_logloss: 0.627189
[4]	train's binary_logloss: 0.597863	valid's binary_logloss: 0.612368
[5]	train's binary_logloss: 0.582336	valid's binary_logloss: 0.600092
[6]	train's binary_logloss: 0.568485	valid's binary_logloss: 0.589665
[7]	train's binary_logloss: 0.556502	valid's binary_logloss: 0.580633
[8]	train's binary_logloss: 0.545928	valid's binary_logl

In [52]:
# blend = model.predict(test_x)
submission["pred"] = blend

In [57]:
submission.reset_index().to_csv("blend.csv",index = False)

## Xgboost

In [12]:
from xgboost import XGBClassifier as xgbc

In [13]:
model_xgb = xgbc(random_state=10)

model_xgb.fit(np_preds[:len(labels),:],labels.to_numpy())

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=10, ...)

## Stacking

In [3]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# model import 

def predict_cv(model,train_x,train_y,test_x):
    preds = []
    preds_test = []
    va_idxes = []
    
    kf = KFold(n_splits = 4,shuffle=True,random_state=33)
    
    for i,(tr_idx,va_idx) in enumerate(kf.split(train_x)):
        tr_x,va_x = train_x.iloc[tr_idx],train_x.iloc[va_idx]
        tr_y,va_y = train_y.iloc[tr_idx],train_y.iloc[va_idx]
        
        model.fit(tr_x,tr_y,va_x,va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
        
    va_idxes = np.concatenate(va_idxes)
    preds = cp.concatenate(preds,axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    
    
    preds_test = np.mean(preds_test,axis = 0)
    
    return pred_train,preds_test



from sklearn.ensemble import RandomForestClassifier as rfs



In [None]:
1

## Create blending score

In [137]:
# blend = df_preds.mean(axis=1) #first submit
# blend = df_inverse_weighting.sum(axis=1)/sum(scores**2) #second submit

# third submit
# medians_id = summary.query("diff >0.03").index
# blend = df_preds.mean(axis=1) #first submit
# blend.iloc[medians_id] = summary.query("diff>0.03")["diff"]

# fourth submit
blend = np.sqrt(s0 * s1)

score = log_loss(labels, blend.loc[gt_ids])

print("Brending score = ", f'{score:.10f}')



Brending score =  0.6019399304
