In [9]:
import numpy as np
import pandas as pd
import gc,random,os
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.mixture import BayesianGaussianMixture
from sklearn import metrics
import lightgbm as lgb

In [10]:
def seed_everything(seed=2022):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed=2020
seed_everything(seed)

N_FOLDS =10
N_cluster=7

In [11]:
data=pd.read_csv("../data/data.csv")
data=data.drop("id",axis=1)
cols = list(data.columns)

In [26]:

best_data =['f_07','f_08', 'f_09', 'f_10','f_11', 'f_12', 'f_13', 'f_22','f_23', 'f_24', 'f_25','f_26','f_27', 'f_28']

rob_scaler = RobustScaler().fit(data)
power_transformer = PowerTransformer().fit(data)

data_scaled = rob_scaler.fit_transform(data[best_data])
data_scaled = power_transformer.fit_transform(data_scaled)
train = pd.DataFrame(data_scaled, columns = best_data)

BGM = BayesianGaussianMixture(n_components=N_cluster,covariance_type='full', max_iter=100, random_state=1,n_init = 5)
pred = BGM.fit_predict(train)
pred_proba=BGM.predict_proba(train)
train['predict']=pred
train.head()

Unnamed: 0,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_22,f_23,f_24,f_25,f_26,f_27,f_28,predict
0,-1.033497,1.407376,1.076834,-0.600202,1.155585,-0.088842,0.075636,-0.707704,0.911425,-0.678993,0.76851,0.960439,1.043167,0.692866,0
1,-1.033497,-0.922377,-0.198131,-0.885656,-0.441754,1.727871,0.758833,-0.535662,0.453245,1.031821,-0.118652,-0.551262,0.367902,-1.635096,5
2,0.013448,1.033965,-0.439612,0.140324,0.304431,-0.983604,1.144644,2.203942,0.086177,-1.518865,-0.568497,0.979032,-0.926147,-2.297124,0
3,0.294264,-1.240713,0.957816,0.140324,0.304431,-0.088842,-0.483476,0.731578,-1.217686,0.826754,-1.172432,-0.395774,-0.099899,0.32443,3
4,0.779194,0.179779,-0.439612,-1.188775,-1.897861,1.289315,1.144644,0.227623,-1.481688,0.848269,-0.613662,1.164903,-0.374124,-1.158148,2


In [29]:
pred_proba

array([[9.86551568e-01, 1.28449168e-03, 3.61678673e-04, ...,
        2.93903218e-03, 7.66573516e-03, 1.19749200e-03],
       [1.56576559e-04, 2.92630821e-02, 2.83635722e-03, ...,
        1.79950696e-06, 9.48568479e-01, 1.11506353e-02],
       [8.59878009e-01, 1.80198016e-05, 1.20383818e-03, ...,
        1.38595307e-01, 2.08178477e-04, 9.66473849e-05],
       ...,
       [3.08047621e-03, 1.80850220e-04, 2.55847396e-02, ...,
        9.65162072e-01, 5.51137872e-03, 4.74946729e-04],
       [4.13275945e-04, 2.33087210e-02, 3.23853268e-04, ...,
        2.00311841e-01, 1.21471370e-01, 6.45334604e-01],
       [8.00233066e-02, 8.71864285e-01, 1.98109585e-05, ...,
        1.87751394e-09, 5.60335413e-04, 4.75322599e-02]])

In [32]:

train['predict_proba']=0
for n in range(N_cluster):
    train[f'predict_proba_{n}']=pred_proba[:,n] #n列目をすべて移動
    train.loc[train.predict == n,'predict_proba']=train[f'predict_proba_{n}']
    
train_index=np.array([])
for n in range(N_cluster):
    median=train[train.predict==n]['predict_proba'].median() #中央値の算出
    n_inx=train[(train.predict==n) & (train.predict_proba > 0.7)].index
    train_index = np.concatenate((train_index, n_inx))  
    print(f'class:{n}',f'median: {round(median,4)}','Training data:'+str(round(len(n_inx)/len(train[(train.predict==n)]),2)*100)+'%')

class:0 median: 0.9434 Training data:80.0%
class:1 median: 0.8598 Training data:72.0%
class:2 median: 0.8838 Training data:73.0%
class:3 median: 0.9822 Training data:88.0%
class:4 median: 0.9228 Training data:79.0%
class:5 median: 0.7507 Training data:56.99999999999999%
class:6 median: 0.9189 Training data:76.0%


In [35]:
print(train_index)
print(train_index.shape)

[0.0000e+00 2.0000e+00 7.0000e+00 ... 9.7960e+04 9.7982e+04 9.7992e+04]
(73901,)


In [37]:
X=train.loc[train_index]
y=train.loc[train_index]['predict']

params_lgb = {'learning_rate': 0.07,'objective': 'multiclass','boosting': 'gbdt','verbosity': -1,'n_jobs': -1, 'num_classes':N_cluster} 

model_list=[]

gkf = StratifiedKFold(N_FOLDS)
for fold, (train_idx, valid_idx) in enumerate(gkf.split(X,y)):   

    tr_dataset = lgb.Dataset(X.iloc[train_idx],y.iloc[train_idx])
    vl_dataset = lgb.Dataset(X.iloc[valid_idx],y.iloc[valid_idx])
    
    model = lgb.train(params = params_lgb, 
                train_set = tr_dataset, 
                valid_sets =  vl_dataset, 
                num_boost_round = 5000, 
                callbacks=[ lgb.early_stopping(stopping_rounds=300, verbose=True), lgb.log_evaluation(period=200)])  
    
    model_list.append(model) 

Training until validation scores don't improve for 300 rounds
[200]	valid_0's multi_logloss: 9.08086e-08
[400]	valid_0's multi_logloss: 8.85208e-08
[600]	valid_0's multi_logloss: 8.80273e-08
[800]	valid_0's multi_logloss: 8.78779e-08
[1000]	valid_0's multi_logloss: 8.78264e-08
[1200]	valid_0's multi_logloss: 8.78051e-08
[1400]	valid_0's multi_logloss: 8.77971e-08
[1600]	valid_0's multi_logloss: 8.77962e-08
[1800]	valid_0's multi_logloss: 8.77954e-08
[2000]	valid_0's multi_logloss: 8.77944e-08
[2200]	valid_0's multi_logloss: 8.77938e-08
[2400]	valid_0's multi_logloss: 8.77929e-08
[2600]	valid_0's multi_logloss: 8.77924e-08
[2800]	valid_0's multi_logloss: 8.77917e-08
[3000]	valid_0's multi_logloss: 8.77912e-08
[3200]	valid_0's multi_logloss: 8.77907e-08
[3400]	valid_0's multi_logloss: 8.779e-08
[3600]	valid_0's multi_logloss: 8.77895e-08
[3800]	valid_0's multi_logloss: 8.7789e-08
[4000]	valid_0's multi_logloss: 8.77883e-08
[4200]	valid_0's multi_logloss: 8.7788e-08
[4400]	valid_0's multi

In [39]:
lgb_preds=0
for model in model_list:
    lgb_preds+=model.predict(train)

In [40]:
ss=pd.read_csv("../data/sample_submission.csv")
ss['Predicted']=np.argmax(lgb_preds, axis=1)
ss.to_csv("submission_lgb.csv",index=False)
ss.head()

Unnamed: 0,Id,Predicted
0,0,0
1,1,5
2,2,0
3,3,3
4,4,2
