# Import necessary libraries


In [125]:
import optuna 
import random as rd
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
from functools import partial


#############################
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, power_transform
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
#################################
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

import eli5
from eli5.sklearn import PermutationImportance

In [126]:
# Load the competition train/test splits.
train = pd.read_csv('/kaggle/input/playground-series-s3e23/train.csv', na_values=['NaN'])
test = pd.read_csv('/kaggle/input/playground-series-s3e23/test.csv', na_values=['NaN'])
# Stack both splits so feature engineering can be applied to them together.
# No ignore_index: each split keeps its own row labels, so labels repeat in `df`.
df = pd.concat([train, test], axis=0)
# One shared scaler instance; later cells refit it column by column.
scaler = StandardScaler()

#### Some cleaning

In [127]:
# Object-typed columns: turn the '?' placeholder into NaN and store as float64.
object_cols = df.columns[df.dtypes == 'O']
placeholder_map = {'?': np.nan}
for object_features in object_cols:
    cleaned = df[object_features].replace(placeholder_map)
    df[object_features] = cleaned.astype('float64')

# EDA + INFERENCES

From the original dataset:

> volume = V = N * log2(unique_operands + unique_operators)

> E  = effort to write program = V/L 

> T  = time to write program = E/18 seconds



### Categorical

In [128]:
# Sum over any object/bool columns; an empty result means none are left in `df`.
df.select_dtypes(include=['object', 'bool']).sum()

Series([], dtype: float64)

*The only categorical column is the target value.*

In [129]:
# Name of the label column.
# NOTE(review): not referenced again in the visible cells, which hard-code 'defects'.
target = 'defects'

### Numerical


In [130]:
# Every float64/int64 column except the row identifier.
without_id = df.drop(columns=['id'])
numerical_columns = without_id.select_dtypes(include=['float64', 'int64']).columns

In [131]:
# Show the selected numeric columns (the target 'defects' is among them).
numerical_columns

Index(['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
       'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
       'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'defects'],
      dtype='object')

In [132]:
# Per-feature means of each split and their relative gap in percent.
# The 'Variance(%)' column is |train mean - test mean| / train mean * 100,
# i.e. a relative difference of means rather than a statistical variance.
train_mean = train.drop(['id', 'defects'], axis=1).mean()
test_mean = test.drop(['id'], axis=1).mean()
relative_gap = 100 * (train_mean - test_mean).abs() / train_mean
analyse = pd.concat(
    [train_mean, test_mean, relative_gap],
    axis=1,
    keys=['Train', 'Test', 'Variance(%)'],
)

In [133]:
# Summary statistics of the numeric training columns.
train.describe()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
count,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,...,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0,101763.0
mean,50881.0,37.34716,5.492684,2.845022,3.498826,96.655995,538.280956,0.111634,13.681881,27.573007,...,1141.357982,22.802453,1.773945,3.979865,0.196604,11.896131,15.596671,57.628116,39.249698,9.839549
std,29376.592059,54.600401,7.900855,4.631262,5.534541,171.147191,1270.791601,0.100096,14.121306,22.856742,...,9862.795472,38.54101,5.902412,6.382358,0.998906,6.749549,18.064261,104.53766,71.692309,14.412769
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,25440.5,13.0,2.0,1.0,1.0,25.0,97.67,0.05,5.6,15.56,...,31.38,7.0,0.0,1.0,0.0,8.0,7.0,15.0,10.0,3.0
50%,50881.0,22.0,3.0,1.0,2.0,51.0,232.79,0.09,9.82,23.36,...,125.4,14.0,0.0,2.0,0.0,11.0,12.0,30.0,20.0,5.0
75%,76321.5,42.0,6.0,3.0,4.0,111.0,560.25,0.15,18.0,34.34,...,565.92,26.0,1.0,5.0,0.0,16.0,20.0,66.0,45.0,11.0
max,101762.0,3442.0,404.0,165.0,402.0,8441.0,80843.08,1.0,418.2,569.78,...,935923.39,2824.0,344.0,219.0,43.0,410.0,1026.0,5420.0,3021.0,503.0


In [134]:
# Display the train/test mean comparison built above.
analyse

Unnamed: 0,Train,Test,Variance(%)
loc,37.34716,37.855945,1.362313
v(g),5.492684,5.559462,1.215757
ev(g),2.845022,2.857699,0.445568
iv(g),3.498826,3.535715,1.054343
n,96.655995,98.222178,1.620368
v,538.280956,548.422497,1.884061
l,0.111634,0.111521,0.100483
d,13.681881,13.762472,0.589031
i,27.573007,27.635377,0.226199
e,20853.589876,22164.320048,6.285393


In [135]:
# Lines of code per comment line; the +1 keeps the denominator non-zero for
# rows without any comment lines.
commented = train['lOCode'] / (train['lOComment'] + 1)
# A Series is labelled through `.name`; assigning `.columns` only attaches a
# stray attribute and leaves the Series unnamed.
commented.name = 'CommentedRatio'
print(commented.describe())

count    101763.000000
mean         12.772065
std          17.198362
min           0.000000
25%           4.075696
50%           9.000000
75%          16.000000
max        1588.000000
dtype: float64


#### Result:
**The means of `e`, `t` and `lOComment` differ noticeably between train and test, which suggests guarding against overfitting to the training data (e.g. by using weaker models).**

In [136]:
# Baseline: 10-fold stratified CV of a default LightGBM on the original columns.
X = train.drop(['id', 'defects'], axis=1)
y = train['defects']
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_score1 = []
for train_index, test_index in kf.split(X, y):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # look them up as index labels and only works on a default 0..n-1 index.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LGBMClassifier()
    model.fit(xtr, ytr)

    # Score on the probability of the positive class.
    score = roc_auc_score(yvl, model.predict_proba(xvl)[:, 1])
    cv_score1.append(score)
print('Mean cv Score:', np.mean(cv_score1))

Mean cv Score: 0.7918485545027518


In [137]:
# Hand-crafted features added to `train` in place.
train['m'] = (train['v(g)'] - train['ev(g)']).abs()
# 1 / l is infinite where l == 0; those rows are capped at 50.
train['difficulty'] = (1 / train['l']).replace(np.inf, 50)
train['vocabulary'] = train['uniq_Op'] + train['uniq_Opnd']
train['N'] = train['total_Op'] + train['total_Opnd']
train['McCabe_says_bad'] = train['v(g)'] > 10
# +1 in the denominator avoids dividing by zero when there are no comment lines.
train['CommentedRatio'] = train['lOCode'] / (train['lOComment'] + 1)

# Rule-based boolean flags built from fixed thresholds.
train['easy_program'] = (train['difficulty'] < 40) & (train['e'] < 9000)
train['hard_program'] = (
    (train['difficulty'] > 18)
    & (train['loc'] > 59)
    & (train['branchCount'] > 16)
)
train['medium_understandable_programm'] = (
    ~train['McCabe_says_bad']
    & (train['difficulty'] < 34)
    & (train['vocabulary'] < 34)
    & (train['e'] < 8500)
)

for feature in ('m', 'difficulty', 'vocabulary'):
    print(train[feature].mean())


2.6672562719259454
17.50432598303269
27.492801902459636


In [138]:
# cluster_1 = easy_programm : (train['e'] < 9000) & (train['difficulty'] < 40)
# cluster_2 = hard_programm: (train['difficulty'] > 18) & (train['loc'] > 59) & (train['branchCount'] > 16)
# cluster_3 = medium_understandable_programm: (commented > 7) & (train['McCabe_says_bad'] == False) & (train['difficulty'] < 34) & (train['vocabulary'] < 34) & (train['e'] < 8500)

In [139]:
# Feature column names (everything except id and target).
# NOTE(review): only the commented-out plotting cells below refer to `columns`.
columns = train.drop(['id', 'defects'], axis=1).columns

In [140]:
#data_conditioned = train[(commented > 7) & (train['McCabe_says_bad'] == False) & (train['difficulty'] < 34) & (train['vocabulary'] < 34) & (train['e'] < 8500)].copy()
#print('Okay:', data_conditioned[data_conditioned['defects'] == False].shape[0])
#print('Defected count:', data_conditioned[data_conditioned['defects'] == True].shape[0])
#print('Defected/Okay relation:', data_conditioned[data_conditioned['defects'] == True].shape[0]/data_conditioned[data_conditioned['defects'] == False].shape[0])

In [141]:
#for i in range(20):
#    col1 = rd.choice(columns)
#    col2 = rd.choice(columns)
#    while col2 == col1:
#        col2 = rd.choice(train.columns)
#    fig, ax = plt.subplots(figsize=(10, 3))
#    sns.scatterplot(x=col1, y=col2, hue='defects', data=data_conditioned, palette='colorblind', s=15, ax=ax, alpha=0.8)


In [142]:
#sns.pairplot(train[['loc', 'v(g)', 'ev(g)', 'iv(g)', 'defects']].sample(500), hue='defects')

In [143]:
#sns.pairplot(train[['loc', 'v(g)', 'ev(g)', 'iv(g)', 'defects']].sample(500), hue='defects')

In [144]:
#sns.pairplot(train[['difficulty', 'vocabulary', 'N', 'McCabe_says_bad', 'm', 'CommentedRatio', 'easy_program', 'hard_program', 'defects']].sample(1000), hue='defects')

In [145]:
# id, target and the boolean flags are left alone; every other column is
# log1p-transformed and then standardised on its own.
# NOTE: the transform overwrites the columns, so re-running this cell applies it again.
untouched = ['id', 'defects', 'McCabe_says_bad', 'easy_program', 'hard_program', 'medium_understandable_programm']
for col in train.columns.drop(untouched):
    logged = np.log1p(train[col])
    train[col] = scaler.fit_transform(logged.to_frame())


In [146]:
#sns.pairplot(train[['d', 'i', 'e', 'b', 'total_Op', 'total_Opnd', 'branchCount', 'defects']].sample(500), hue='defects')

In [147]:
#sns.pairplot(train[['d', 'i', 'e', 'b', 'total_Op', 'total_Opnd', 'branchCount', 'defects']].sample(500), hue='defects')

In [148]:
# Same 10-fold check, now on the engineered and log/standard-scaled columns.
X = train.drop(['id', 'defects'], axis=1)
y = train['defects']
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_score2 = []
for train_index, test_index in kf.split(X, y):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # look them up as index labels and only works on a default 0..n-1 index.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LGBMClassifier()
    model.fit(xtr, ytr)

    # Score on the probability of the positive class.
    score = roc_auc_score(yvl, model.predict_proba(xvl)[:, 1])
    cv_score2.append(score)
print('Mean cv Score:', np.mean(cv_score2))

Mean cv Score: 0.7917900549554381


In [149]:
# Remove five of the original metrics from `train` in place.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
dropped_features = ['d', 'l', 'v(g)', 'uniq_Opnd', 'total_Op']
train.drop(columns=dropped_features, inplace=True)


In [150]:
# Same 10-fold check after the five columns were dropped from `train`.
X = train.drop(['id', 'defects'], axis=1)
y = train['defects']
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_score3 = []
for train_index, test_index in kf.split(X, y):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # look them up as index labels and only works on a default 0..n-1 index.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LGBMClassifier()
    model.fit(xtr, ytr)

    # Score on the probability of the positive class.
    score = roc_auc_score(yvl, model.predict_proba(xvl)[:, 1])
    cv_score3.append(score)
print('Mean cv Score:', np.mean(cv_score3))

Mean cv Score: 0.7916278386478035


In [151]:
# Mean ROC AUC of the three CV runs: 1 = original columns, 2 = engineered + scaled,
# 3 = after dropping five columns.
print(f'1: {np.mean(cv_score1)}, 2: {np.mean(cv_score2)}, 3: {np.mean(cv_score3)}')

1: 0.7918485545027518, 2: 0.7917900549554381, 3: 0.7916278386478035


In [152]:
# Same hand-crafted features as for `train`, applied to the stacked train+test frame.
cyclomatic_gap = df['v(g)'] - df['ev(g)']
df['m'] = cyclomatic_gap.abs()
# 1 / l is infinite where l == 0; those rows are capped at 50.
inverse_l = 1 / df['l']
df['difficulty'] = inverse_l.replace(np.inf, 50)
df['vocabulary'] = df['uniq_Op'] + df['uniq_Opnd']
df['N'] = df['total_Op'] + df['total_Opnd']
df['McCabe_says_bad'] = df['v(g)'] > 10
# +1 in the denominator avoids dividing by zero when there are no comment lines.
df['CommentedRatio'] = df['lOCode'] / (df['lOComment'] + 1)

# Rule-based boolean flags built from fixed thresholds.
df['easy_program'] = (df['e'] < 9000) & (df['difficulty'] < 40)
df['hard_program'] = (df['difficulty'] > 18) & (df['loc'] > 59) & (df['branchCount'] > 16)
within_mccabe_limit = ~df['McCabe_says_bad']
df['medium_understandable_programm'] = (
    within_mccabe_limit
    & (df['difficulty'] < 34)
    & (df['vocabulary'] < 34)
    & (df['e'] < 8500)
)

for feature in ['m', 'difficulty', 'vocabulary']:
    print(df[feature].mean())

2.689808673093364
17.562643703003022
27.53093953598066


In [153]:
# id, target and the boolean flags are left alone; every other column is
# log1p-transformed and then standardised.
passthrough = ['id', 'defects', 'McCabe_says_bad', 'easy_program', 'hard_program', 'medium_understandable_programm']
# Rows with a known target are the ones that came from train.csv.
is_train_row = df['defects'].notnull().to_numpy()
for col in df.drop(passthrough, axis=1).columns:
    df[col] = np.log1p(df[col])
    # Fit on the training rows only. The models are fitted on the `train` frame,
    # which was standardised with train-only statistics; refitting on train+test
    # here would put the `test` rows taken from `df` on a slightly different scale.
    scaler.fit(df.loc[is_train_row, [col]])
    df[col] = scaler.transform(df[[col]]).ravel()


In [154]:
# Drop the same five metrics that were removed from `train`.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
redundant_features = ['d', 'l', 'v(g)', 'uniq_Opnd', 'total_Op']
df.drop(columns=redundant_features, inplace=True)

In [155]:
# Rows without a target are the test rows of the stacked frame; `test` is rebound to them.
test = df[df['defects'].isnull()]

# NOTE(review): the training matrix comes from `train`, while `test` comes from `df`.
# The two frames are transformed in separate cells, so confirm both end up with the
# same columns and the same scaling.
X_train = train.drop(['id', 'defects'], axis=1)
y_train = train['defects']

In [156]:
# Cached LightGBM hyper-parameters from an earlier search.
# bagging_freq is listed explicitly: objective1 tunes with bagging_freq=5, and
# LightGBM only applies bagging_fraction when bagging_freq is non-zero.
lgbm_params = {'num_leaves': 33, 'learning_rate': 0.04140345883600424, 'bagging_fraction': 0.9055318388706308, 'bagging_freq': 5}
# True = reuse the cached values above instead of re-running the Optuna search.
skip_lgbm = True

In [157]:
def objective1(trial):
    """Optuna objective for LightGBM: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``num_leaves``, ``learning_rate`` and ``bagging_fraction``.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        # The value returned to Optuna is the ROC AUC computed below.
        'metric': 'binary_logloss',
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'feature_fraction': 1.0,
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42,
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    fold_aucs = []
    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]
        model = LGBMClassifier(**params)
        model.fit(xtr, ytr)
        fold_aucs.append(roc_auc_score(yvl, model.predict_proba(xvl)[:, 1]))

    mean_auc = np.mean(fold_aucs)
    print(mean_auc)
    return mean_auc

# Re-run the LightGBM search only when the cached parameters are not being reused.
if not skip_lgbm:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective1, n_trials=200)

    # best_params holds only the sampled values. bagging_freq is fixed inside
    # objective1, so it is added back here (keep it in sync with objective1);
    # without it LightGBM ignores the tuned bagging_fraction.
    lgbm_params = {**study.best_params, 'bagging_freq': 5}
    lgbm_best_accuracy = study.best_value  # mean CV ROC AUC despite the name
    print("Best Parameters: ", lgbm_params)
    print("Best Accuracy: ", lgbm_best_accuracy)


In [158]:
# Cached XGBoost hyper-parameters from an earlier search.
# NOTE(review): 'alpha' here (6.48) lies outside the 0.5-5 range that objective2
# currently samples, so these values presumably come from a different search space.
xgb_params = {'lambda': 4.2040847641721335, 'alpha': 6.4775035456984, 'colsample_bytree': 0.9830447528109422, 'subsample': 0.7731737044472982, 'learning_rate': 0.06021964990735283, 'n_estimators': 173, 'max_depth': 4, 'min_child_weight': 3, 'num_parallel_tree': 1}
# False = the Optuna search below runs and replaces the cached values above.
skip_xgb = False

In [159]:
def objective2(trial):
    """Optuna objective for XGBoost: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the regularisation, sampling, learning-rate and tree-shape values.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    params = {
        'objective': 'binary:logistic',
        # The value returned to Optuna is the ROC AUC computed below.
        'eval_metric': 'error',
        'lambda': trial.suggest_float('lambda', 0.5, 5),
        'alpha': trial.suggest_float('alpha', 0.5, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 30, 200),
        'max_depth': trial.suggest_categorical('max_depth', [1, 2, 3, 4, 5, 6, 7, 10, 12, 14, 16]),
        # suggest_int needs integer bounds; the former float bound 0.5 is replaced
        # by 1, the smallest integer that is still >= 0.5.
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 3),
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    fold_aucs = []
    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]
        model = XGBClassifier(**params)
        model.fit(xtr, ytr)
        fold_aucs.append(roc_auc_score(yvl, model.predict_proba(xvl)[:, 1]))

    mean_auc = np.mean(fold_aucs)
    return mean_auc

# Runs the 100-trial XGBoost search unless skip_xgb is set.
if not skip_xgb:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective2, n_trials=100)

    # Keep the winning configuration; the "accuracy" is the mean CV ROC AUC.
    xgb_params, xgb_best_accuracy = study.best_params, study.best_value
    print("Best Parameters: ", xgb_params)
    print("Best Accuracy: ", xgb_best_accuracy)

In [160]:
# Cached HistGradientBoosting hyper-parameters from an earlier search.
# NOTE(review): the visible prediction cell has no HistGradientBoosting entry, so
# these values do not appear to be used by the ensemble.
hist_params = {'max_iter': 176, 'learning_rate': 0.06558952830715783, 'max_depth': 13, 'max_leaf_nodes': 14, 'min_samples_leaf': 93, 'l2_regularization': 0.7195389959935768}
# True = reuse the cached values above instead of re-running the Optuna search.
skip_hist = True

In [161]:
def objective3(trial):
    """Optuna objective for HistGradientBoosting: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the iteration count, learning rate, tree-shape and L2 values.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    params = {
        'loss': 'log_loss',
        'max_iter': trial.suggest_int('max_iter', 30, 250),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.6),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 80),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 30, 200),
        'l2_regularization': trial.suggest_float('l2_regularization', 0.1, 1),
        'warm_start': False,
        'early_stopping': 'auto',
        'scoring': 'roc_auc',
        # Early stopping holds out an internal validation split; seeding it makes
        # trials comparable and matches the other seeded objectives.
        'random_state': 42,
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    fold_aucs = []
    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]
        model = HistGradientBoostingClassifier(**params)
        model.fit(xtr, ytr)
        fold_aucs.append(roc_auc_score(yvl, model.predict_proba(xvl)[:, 1]))

    mean_auc = np.mean(fold_aucs)
    return mean_auc

# Re-run the HistGradientBoosting search only when the cached parameters are not reused.
if not skip_hist:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective3, n_trials=200)

    # Keep the winning configuration; the "accuracy" is the mean CV ROC AUC.
    hist_params, hist_best_accuracy = study.best_params, study.best_value
    print("Best Parameters: ", hist_params)
    print("Best Accuracy: ", hist_best_accuracy)

In [162]:
# Cached RandomForest hyper-parameters from an earlier search.
# NOTE(review): no 'random_state' entry, unlike the params built inside objective4.
rf_params = {'n_estimators': 335, 'criterion': 'gini', 'max_depth': 17, 'min_samples_split': 46, 'min_samples_leaf': 142, 'max_features': None}
# True = reuse the cached values above instead of re-running the Optuna search.
skip_rf = True

In [163]:
def objective4(trial):
    """Optuna objective for RandomForest: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the forest size, split criterion and tree-shape values.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 80),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20, 200),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []

    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]

        model = RandomForestClassifier(**params)
        model.fit(xtr, ytr)

        # Score on the probability of the positive class.
        y_pred_proba = model.predict_proba(xvl)[:, 1]
        cv_scores.append(roc_auc_score(yvl, y_pred_proba))

    roc_auc = np.mean(cv_scores)
    return roc_auc

# Re-run the RandomForest search (30 trials) only when the cached parameters are not reused.
if not skip_rf:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective4, n_trials=30)
    # Keep the winning configuration; the "accuracy" is the mean CV ROC AUC.
    rf_params, rf_best_accuracy = study.best_params, study.best_value
    print("Best Parameters: ", rf_params)
    print("Best Accuracy: ", rf_best_accuracy)

In [164]:
# Cached GradientBoosting hyper-parameters from an earlier search.
# NOTE(review): no 'random_state' entry, unlike the params built inside objective5.
gb_params = {'n_estimators': 118, 'learning_rate': 0.05177081371045021, 'max_depth': 5, 'min_samples_split': 22, 'min_samples_leaf': 158, 'max_features': None}
# True = reuse the cached values above instead of re-running the Optuna search.
skip_gb = True

In [165]:
def objective5(trial):
    """Optuna objective for GradientBoosting: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the estimator count, learning rate and tree-shape values.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.6),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 80),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20, 200),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []

    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]

        model = GradientBoostingClassifier(**params)
        model.fit(xtr, ytr)

        # Score on the probability of the positive class.
        y_pred_proba = model.predict_proba(xvl)[:, 1]
        cv_scores.append(roc_auc_score(yvl, y_pred_proba))

    roc_auc = np.mean(cv_scores)
    return roc_auc

# Re-run the GradientBoosting search only when the cached parameters are not reused.
if not skip_gb:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective5, n_trials=200)
    # Keep the winning configuration; the "accuracy" is the mean CV ROC AUC.
    gb_params, gb_best_accuracy = study.best_params, study.best_value
    print("Best Parameters: ", gb_params)
    print("Best Accuracy: ", gb_best_accuracy)

In [166]:
# Cached SVC hyper-parameters from an earlier search.
# NOTE(review): 'degree' is presumably irrelevant with the 'rbf' kernel; confirm against the SVC docs.
svc_params = {'C': 4.989576248670001, 'kernel': 'rbf', 'degree': 2, 'gamma': 'auto'}
# True = reuse the cached values above instead of re-running the Optuna search.
skip_svc = True

In [167]:
def objective6(trial):
    """Optuna objective for SVC: mean ROC AUC over stratified 4-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``C``, ``kernel``, ``degree`` and ``gamma``.

    Returns
    -------
    float
        Mean validation ROC AUC across the folds (to be maximised).

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``. The ranking score is
    the signed distance from ``decision_function``; no probabilities are fitted.
    """
    params = {
        'C': trial.suggest_float('C', 0.1, 10),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        # Sampled for every trial, although it only matters for the 'poly' kernel.
        'degree': trial.suggest_int('degree', 2, 5),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'random_state': 42
    }

    # Seeded stratified 4-fold CV, so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []

    for train_index, test_index in cv.split(X_train, y_train):
        # split() yields positional row numbers, hence .iloc rather than .loc.
        xtr, xvl = X_train.iloc[train_index], X_train.iloc[test_index]
        ytr, yvl = y_train.iloc[train_index], y_train.iloc[test_index]

        model = SVC(**params)
        model.fit(xtr, ytr)

        # ROC AUC only needs a ranking, so the raw decision values are enough.
        y_pred_proba = model.decision_function(xvl)
        cv_scores.append(roc_auc_score(yvl, y_pred_proba))

    roc_auc = np.mean(cv_scores)
    return roc_auc

# Re-run the SVC search only when the cached parameters are not reused.
if not skip_svc:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective6, n_trials=200)
    # Keep the winning configuration; the "accuracy" is the mean CV ROC AUC.
    svc_params, svc_best_accuracy = study.best_params, study.best_value
    print("Best Parameters: ", svc_params)
    print("Best Accuracy: ", svc_best_accuracy)


In [168]:
# Five cached blending weights from an earlier run.
# NOTE(review): only the commented-out `optweights.weights = weights` line reads
# them, and the blending cell later rebinds `weights`.
weights = [-0.5732312864119468, 1.8787321555622554, 0.3813054331921917, 0.655202154958717, -0.04230503077118486]

In [175]:
class OptunaWeights:
    """Learn blending weights for several models' predictions with Optuna.

    The weights are searched in [-1, 2] to maximise the ROC AUC of the weighted
    average of the supplied predictions. After ``fit`` (or when ``weights`` is
    passed to the constructor) they are available as the ``weights`` attribute.

    Parameters
    ----------
    random_state : int
        Seed for the CMA-ES sampler.
    n_trials : int, default 5000
        Number of Optuna trials run by ``fit``.
    weights : list of float, optional
        Pre-computed weights; lets ``predict`` be used without calling ``fit``.
    """

    def __init__(self, random_state, n_trials=5000, weights=None):
        self.study = None
        # Honour weights handed in by the caller (previously the argument was
        # accepted but discarded). This attribute is the single public way to
        # read the weights: the old `weights()` method was shadowed by it on
        # every instance and could never be called, so it has been removed.
        self.weights = weights
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        """Return the ROC AUC of the blend for one sampled set of weights."""
        # One weight per model's prediction array.
        weights = [trial.suggest_float(f"weight{n}", -1, 2) for n in range(len(y_preds))]

        # np.average normalises by the sum of the weights.
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)
        score = roc_auc_score(y_true, weighted_pred)
        return score

    def fit(self, y_true, y_preds):
        """Search for the weights that maximise ROC AUC against ``y_true``.

        ``y_preds`` is a sequence of equally shaped prediction arrays, one per
        model. The best weights are stored in ``self.weights``.
        """
        # Changes Optuna's logging level globally: only errors are shown.
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        # NOTE(review): the objective never reports intermediate values, so the
        # pruner presumably has nothing to act on.
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='maximize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        """Blend ``y_preds`` with the stored weights; requires weights to be set."""
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        """Fit the weights on ``y_preds`` and return the blend of the same predictions."""
        self.fit(y_true, y_preds)
        return self.predict(y_preds)

In [170]:
# Stratified 20% hold-out used to fit and score the ensemble weights.
# Seeded so the split, and with it the learned weights, can be reproduced.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=42
)

In [179]:
# Ensemble members; uncomment an entry to include that model in the blend.
candidate_models = [
    # ('LGBM', LGBMClassifier(**lgbm_params)),
    ('XGB', XGBClassifier(**xgb_params)),
    # ('RF', RandomForestClassifier(**rf_params)),
    # ('GB', GradientBoostingClassifier(**gb_params)),
    # SVC has no predict_proba here; it was scored with decision_function instead:
    # ('SVC', SVC(**svc_params)),
]

preds = []        # positive-class probabilities on the hold-out, one array per model
test_preds = []   # full predict_proba output on the test rows, one array per model
test_features = test.drop(['id', 'defects'], axis=1)

for label, model in candidate_models:
    model.fit(X_train2, y_train2)
    preds.append(model.predict_proba(X_test2)[:, 1])
    # Both probability columns are kept; the submission cell picks column 1 later.
    test_preds.append(model.predict_proba(test_features))
    print(label)

LGBM
XGB
GB


In [183]:
# Blender with the default 5000 trials; the sampler is seeded with 42.
optweights = OptunaWeights(random_state=42)
# Uncomment to reuse the cached `weights` list instead of fitting new ones.
#optweights.weights = weights

In [181]:
# Fit the blending weights on the hold-out predictions and blend those same predictions.
y_val_pred = optweights.fit_predict(y_test2, preds)   
# NOTE(review): scored on the data the weights were fitted on, so this AUC is optimistic.
score = roc_auc_score(y_test2, y_val_pred)    
print(optweights.weights)
weights = optweights.weights
# NOTE: rebinds `test_preds` from a list of per-model arrays to the blended array,
# so this cell cannot be re-run without re-running the prediction cell first.
test_preds = optweights.predict(test_preds)

[0.6795621172405011, 1.6819845879128938, 0.45025988907248515]


In [None]:
# Preview the first blended test predictions instead of dumping the whole array.
test_preds[:5]

In [182]:
# ROC AUC of the blend on the hold-out split.
print('Ensemble score:', score)

Ensemble score: 0.7899699030225338


In [None]:
# Column 1 of the blended predict_proba output is the positive-class probability.
y_pred = pd.DataFrame(test_preds[:, 1], columns=['defects'])
# Pair ids and predictions by row position. `test` keeps the row labels it had
# inside the stacked frame, so joining on labels would depend on those labels
# happening to be 0..n-1.
submission = pd.concat([test['id'].reset_index(drop=True), y_pred], axis=1)
print(submission)
submission.to_csv('submission.csv', index=False)