In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
import lightgbm as lgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read data.csv into `data` and sample_submission.csv into `sub`.
data, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jul-2022/data.csv",
        "/kaggle/input/tabular-playground-series-jul-2022/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Preview the first rows of the sample submission `sub`.
sub.head()

In [None]:
# (rows, columns) of `data`.
data.shape

In [None]:
# (rows, columns) of the sample submission `sub`.
sub.shape

**EDA**

In [None]:
# Summary statistics of `data`.
data.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
correlations = data.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlations, annot=True, fmt=".2f")

In [None]:
# One histogram (with KDE) per column of `data`, laid out on a 5x6 grid.
# The subsample is drawn once with a fixed seed: sampling inside the loop without
# a seed would show different rows in every panel and on every run. The sample
# size is capped at the frame length so small frames do not raise.
hist_sample = data.sample(n = min(1000, len(data)), random_state = 1)
plt.figure(figsize = (15,10))
for i,column in enumerate(list(data.columns),1):
    plt.subplot(5,6,i)
    sns.histplot(x = column,data = hist_sample,kde = True)
# Keep axis labels of neighbouring panels from overlapping.
plt.tight_layout()

**Feature Engineering**

In [None]:
# Remove the "id" column from `data`.
data = data.drop("id", axis = "columns")

In [None]:
# Column names of `data` as a plain list, reused later for labelling.
cols = data.columns.tolist()

In [None]:
def iqr_outliers(df, col_list, whisker=2):
    """Clip outliers in the given columns to IQR-based fences.

    For every column in ``col_list`` the fences are ``q1 - whisker * iqr`` and
    ``q3 + whisker * iqr``, where ``q1`` and ``q3`` are the 25th and 75th
    percentiles of the column and ``iqr = q3 - q1``. Values outside a fence are
    replaced by that fence; values inside (and missing values) are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left unmodified; the clipping is done on a copy.
    col_list : iterable of column labels
        Columns to clip. Columns not listed are returned unchanged.
    whisker : float, default 2
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns clipped.
    """
    # Work on a copy so re-running the calling cell never sees already-clipped input.
    clipped = df.copy()
    for col in col_list:
        q1 = clipped[col].quantile(0.25)
        q3 = clipped[col].quantile(0.75)
        iqr = q3 - q1
        clipped[col] = clipped[col].clip(lower=q1 - whisker * iqr,
                                         upper=q3 + whisker * iqr)
    return clipped

# Clip outliers in every column whose dtype compares equal to 'float'.
is_float_col = data.dtypes == 'float'
float_cols = data.columns[is_float_col]
data = iqr_outliers(df = data, col_list = float_cols)

In [None]:
# Fit a Yeo-Johnson power transform on `data` and apply it to the same frame.
power_transformer = PowerTransformer(method = "yeo-johnson")
data_scaled = power_transformer.fit_transform(data)

In [None]:
# Wrap the transformed values in a DataFrame labelled with the names in `cols`,
# then preview the first rows.
data_scaled = pd.DataFrame(data = data_scaled, columns = cols)
data_scaled.head()

In [None]:
# One histogram (with KDE) per column of `data_scaled`, laid out on a 5x6 grid.
# The subsample is drawn once with a fixed seed: sampling inside the loop without
# a seed would show different rows in every panel and on every run. The sample
# size is capped at the frame length so small frames do not raise.
scaled_hist_sample = data_scaled.sample(n = min(1000, len(data_scaled)), random_state = 1)
plt.figure(figsize = (15,10))
for i,column in enumerate(list(data_scaled.columns),1):
    plt.subplot(5,6,i)
    sns.histplot(x = column,data = scaled_hist_sample,kde = True)
# Keep axis labels of neighbouring panels from overlapping.
plt.tight_layout()

**Model**

BayesianGaussianMixture

In [None]:
# Fit a 7-component, full-covariance Bayesian Gaussian mixture on all columns of
# `data_scaled` and keep the component label assigned to each row.
vbgmm = BayesianGaussianMixture(
    n_components = 7,
    covariance_type = 'full',
    random_state = 1,
)
preds = vbgmm.fit_predict(data_scaled)

In [None]:
# One scatter series per mixture component: its mean, plotted per feature position.
plt.style.use('ggplot')
plt.figure(figsize = (15,6))
feature_positions = np.arange(data_scaled.shape[1])
for component_mean in vbgmm.means_:
    plt.scatter(feature_positions, component_mean)
plt.xticks(ticks = feature_positions, labels = cols)
plt.show()

In [None]:
# Feature subset used for the clustering and classification cells below.
important_cols = [
    'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13',
    'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_27', 'f_28',
]

In [None]:
# Re-fit the mixture on the selected features only and keep its row labels.
clustering_features = data_scaled[important_cols]
preds_vbgmm = vbgmm.fit_predict(clustering_features)

In [None]:
# Per-row probability of belonging to each mixture component.
clustering_features = data_scaled[important_cols]
pp = vbgmm.predict_proba(clustering_features)

In [None]:
# Store the per-component probabilities, the highest probability of each row and
# the component that attains it as new columns of `data_scaled`.
proba_columns = [f'predict_proba_{i}' for i in range(7)]
data_scaled[proba_columns] = pp
data_scaled['predict_proba'] = pp.max(axis=1)
data_scaled['predict'] = pp.argmax(axis=1)
data_scaled

In [None]:
# Build the training set from rows the mixture assigns with probability > 0.8,
# grouped by predicted component (0..6) and in original row order within a group.
# The index labels are gathered directly from `data_scaled.index`, so they keep
# its dtype; seeding the concatenation with an empty `np.array([])` (float64)
# would upcast them to floats and make `.loc` look rows up by float labels.
confident = data_scaled["predict_proba"] > 0.8
tr_idx = np.concatenate([
    data_scaled.index[(confident & (data_scaled["predict"] == n)).to_numpy()]
    for n in range(7)
])
# Single `.loc` call with row and column selectors instead of chained indexing.
X = data_scaled.loc[tr_idx, important_cols]
y = data_scaled.loc[tr_idx, 'predict']

In [None]:
# (rows, columns) of the selected training features.
X.shape

In [None]:
# Display the training labels (the `predict` column of the selected rows).
y

LightGBM

In [None]:
# 5-fold stratified training of a LightGBM multiclass model on (X, y).
# Each fold's model predicts class probabilities for every row of `data_scaled`;
# the per-fold probabilities are summed and the final label is the argmax of the sum.
lgb_predict_proba = 0
qda_predicted_proba = 0
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)

# The prediction input is the same for every fold, so select it once.
all_rows_features = data_scaled[important_cols]

for fold, (tr_idx, val_idx) in enumerate(kf.split(X,y)):
    print("*****Fold {}*****".format(fold+1))

    params = {'learning_rate': 0.07,
              'objective': 'multiclass',
              "metric" : "multi_logloss",
              'boosting': 'gbdt',
              'verbosity': -1,
              'n_jobs': -1,
              'num_classes':7}

    # Split features and labels into this fold's training and validation parts.
    tr_X, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_X, val_y = X.iloc[val_idx], y.iloc[val_idx]
    lgb_train = lgb.Dataset(tr_X, tr_y)
    lgb_eval = lgb.Dataset(val_X, val_y)

    fold_callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=True),
                      lgb.log_evaluation(period=200)]
    lgb_model = lgb.train(params,
                          lgb_train,
                          valid_sets = [lgb_train, lgb_eval],
                          num_boost_round = 5000,
                          callbacks = fold_callbacks)

    fold_proba = lgb_model.predict(all_rows_features)
    lgb_predict_proba = lgb_predict_proba + fold_proba

y_lgb = np.argmax(lgb_predict_proba, axis = 1)

In [None]:
# `BayesianGaussianMixture.fit(X, y)` ignores `y`, so passing `y_lgb` to it only
# repeats the unsupervised fit and the LightGBM labels never reach the model.
# Instead, seed a Gaussian mixture from the LightGBM clusters: component k starts
# at the mean and relative size of the rows LightGBM assigned to class k, and EM
# refines the components from there. Component k therefore corresponds to class k.
features = data_scaled[important_cols].to_numpy()
n_clusters = lgb_predict_proba.shape[1]

cluster_sizes = np.bincount(y_lgb, minlength = n_clusters)
if (cluster_sizes == 0).any():
    # A class with no rows has no mean to start from; fail with a clear message
    # rather than handing NaN means to the mixture.
    empty_clusters = np.flatnonzero(cluster_sizes == 0).tolist()
    raise ValueError(f"LightGBM assigned no rows to cluster(s) {empty_clusters}")

weights_init = cluster_sizes / cluster_sizes.sum()
means_init = np.array([features[y_lgb == k].mean(axis = 0) for k in range(n_clusters)])

gmm = GaussianMixture(n_components = n_clusters,
                      covariance_type = 'full',
                      weights_init = weights_init,
                      means_init = means_init,
                      random_state = 1)
gmm.fit(features)
preds = gmm.predict_proba(features)

**Submission**

In [None]:
# Append a copy of the probabilities scaled by 0.8 as extra columns of `preds`.
damped_preds = preds * 0.8
preds = np.concatenate((preds, damped_preds), axis = 1)

In [None]:
# `preds` is a 2-D probability matrix and cannot be stored in a single column
# (pandas raises on the shape mismatch). Store one label per row instead: the
# position of the row's largest entry. The right-hand half of `preds` is the
# left-hand half scaled by 0.8, so the argmax always falls in the left-hand half
# and is a valid component index.
sub["Predicted"] = np.argmax(preds, axis = 1)

In [None]:
# Write the submission frame to submission.csv without the index column.
sub.to_csv(path_or_buf = 'submission.csv', index = False)