# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm

from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.mixture import BayesianGaussianMixture
from sklearn.semi_supervised import LabelSpreading
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

from drive.MyDrive.Kaggle.Clustering_072022.src.functions import *

# Seed, Load and Transform Data

In [9]:
def seed_everything(seed=50):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED``.

    Defining this function does not seed anything by itself; it has to be
    called before the stochastic steps it is meant to make repeatable.

    Parameters
    ----------
    seed : int, default 50
        Value passed to ``random.seed`` and ``np.random.seed`` and stored
        (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Returns
    -------
    None
    """
    # Imported here because the notebook's import cell does not import
    # `os` or `random` explicitly; without this the function only works
    # if some wildcard import happens to leak those names.
    import os
    import random

    random.seed(seed)
    # NOTE: the hash seed of the *running* interpreter is fixed at start-up,
    # so this only influences Python processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [17]:
# Root folder of the project (relative to the working directory).
PATH = 'drive/MyDrive/Kaggle/Clustering_072022/'

# Load the feature table, using the `id` column as the row index.
data = pd.read_csv(PATH+'src/data_removed.csv', index_col='id')

# Split the columns by storage dtype: integer columns are handled as
# categorical features, float columns as continuous ones.
# NOTE(review): 'int' / 'float' are compared as NumPy dtype aliases, so the
# match depends on the platform's default integer width - confirm if this
# notebook is ever run outside the original environment.
cat_feats = [col for col, dtype in data.dtypes.items() if dtype == 'int']
num_feats = [col for col, dtype in data.dtypes.items() if dtype == 'float']

In [18]:
# Robust-scale the continuous features, then run PowerTransformer (with its
# default settings) over every column, integer features included.
# NOTE: this cell overwrites `data`, so running it twice transforms the
# already-transformed values again; reload the data before re-running.
data[num_feats] = RobustScaler().fit_transform(data[num_feats])
data = pd.DataFrame(
    PowerTransformer().fit_transform(data),
    columns=data.columns,
    index=data.index,  # keep the `id` index instead of silently resetting it
)
data

Unnamed: 0,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_22,f_23,f_24,f_25,f_26,f_27,f_28
0,-0.977987,1.383372,1.039938,-0.567775,1.142180,-0.083596,0.086073,-0.707704,0.911425,-0.678993,0.768510,0.960439,1.043167,0.692866
1,-0.977987,-0.875405,-0.179925,-0.837020,-0.420725,1.725834,0.738456,-0.535662,0.453245,1.031821,-0.118652,-0.551262,0.367902,-1.635096
2,0.021718,1.017648,-0.394246,0.124844,0.296135,-0.928325,1.118063,2.203942,0.086177,-1.518865,-0.568497,0.979032,-0.926147,-2.297124
3,0.286548,-1.213526,0.917564,0.124844,0.296135,-0.083596,-0.434221,0.731578,-1.217686,0.826754,-1.172432,-0.395774,-0.099899,0.324430
4,0.756900,0.187543,-0.394246,-1.135381,-1.954502,1.271662,1.118063,0.227623,-1.481688,0.848269,-0.613662,1.164903,-0.374124,-1.158148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97995,0.021718,0.187543,0.191659,-0.567775,-0.171515,-0.083596,0.317221,0.953153,0.407244,1.284161,-0.076272,-0.989589,0.167419,0.457440
97996,-0.598123,0.622133,-1.243439,-0.567775,-1.259638,0.936024,1.295599,1.159516,-0.429922,0.271348,0.783856,0.140914,0.693482,0.638176
97997,0.756900,0.622133,0.509344,-0.090749,-0.682876,-0.614590,-0.735370,-0.995376,1.450944,0.867385,0.179693,1.540546,1.179070,0.583192
97998,-1.440987,0.410491,0.653205,0.706932,-1.259638,0.565556,-0.162832,0.277521,0.512194,0.687666,-0.286704,-1.085032,-0.538376,0.127070


In [19]:
# Cluster the transformed features with a Bayesian Gaussian mixture and
# write the hard cluster assignments as a submission file.
#
# Only the feature columns are passed to the model, so re-running this cell
# after later cells have added `predict*` columns to `data` does not feed
# those columns back in as features.
bgm_features = data[cat_feats + num_feats]

bgm = BayesianGaussianMixture(
    n_components=7,
    tol=0.00001,
    max_iter=1000,
    n_init=10,
    # Pinned so the fit is repeatable and matches the "seed50" tag in the
    # output filename; without it every run may yield different clusters.
    random_state=50,
)
preds = bgm.fit_predict(bgm_features)
pred_proba = bgm.predict_proba(bgm_features)

sub = pd.read_csv(PATH+'submissions/sample_submission.csv', index_col='Id')
sub['Predicted'] = preds
# `index` is a boolean flag; the column header comes from the index name
# ('Id') set by `index_col` above.
sub.to_csv(PATH+'submissions/BGM_seed50_tol0.00001_mi1000_ni10.csv', index=True)

In [13]:
# Keep the mixture's cluster label only for rows it is confident about; all
# other rows are marked as unlabelled (-1) for semi-supervised label
# spreading.
proba_threshold = 0.90
n_classes = pred_proba.shape[1]  # one probability column per mixture component

data['predict'] = preds
# Probability of the component each row was assigned to. Selecting it
# directly creates a float column, rather than starting from an integer 0
# column and upcasting it through masked assignments.
data['predict_proba'] = pred_proba[np.arange(len(preds)), preds]
for n in range(n_classes):
    data[f'predict_proba_{n}'] = pred_proba[:, n]

# Rows whose assigned-component probability exceeds the threshold.
trusted = data['predict_proba'] > proba_threshold

for n in range(n_classes):
    in_class = data['predict'] == n
    n_total = int(in_class.sum())
    n_trusted = int((in_class & trusted).sum())
    median = data.loc[in_class, 'predict_proba'].median()
    # A component can end up with no rows at all; report NaN instead of
    # dividing by zero.
    share = n_trusted / n_total if n_total else float('nan')
    print(f'Class n{n}  |  Median : {median:.4f}  |  Training data : {share:.1%}')

# Index labels of the trusted / untrusted rows, kept under their original
# names in case other cells refer to them.
idxs = data.index[trusted].to_numpy()
unlabelled = data.index[~trusted]

data.loc[~trusted, 'predict'] = -1
labels = data['predict']
X = data[cat_feats+num_feats]

Class n0  |  Median : 0.9074  |  Training data : 51.4%
Class n1  |  Median : 0.8670  |  Training data : 43.7%
Class n2  |  Median : 0.9833  |  Training data : 72.7%
Class n3  |  Median : 0.9118  |  Training data : 52.1%
Class n4  |  Median : 0.9376  |  Training data : 57.5%
Class n5  |  Median : 0.8684  |  Training data : 44.2%
Class n6  |  Median : 0.7313  |  Training data : 25.0%


In [16]:
# Spread the trusted cluster labels to the unlabelled (-1) rows over a
# k-nearest-neighbour graph, then write the result as a submission file.
label_prop_model = LabelSpreading(kernel='knn', n_jobs=-1, alpha=0.1)
label_prop_model.fit(X, labels)
prop_preds = label_prop_model.predict(X)

sub = pd.read_csv(PATH+'submissions/sample_submission.csv', index_col='Id')
sub['Predicted'] = prop_preds

# Build the filename from the objects that were actually used, so the tag
# cannot drift away from the real hyper-parameters (the previous hard-coded
# name advertised tol/max_iter/n_init/alpha values that differ from the ones
# used in this notebook).
bgm_tol = np.format_float_positional(bgm.tol, trim='-')
submission_name = (
    f'BGM_seed50_tol{bgm_tol}_mi{bgm.max_iter}_ni{bgm.n_init}'
    f'_PP_{proba_threshold:.2f}_alpha{label_prop_model.alpha}.csv'
)
# `index` is a boolean flag; the column header comes from the index name
# ('Id') set by `index_col` above.
sub.to_csv(PATH+'submissions/label_spreading/'+submission_name, index=True)

In [15]:
# Display the current submission frame for a quick visual check.
sub

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
0,4
1,6
2,4
3,2
4,5
...,...
97995,6
97996,1
97997,0
97998,3
