# **Instead of performing a multiclass classification, a binary classification was performed twice and the results were analyzed.**

In [None]:
%%time

SEED=42

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import roc_curve, classification_report, confusion_matrix
from sklearn.metrics import f1_score, make_scorer

from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows
# and the submission template.
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv')

# Show the (rows, columns) of each frame as a quick sanity check.
tuple(frame.shape for frame in (sample_submission, train, test))

In [None]:
train

In [None]:
train.info()

In [None]:
train['outcome'].value_counts()

In [None]:
# Rows whose outcome is anything other than 'lived'.
# NOTE(review): train_2 does not appear to be used later in this notebook.
not_lived_rows = train['outcome'].ne('lived')
train_2 = train.loc[not_lived_rows]
train_2.shape

In [None]:
le = preprocessing.LabelEncoder()
object_columns = train.select_dtypes(include='object').columns

for i in object_columns[0:-1]:
    labels = train[i]
    labels_id = le.fit_transform(labels)
    train[i] = labels_id
    
train

In [None]:
object_columns_test = test.select_dtypes(include='object').columns

for i in object_columns_test:
    labels = test[i]
    labels_id = le.fit_transform(labels)
    test[i] = labels_id
    
test

In [None]:
# Imputation
# Feature columns are everything between the first column and the last one
# (the last column is used as the target further down).
feature_frame = train.iloc[:, 1:-1]
imp_mean = IterativeImputer(random_state=SEED)
Imp_X = imp_mean.fit_transform(feature_frame)

In [None]:
# Outlier removal
# The rows being scored are the same rows the detector is fitted on, which is
# outlier detection rather than novelty detection. In that mode scikit-learn
# expects the default novelty=False together with fit_predict(); with
# novelty=True, predict() is only meant for new, unseen data.
# `predicted` is +1 for inliers and -1 for outliers.
local_outlier_factor = LocalOutlierFactor(contamination='auto')
predicted = local_outlier_factor.fit_predict(Imp_X)

In [None]:
# Put the imputed array back into a DataFrame carrying the original row index
# and feature names.
feature_view = train.iloc[:, 1:-1]
IX = pd.DataFrame(Imp_X, index=feature_view.index, columns=feature_view.columns)

# Inlier/outlier counts (the result is not displayed because this is not the
# last expression of the cell).
pd.DataFrame(predicted).value_counts()

# Keep only the rows flagged as inliers (positive prediction), for both the
# features and the target taken from the last column of train.
inlier_mask = predicted > 0
X = IX.loc[inlier_mask, :]
y = train.iloc[:, -1].loc[inlier_mask]

In [None]:
# Features and target side by side; X and y share the same row index, so the
# target simply becomes the last column.
TRAIN = X.join(y)
TRAIN

# **1st step (died + euthanized vs lived)**

In [None]:
# Welch's t-test (unequal variances) per feature: 'lived' rows against all
# other rows. Only the p-value (second element of the result) is kept.
# Note that every feature column is tested, including the integer-encoded
# categorical ones.
feature_names = TRAIN.columns[0:-1]
is_lived = TRAIN['outcome'] == 'lived'
sig = [
    stats.ttest_ind(
        TRAIN.loc[is_lived, name],
        TRAIN.loc[~is_lived, name],
        equal_var=False,
    )[1]
    for name in feature_names
]
S = pd.DataFrame(sig, index=feature_names, columns=['P_value'])

In [None]:
S.sort_values(by='P_value',ascending = True)

In [None]:
# Features whose lived-vs-not-lived p-value is below 0.05, most significant
# first. The mask is applied BEFORE sorting: masking the already-sorted frame
# with a mask built on the unsorted one makes pandas re-align the mask and emit
# "Boolean Series key will be reindexed to match DataFrame index".
significant = S.loc[S['P_value'] < 0.05]
Feature_doa = significant.sort_values(by='P_value', ascending=True).index
Feature_doa

**16 features**

# 2nd step (died vs euthanized)

In [None]:
# Welch's t-test (unequal variances) per feature: 'died' rows against
# 'euthanized' rows; rows with any other outcome are ignored.
# Only the p-value (second element of the result) is kept.
feature_names = TRAIN.columns[0:-1]
is_died = TRAIN['outcome'] == 'died'
is_euthanized = TRAIN['outcome'] == 'euthanized'
sig = [
    stats.ttest_ind(
        TRAIN.loc[is_died, name],
        TRAIN.loc[is_euthanized, name],
        equal_var=False,
    )[1]
    for name in feature_names
]
S = pd.DataFrame(sig, index=feature_names, columns=['P_value'])

In [None]:
# Features whose died-vs-euthanized p-value is below 0.05, most significant
# first. The mask is applied BEFORE sorting: masking the already-sorted frame
# with a mask built on the unsorted one makes pandas re-align the mask and emit
# "Boolean Series key will be reindexed to match DataFrame index".
significant = S.loc[S['P_value'] < 0.05]
Feature_doe = significant.sort_values(by='P_value', ascending=True).index
Feature_doe

**16 features**

# 3rd step (1st ML)

In [None]:
# Hold out 20% of the rows for evaluation, keeping the class proportions of
# the three-class target in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

In [None]:
# Stage-1 target: 0 = lived, 1 = did not survive (died or euthanized).
# An explicit mapping plus an integer cast is used instead of chained
# .replace() calls, which depend on pandas silently downcasting the object
# column to integers (deprecated behaviour). An unexpected label becomes NaN
# and makes the cast fail loudly instead of slipping through.
outcome_to_not_lived = {'lived': 0, 'died': 1, 'euthanized': 1}
y_train2 = y_train.map(outcome_to_not_lived).astype(int)
y_test2 = y_test.map(outcome_to_not_lived).astype(int)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Candidate classifiers compared in the ROC plots below.
# Every estimator that has a source of randomness is seeded with SEED so the
# comparison is reproducible from run to run, consistent with the imputer and
# the random forests that are already seeded elsewhere in this notebook.
# Scale-sensitive models (MLP, logistic regression, SVM) are wrapped in a
# pipeline that standardises the features first.
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(200,), max_iter=10000, random_state=SEED))
lr = make_pipeline(StandardScaler(), LogisticRegression())
rf = RandomForestClassifier(random_state=SEED)
abc = AdaBoostClassifier(random_state=SEED)
gbc = GradientBoostingClassifier(random_state=SEED)
bc = BaggingClassifier(random_state=SEED)
etc = ExtraTreesClassifier(random_state=SEED)
xgbc = XGBClassifier(random_state=SEED)
# probability=True is required so that SVC exposes predict_proba for the ROC curve.
svm = make_pipeline(StandardScaler(), SVC(probability=True, random_state=SEED))
dtc = DecisionTreeClassifier(random_state=SEED)
lgb = LGBMClassifier(random_state=SEED)

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC comparison for stage 1 (lived vs not lived), using all features.
all_clf = [lr, rf, abc, gbc, bc, etc, xgbc, svm, mlp, dtc, lgb]
clf_labels = ["LR", "RF", "ABC", "GBC", "BC", "ETC", "XGBC", "SVM", "MLP", "DTC", "LGB"]
colors = ["black", "orange", "blue", "green", "pink", "red", "black", "orange", "blue", "green", "pink"]
# One more entry than there are models; zip() below ignores the surplus.
linestyles = [":", "--", "-.", "-", "-", "-", ":", "--", "-.", "-", "-", "-"]

for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):
    # Fit on the training split and score the held-out split with the
    # predicted probability of class 1.
    fitted = clf.fit(X_train, y_train2)
    y_pred = fitted.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test2, y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(
        fpr,
        tpr,
        color=clr,
        linestyle=ls,
        label='%s (auc = %0.5f)' % (label, roc_auc),
    )

plt.legend(loc='lower right')
# Diagonal = chance level.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.show()

# 1st RF

In [None]:
# Seeded random forest used for both stages from here on. This rebinds the
# name `rf`; the `all_clf` list built earlier keeps referencing the previous
# RandomForestClassifier instance until the list is rebuilt.
rf=RandomForestClassifier(random_state=SEED)

In [None]:
# Stage 1: train the forest on the features selected for lived-vs-not-lived
# and evaluate it on the held-out split.
stage1_train = X_train.loc[:, Feature_doa]
stage1_test = X_test.loc[:, Feature_doa]

rf.fit(stage1_train, y_train2)
pred = rf.predict(stage1_test)  # 0 = lived, 1 = not lived

c_matrix = confusion_matrix(y_test2, pred)
c_report = classification_report(y_test2, pred)
c_matrix

In [None]:
print(c_report)

# 4th step (2nd ML)

In [None]:
# Stage-2 training set: training rows whose true outcome is 'died' or
# 'euthanized'. The mask is built from y_train itself so it lines up with
# X_train directly; a mask built from the full `y` only works because pandas
# re-aligns it to the smaller index behind the scenes.
non_survivor_mask = y_train.isin(['died', 'euthanized'])
Z_train = X_train.loc[non_survivor_mask]
y2_train = y_train.loc[non_survivor_mask]

# Stage-2 evaluation set: held-out rows that stage 1 PREDICTED as not lived
# (pred == 1). Because the selection uses predictions rather than true labels,
# it can contain animals whose true outcome is 'lived'.
Z_test = X_test.loc[pred==1]
y2_test = y_test.loc[pred==1]

In [None]:
y2_train

In [None]:
# Stage-2 target: 1 = died, 0 = everything else.
# 'lived' is kept in the mapping because y2_test is selected by stage-1
# predictions and may still contain it. An explicit mapping plus an integer
# cast replaces the chained .replace() calls, which depend on pandas silently
# downcasting the object column to integers (deprecated behaviour).
outcome_to_died = {'lived': 0, 'died': 1, 'euthanized': 0}
y2_train2 = y2_train.map(outcome_to_died).astype(int)
y2_test2 = y2_test.map(outcome_to_died).astype(int)

In [None]:
# ROC comparison for stage 2 (died vs the rest), using all features.
all_clf = [lr, rf, abc, gbc, bc, etc, xgbc, svm, mlp, dtc, lgb]
clf_labels = ["LR", "RF", "ABC", "GBC", "BC", "ETC", "XGBC", "SVM", "MLP", "DTC", "LGB"]
colors = ["black", "orange", "blue", "green", "pink", "red", "black", "orange", "blue", "green", "pink"]
# One more entry than there are models; zip() below ignores the surplus.
linestyles = [":", "--", "-.", "-", "-", "-", ":", "--", "-.", "-", "-", "-"]

for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):
    # Fit on the stage-2 training rows and score the stage-2 evaluation rows
    # with the predicted probability of class 1.
    fitted = clf.fit(Z_train, y2_train2)
    z_pred = fitted.predict_proba(Z_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y2_test2, z_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(
        fpr,
        tpr,
        color=clr,
        linestyle=ls,
        label='%s (auc = %0.5f)' % (label, roc_auc),
    )

plt.legend(loc='lower right')
# Diagonal = chance level.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.show()

In [None]:
# Stage 2: refit the same forest object on the features selected for
# died-vs-euthanized and evaluate it on the rows stage 1 flagged as not lived.
stage2_train = Z_train.loc[:, Feature_doe]
stage2_test = Z_test.loc[:, Feature_doe]

rf.fit(stage2_train, y2_train2)
pred2 = rf.predict(stage2_test)  # 1 = died, 0 = not died

c_matrix = confusion_matrix(y2_test2, pred2)
c_report = classification_report(y2_test2, pred2)
c_matrix

In [None]:
print(c_report)

In [None]:
pred

In [None]:
pred2

In [None]:
# Stage-1 predictions as a one-column frame: 0 becomes the final label 'lived';
# the value 1 is left in place as a placeholder to be filled from stage 2.
first = pd.DataFrame(pred).replace(0,'lived')

In [None]:
# Stage-2 predictions as a one-column frame: 1 becomes the final label 'died';
# the value 0 is left in place and is turned into 'euthanized' further down.
second = pd.DataFrame(pred2).replace(1,'died')

In [None]:
first.shape, second.shape

In [None]:
first

In [None]:
# Merge the two stages: every row stage 1 marked as 1 (not lived) receives the
# corresponding stage-2 result, in order.
# A copy is used so `first` is not modified through an alias; the cell can then
# be re-run without the placeholder rows having already been consumed.
PRED = first.copy()
stage2_rows = PRED.iloc[:,0]==1
PRED.loc[stage2_rows, PRED.columns[0]] = second.iloc[:,0].values
PRED

In [None]:
# The only 0 values left are stage-2 "not died" results; label them 'euthanized'.
PRED = PRED.replace(0,'euthanized')
PRED

In [None]:
# Three-class evaluation of the combined two-stage prediction against the true
# held-out labels. PRED is in the same row order as X_test / y_test because it
# was built from predictions made on X_test.
c_matrix = confusion_matrix(y_test, PRED)
c_report = classification_report(y_test, PRED)
c_matrix

In [None]:
print(c_report)

# Prediction

In [None]:
test

In [None]:
# Binary targets over all (outlier-filtered) training rows for the final fit.
#   Y  : 0 = lived, 1 = died or euthanized   (stage 1)
#   Y2 : 1 = died,  0 = everything else      (stage 2)
# Explicit mappings plus an integer cast replace the chained .replace() calls,
# which depend on pandas silently downcasting the object column to integers
# (deprecated behaviour).
Y = y.map({'lived': 0, 'died': 1, 'euthanized': 1}).astype(int)
Y2 = y.map({'lived': 0, 'died': 1, 'euthanized': 0}).astype(int)

In [None]:
rf.fit(X.loc[:,Feature_doa],Y)
pred_test = rf.predict(test.loc[:,Feature_doa])

In [None]:
rf.fit(X.loc[:,Feature_doe],Y2)
pred_test2 = rf.predict(test.loc[:,Feature_doe].loc[pred_test==1])

In [None]:
# Stage-1 test predictions: 0 -> 'lived', 1 stays as a placeholder.
first = pd.DataFrame(pred_test).replace(0, 'lived')
# Stage-2 test predictions: 1 -> 'died', 0 stays for now.
second = pd.DataFrame(pred_test2).replace(1, 'died')

# Fill the placeholder rows with the stage-2 results, in order.
PRED_test = first
needs_stage2 = PRED_test.iloc[:, 0] == 1
PRED_test[needs_stage2] = second.values

# Whatever is still 0 is a stage-2 "not died" result.
PRED_test = PRED_test.replace(0, 'euthanized')
PRED_test

In [None]:
sample_submission

In [None]:
# Write the predicted labels into the submission template. The assignment is
# aligned on the row index.
# NOTE(review): presumably both frames carry a default 0..n-1 index — confirm.
sample_submission['outcome'] = PRED_test.iloc[:,0]
sample_submission

In [None]:
sample_submission.to_csv('submission.csv', index=False)

# Thanks for taking a look.
# 
# This approach did not achieve good performance.
# 
# If you know of a better two-step binary classification method, please let us know.