In [50]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
import time
import math
from functools import reduce
from typing import List, Dict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split, HalvingGridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from xgboost import XGBClassifier
from scipy.stats import mode
from functools import reduce

# Notebook-wide display settings: show every row/column of a frame and render
# floats in fixed-point notation instead of scientific notation.
for option_key, option_value in (
  ('display.max_rows', None),
  ('display.max_columns', None),
  ('float_format', '{:f}'.format),
):
  pd.set_option(option_key, option_value)

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [51]:
# Read the raw train/test tables from the current working directory.
data_dir = os.getcwd()
train_csv = pd.read_csv(os.path.join(data_dir, "train.csv"))
test_csv = pd.read_csv(os.path.join(data_dir, "test.csv"))

In [52]:
# The row identifier is not a feature; work on copies without it.
df_train = train_csv.drop(columns=["row_id"])
df_test = test_csv.drop(columns=["row_id"])

# Sanity check: total number of missing cells in the training table must be 0.
isna_sum = train_csv.isna().sum().sum()
assert isna_sum == 0

In [53]:
def reduce_df_memory(df: pd.DataFrame, allow_float16: bool = True) -> pd.DataFrame:
  """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

  The frame is modified **in place** and the very same object is returned, so
  the variable passed in by the caller is downcast as well.

  Only NumPy integer and floating-point columns are touched; every other
  column (strings, categoricals, booleans, datetimes, ...) is left unchanged.

  * Integer columns become the smallest of int8 / int16 / int32 / int64 whose
    range contains the column's min and max (limits are inclusive).
  * Float columns become float16, then float32, and otherwise float64.

  The float decision looks at the value *range* only. float16 keeps roughly
  three significant decimal digits, so values can be rounded noticeably; pass
  ``allow_float16=False`` to stop at float32.

  Args:
    df: frame to shrink (mutated in place).
    allow_float16: when True (default) float columns may be cast to float16.

  Returns:
    The same ``df`` object, after downcasting.

  Side effects:
    Prints the memory footprint before/after and the relative reduction.
  """
  memory_start = df.memory_usage().sum() / math.pow(1024, 2)

  # Candidate dtypes, narrowest first.
  int_candidates = (np.int8, np.int16, np.int32, np.int64)
  float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

  for column in df.columns:
    dtype = df[column].dtype
    # Skip anything that is not a plain NumPy int/uint/float column *before*
    # computing min/max, which can fail on e.g. unordered categoricals.
    if not isinstance(dtype, np.dtype) or dtype.kind not in "iuf":
      continue

    min_value = df[column].min()
    max_value = df[column].max()

    if dtype.kind in "iu":
      for candidate in int_candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the dtype limit still fits.
        if limits.min <= min_value and max_value <= limits.max:
          df[column] = df[column].astype(candidate)
          break
      # No candidate fits (e.g. huge uint64 values, or an empty column whose
      # min/max is NaN): leave the column as it is.
    else:
      for candidate in float_candidates:
        limits = np.finfo(candidate)
        if limits.min <= min_value and max_value <= limits.max:
          df[column] = df[column].astype(candidate)
          break
      else:
        # Out of float32 range, infinite, or all-NaN: keep full precision.
        df[column] = df[column].astype(np.float64)

  memory_end = df.memory_usage().sum() / math.pow(1024, 2)
  print(f"memory before: {np.round(memory_start)}Mb, memory after: {np.round(memory_end, 2)}Mb, reduction: {np.round(((memory_start - memory_end) / memory_start) * 100.0)}%")
  return df

# reduce_df_memory downcasts its argument in place and returns that same frame,
# so each *_reduced name refers to the same object as df_train / df_test.
df_train_reduced, df_test_reduced = (
  reduce_df_memory(frame) for frame in (df_train, df_test)
)


memory before: 438.0Mb, memory after: 110.63Mb, reduction: 75.0%
memory before: 218.0Mb, memory after: 54.55Mb, reduction: 75.0%


In [64]:
random_state = 1
ss = StandardScaler()
le = LabelEncoder()

# Encode the string class labels as integers; `le` is kept so that predictions
# can be mapped back to class names afterwards.
y = le.fit_transform(df_train["target"])
X = df_train.drop('target', axis=1)

folds = 3
str_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
scores = []       # hold-out accuracy of each fold
predictions = []  # encoded test-set predictions of each fold's model

# The classification report lists one row per class followed by summary rows
# (accuracy / averages); keep exactly the per-class rows.
n_classes = len(le.classes_)

for train_index, test_index in str_kfold.split(X, y):
  start = time.perf_counter()
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y[train_index], y[test_index]

  # Scaler and PCA are (re)fitted on the training part of the fold only, so
  # nothing from the hold-out part leaks into the preprocessing.
  X_train_ss = ss.fit_transform(X_train)
  X_test_ss = ss.transform(X_test)

  # Seed PCA as well: with svd_solver='auto' it may use the randomized solver,
  # which is otherwise not reproducible between runs.
  pca = PCA(n_components=100, random_state=random_state)
  X_train = pca.fit_transform(X_train_ss)
  X_test = pca.transform(X_test_ss)

  # n_jobs=-1 trains the trees in parallel; results are unchanged for a fixed
  # random_state.
  etc = ExtraTreesClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=random_state,
    n_jobs=-1,
  ).fit(X_train, y_train)
  y_pred = etc.predict(X_test)
  score = accuracy_score(y_test, y_pred)
  scores.append(score)

  c_report = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)
  df_c_report = pd.DataFrame(c_report).T.iloc[:n_classes, :].copy()
  df_c_report["f1-score"] = df_c_report["f1-score"].mul(100)  # show as percent
  print(f"mean f1-score: {df_c_report['f1-score'].mean()}")
  display(df_c_report.sort_values("f1-score", ascending=False).style\
    .background_gradient(cmap="flare_r", subset=["f1-score"])\
    .format({'f1-score':'{:,.1f}%'}))

  # Predict the real test set with this fold's scaler/PCA/model; the per-fold
  # predictions are combined by majority vote later. The transformers do not
  # modify their input, so no defensive copy of df_test is needed.
  df_test_ss = ss.transform(df_test)
  df_test_pca = pca.transform(df_test_ss)
  predictions.append(etc.predict(df_test_pca))

  print(f"Took: {time.perf_counter() - start}s")

print(f"mean accuracy over {folds} folds: {np.mean(scores)}")

mean f1-score: 98.95788543649432


Unnamed: 0,precision,recall,f1-score,support
Klebsiella_pneumoniae,0.99667,0.995163,99.6%,6616.0
Campylobacter_jejuni,0.996992,0.991326,99.4%,6687.0
Staphylococcus_aureus,0.987027,0.996387,99.2%,6643.0
Bacteroides_fragilis,0.986994,0.994786,99.1%,6713.0
Salmonella_enterica,0.991738,0.988767,99.0%,6677.0
Enterococcus_hirae,0.991217,0.984509,98.8%,6649.0
Streptococcus_pneumoniae,0.988613,0.985953,98.7%,6692.0
Escherichia_fergusonii,0.986172,0.987359,98.7%,6645.0
Escherichia_coli,0.986615,0.986021,98.6%,6653.0
Streptococcus_pyogenes,0.983888,0.985505,98.5%,6692.0


Took: 184.34744572639465s
mean f1-score: 98.92744827035408


Unnamed: 0,precision,recall,f1-score,support
Klebsiella_pneumoniae,0.995455,0.993198,99.4%,6616.0
Campylobacter_jejuni,0.993579,0.994916,99.4%,6688.0
Bacteroides_fragilis,0.989336,0.995084,99.2%,6713.0
Salmonella_enterica,0.990691,0.988168,98.9%,6677.0
Staphylococcus_aureus,0.987843,0.990817,98.9%,6643.0
Streptococcus_pneumoniae,0.99322,0.985204,98.9%,6691.0
Enterococcus_hirae,0.989171,0.989171,98.9%,6649.0
Escherichia_coli,0.985449,0.987523,98.6%,6652.0
Streptococcus_pyogenes,0.985193,0.98431,98.5%,6692.0
Escherichia_fergusonii,0.982873,0.984351,98.4%,6646.0


Took: 227.46534943580627s
mean f1-score: 98.97843453023953


Unnamed: 0,precision,recall,f1-score,support
Klebsiella_pneumoniae,0.994254,0.993953,99.4%,6615.0
Bacteroides_fragilis,0.993005,0.993892,99.3%,6713.0
Campylobacter_jejuni,0.992669,0.992075,99.2%,6688.0
Enterococcus_hirae,0.991414,0.989923,99.1%,6649.0
Staphylococcus_aureus,0.989042,0.991871,99.0%,6643.0
Streptococcus_pneumoniae,0.989839,0.989987,99.0%,6691.0
Salmonella_enterica,0.989796,0.988017,98.9%,6676.0
Streptococcus_pyogenes,0.988762,0.986103,98.7%,6692.0
Escherichia_coli,0.986917,0.986472,98.7%,6653.0
Escherichia_fergusonii,0.982156,0.985555,98.4%,6646.0


Took: 232.13030004501343s


In [65]:
# Majority vote: one column per fold model, one row per test sample.
fold_votes = np.column_stack(predictions)
voted_codes = mode(fold_votes, axis=1)[0].flatten()
result = le.inverse_transform(voted_codes).tolist()

# Fill the second column of the sample submission with the voted class names.
submission_path = os.path.join(os.getcwd(), "sample_submission.csv")
submission_csv = pd.read_csv(submission_path)
submission_csv.iloc[:, 1] = result
submission_csv.to_csv("submission.csv", index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>