In [34]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
from functools import reduce
from typing import List, Dict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from scipy.stats import mode
import time
from functools import reduce

# Notebook-wide display configuration.
# None removes the row/column limit when a DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Render floats in fixed-point notation with six decimals.
# The fully qualified option name is used because pandas resolves abbreviated
# names by pattern matching, which fails if more than one option matches.
pd.set_option('display.float_format', '{:f}'.format)
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")

In [40]:
# Read train.csv and test.csv (in that order) from the current working directory.
train_csv, test_csv = (
  pd.read_csv(os.path.join(os.getcwd(), file_name))
  for file_name in ("train.csv", "test.csv")
)

In [42]:
# Remove the "row_id" column from both frames; the results are what the model sees.
df_train = train_csv.drop(columns=["row_id"])
df_test = test_csv.drop(columns=["row_id"])

# Sanity check on the raw training file: total number of missing cells.
# First .sum() counts NaNs per column, second .sum() adds the columns together.
isna_sum = train_csv.isna().sum().sum()
assert isna_sum == 0, f"train.csv contains {isna_sum} missing values"

In [44]:
# Seed value configured for the notebook.
random_state = 1

# Keyword arguments unpacked into every XGBClassifier built in run_model.
xgb_params = dict(
  objective='multi:softmax',
  eval_metric='mlogloss',
  tree_method='gpu_hist',
  predictor='gpu_predictor',
)

# Shared encoder: run_model fits it on the "target" column, and the submission
# cell uses it to map predicted class indices back to the original labels.
le = LabelEncoder()

def run_model(df_train: pd.DataFrame, df_test: pd.DataFrame, folds: int = 5, random_state: int = 1) -> tuple:
  """Train one XGBClassifier per stratified fold and predict ``df_test`` with each.

  The ``target`` column of a copy of ``df_train`` is encoded with the
  module-level ``le`` encoder. ``le`` is re-fitted here as a side effect, so
  callers can apply ``le.inverse_transform`` to the returned predictions.
  Each fold's held-out accuracy is printed, followed by the mean accuracy.

  Args:
    df_train: Training frame containing the feature columns and a ``target`` column.
      It is not modified; the function works on a deep copy.
    df_test: Frame passed unchanged to ``predict`` / ``predict_proba`` of every
      fold model. NOTE(review): presumably it must have the same feature
      columns as ``df_train`` minus ``target`` — confirm against the data.
    folds: Number of splits for ``StratifiedKFold``.
    random_state: Seed for the shuffled ``StratifiedKFold`` split.

  Returns:
    ``(probabilities, predictions)``: two lists with one entry per fold, holding
    the ``predict_proba(df_test)`` and ``predict(df_test)`` outputs respectively.
  """
  df = df_train.copy(deep=True)
  df["target"] = le.fit_transform(df["target"])
  X = df.drop(columns=["target"])
  y = df["target"]
  scores = []
  probabilities = []
  predictions = []
  str_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

  for fold, (train_index, valid_index) in enumerate(str_kfold.split(X, y)):
    # perf_counter is monotonic, so the measured interval cannot go backwards.
    start = time.perf_counter()
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # A fresh model per fold so no state leaks between folds.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    # Score on the held-out part of the training data.
    score = accuracy_score(y_valid, model.predict(X_valid))
    scores.append(score)

    # Keep this fold's view of the real test set for later ensembling.
    probabilities.append(model.predict_proba(df_test))
    predictions.append(model.predict(df_test))

    # Elapsed time is end minus start (the reverse order yields a negative duration).
    elapsed = time.perf_counter() - start
    print(f"fold: {fold}, score: {score}, time: {elapsed:.2f}")

  print(f"mean score: {np.mean(scores)}")
  return (probabilities, predictions)

# Pass the notebook-level seed explicitly so that changing `random_state` in the
# configuration cell actually changes the fold split, instead of silently
# falling back to run_model's own default.
probabilities, predictions = run_model(df_train, df_test, random_state=random_state)

fold: 0, score: 0.858275
fold: 1, score: 0.857975
fold: 2, score: 0.85875
fold: 3, score: 0.8561
fold: 4, score: 0.8589
mean score: 0.858


In [46]:
# NOTE(review): train.csv/test.csv are read straight from the working directory,
# while this file is read from a "drive/MyDrive/kaggle" subfolder — confirm the
# path is intentional.
sample_submission_csv = pd.read_csv(os.path.join(os.getcwd(), "drive/MyDrive/kaggle/sample_submission.csv"))
submission = sample_submission_csv.copy()

# Hard voting across folds: stack each fold's test predictions as one column
# (one row per test sample) and take the most frequent class in every row.
fold_votes = np.column_stack(predictions)
# np.ravel yields a 1-D array whether or not SciPy keeps the reduced axis.
majority_vote = np.ravel(mode(fold_votes, axis=1)[0])
# Map the encoded class indices back to the original target labels.
result = le.inverse_transform(majority_vote).tolist()

# Replace the second column by name instead of writing through .iloc: an
# in-place positional write has to fit the labels into the column's existing
# dtype, which pandas has deprecated when the dtypes are incompatible.
submission[submission.columns[1]] = result
submission.to_csv("submission.csv", index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>