In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [None]:
# Read the training CSV from the Kaggle input mount and preview the first rows.
TRAIN_CSV = '/kaggle/input/no-title/train_dataset.csv'
train = pd.read_csv(TRAIN_CSV)
train.head()


In [None]:
# Per-column count of missing values in the raw training frame.
train.isnull().sum()

In [None]:
# Summary statistics of the training frame before any imputation.
train.describe()

In [None]:
# Columns handled as categorical during imputation; every other column of
# `train` (this includes 'ID' and the 'Stage' target) goes into `floats`.
binaries = ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
floats = [column for column in train.columns if column not in binaries]
train[floats].dtypes

In [None]:
# For float type columns

def fill_na_with_mean(df):
    """Return a copy of ``df`` with missing values replaced by the column mean.

    Only numeric columns have a mean; non-numeric columns are returned
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns may contain NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with NaNs in numeric columns filled by that column's mean.
    """
    # The previous version discarded the result of `apply` and returned the
    # untouched input, so nothing was ever filled.
    return df.fillna(df.mean(numeric_only=True))

def fill_na_with_random_from_std(df):
    """Return a copy of ``df`` with NaNs filled by a random integer near the mean.

    For every column that has missing values, one integer is drawn with
    ``random.randint`` from ``[int(mean - std), int(mean + std)]`` and used for
    all missing cells of that column. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; columns without any observed value
        are returned unchanged.
    """
    def _fill(col):
        # Nothing to do, and no random draw is consumed, for complete columns.
        if not col.isna().any():
            return col
        center = col.mean()
        if pd.isna(center):
            # No observed value at all: there is no range to draw from.
            return col
        spread = col.std()
        if pd.isna(spread):
            # A single observed value has no std; collapse the range onto it.
            spread = 0
        low, high = int(center - spread), int(center + spread)
        return col.fillna(random.randint(low, high))

    # Build and return a new frame instead of relying on `fillna(inplace=True)`
    # inside `apply`, which only mutates a temporary under copy-on-write.
    return df.apply(_fill, axis=0)

In [None]:
#  For Binary columns

def fill_na_with_a_value(df, val):
    """Return a copy of ``df`` with every missing value replaced by ``val``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.
    val : scalar
        Replacement used for every missing cell.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # The previous version discarded the filled result and returned the
    # untouched input.
    return df.fillna(val)

def fill_na_randomly(df):
    """Return a copy of ``df`` with NaNs filled by a randomly chosen observed value.

    For every column that has missing values, one of its distinct observed
    values is picked with ``random.choice`` and used for all missing cells of
    that column. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of categorical columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; columns with no observed value are
        returned unchanged.
    """
    def _fill(col):
        observed = col.dropna().unique().tolist()
        # `random.choice` raises on an empty list, so skip all-missing columns;
        # complete columns need no draw either.
        if not observed or not col.isna().any():
            return col
        return col.fillna(random.choice(observed))

    # Return the filled frame rather than depending on `fillna(inplace=True)`
    # inside `apply`, which only mutates a temporary under copy-on-write.
    return df.apply(_fill, axis=0)

In [None]:
# Seed the stdlib RNG first: the imputation helpers draw from `random`, and
# without a seed every run of the notebook produces different training data.
random.seed(42)
train[floats] = fill_na_with_random_from_std(train[floats])

In [None]:
# Summary statistics again, now that the `floats` columns have been imputed.
train.describe()

In [None]:
# Impute the categorical columns, then show the per-column missing counts
# that remain.
train[binaries] = fill_na_randomly(train[binaries])
train.isnull().sum()

In [None]:
# One-hot encode the non-numeric columns; numeric columns pass through as-is.
train = pd.get_dummies(train)

In [None]:
# Column names after one-hot encoding.
train.columns

In [None]:
import optuna

from xgboost import XGBClassifier

from sklearn.metrics import f1_score, log_loss
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Target is the integer-cast 'Stage' column; the feature matrix is every
# other column except 'ID'.
y = train['Stage'].astype(int)
X = train.drop(columns=['Stage', 'ID'])

In [None]:
# Shift the labels down by one (presumably so classes start at 0 for the
# classifiers below). Rebuilt from `train` instead of `y -= 1` so that
# re-running this cell cannot shift the labels a second time.
y = train.Stage.astype(int) - 1

In [None]:
# Number of training rows whose (shifted) label is 0.
int((y == 0).sum())

In [None]:
# dtypes of the feature matrix that is handed to the models.
X.dtypes

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: validation multi-class log loss of an XGBoost model.

    Splits ``data``/``target`` 80/20 (stratified, fixed seed), fits an
    ``XGBClassifier`` with hyper-parameters sampled from ``trial`` using early
    stopping on the validation part, and scores the validation probabilities.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix (defaults to the notebook-level ``X``).
    target : pandas.Series
        Class labels aligned with ``data`` (defaults to the notebook-level ``y``).

    Returns
    -------
    float
        Log loss on the validation split (lower is better).
    """
    # Use the arguments, not the globals, so the function honours its signature.
    # Stratifying keeps the class mix of both parts comparable.
    X_train, X_val, y_train, y_val = train_test_split(
        data, target, train_size=0.8, random_state=0, stratify=target)

    # Derived from the labels instead of a hard-coded count.
    class_labels = np.unique(target)

    params = {
        'num_class': len(class_labels),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.02, 0.05, 0.08, 0.1]),
        # Upper bound only; early stopping below picks the effective number of trees.
        'n_estimators': 4000,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 1e-5, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }

    xgb_model = XGBClassifier(**params)

    xgb_model.fit(X_train, y_train,
                 early_stopping_rounds = 200,
                 eval_set=[(X_val, y_val)],
                 verbose=False)

    preds = xgb_model.predict_proba(X_val)

    # Passing the label set keeps log_loss well-defined even when a class
    # happens to be absent from the validation part.
    return log_loss(y_val, preds, labels=class_labels)

In [None]:
%%time

# Seed the sampler so repeated runs propose the same sequence of
# hyper-parameters (given the same objective values); otherwise the best
# parameters cannot be regenerated from this notebook.
study = optuna.create_study(direction='minimize', study_name="xgbclassifier",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30)

In [None]:
# Hyper-parameters sampled by the best trial. Only values drawn through
# `trial.suggest_*` are recorded here; fixed settings from `objective`
# (e.g. n_estimators) are not part of this dict.
best_params = study.best_trial.params
best_params

In [None]:
# Objective value (validation log loss) per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)


In [None]:
# Estimated importance of each sampled hyper-parameter for the objective.
optuna.visualization.plot_param_importances(study)


In [None]:
# Re-attach the fixed (non-sampled) booster settings to the tuned parameters.
best_params.update({
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'use_label_encoder': False,
})
best_params

`{'max_depth': 4,
 'learning_rate': 0.05,
 'min_child_weight': 211,
 'gamma': 0.00012407542829713598,
 'alpha': 1.9866385273100282,
 'lambda': 1.866714249216817,
 'colsample_bytree': 0.2971450573343493,
 'subsample': 0.1626108344573282,
 'tree_method': 'gpu_hist',
 'booster': 'gbtree',
 'eval_metric': 'mlogloss',
 'random_state': 42,
 'use_label_encoder': False}`

In [None]:
# Hard-coded parameter set. This assignment replaces whatever `best_params`
# the study cells above produced, so the models below always use these
# values. NOTE(review): presumably pasted from an earlier study run - confirm
# before relying on a fresh study.
best_params = {'max_depth': 4,
 'learning_rate': 0.05,
 'min_child_weight': 211,
 'gamma': 0.00012407542829713598,
 'alpha': 1.9866385273100282,
 'lambda': 1.866714249216817,
 'colsample_bytree': 0.2971450573343493,
 'subsample': 0.1626108344573282,
 'tree_method': 'gpu_hist',
 'booster': 'gbtree',
 'eval_metric': 'mlogloss',
 'random_state': 42,
 'use_label_encoder': False}

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, shuffle=True)

In [None]:
preds = np.zeros((test.shape[0], 4))
lls = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    
    print("Fold {}:".format(fold))
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = XGBClassifier(**best_params)
    
    model.fit(X_train, y_train,
             early_stopping_rounds=200,
             eval_set=[(X_val, y_val)],
             verbose=False)
    ll = log_loss(y_val, model.predict_proba(X_val))
    print("LogLoss: {}".format(ll))
    lls.append(ll)
    
    preds += model.predict_proba(test)
    
preds /= skf.n_splits

In [None]:
preds

In [None]:
y_train

In [None]:
rfc_model = RandomForestClassifier(n_estimators=4000,
                                  max_depth=3)
rfc_model.fit(X_train, y_train)

In [None]:
model = XGBClassifier(learning_rate=0.05,
                            max_depth = 3, 
                            n_estimators = 4000)
model.fit(X_train, y_train,
             eval_set=[(X_val, y_val)],
             verbose=False)

In [None]:
f1_score(y_val, model.predict(X_val), average='weighted')

In [None]:
f1_score(y_val, rfc_model.predict(X_val), average='weighted')

In [None]:
rfc_model.predict(X_val)

In [None]:
test = pd.read_csv('../input/no-title/test_dataset.csv')

In [None]:
test.head()

In [None]:
test.describe()

In [None]:
# floats.remove('Stage')
test[floats] = fill_na_with_random_from_std(test[floats])

In [None]:
test[binaries] = fill_na_randomly(test[binaries])

In [None]:
test.isnull().sum()

In [None]:
test.drop(['ID'], axis=1, inplace=True)

In [None]:
test = pd.get_dummies(test)

In [None]:
print(test)

In [None]:
predictions = model.predict(test)
predictions

In [None]:
model.predict_proba(test)

In [None]:
predictions += 1

In [None]:
set(predictions)

In [None]:
res = pd.DataFrame(predictions)
res.index = test.index
res.columns = ["Stage"]
res.to_csv("prediction_results.csv", index = False)

In [None]:
res