In [None]:
import pandas as pd
import numpy as np
import gc
import eli5

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from ml_metrics import rmsle

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the train and test splits (in that order) and stack them into a single
# frame so the feature engineering below is applied to both at once.
train, test = (
    pd.read_hdf(path)
    for path in ('../input/diabetic_train.h5', '../input/diabetic_test.h5')
)
df_all = pd.concat([train, test])

# The separate splits are not needed once combined; drop the references and
# ask the garbage collector to reclaim them.
del train, test
gc.collect()

## Opis
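Kluczem do poprawy mojego wyniku byla agregacja zmiennych numerycznych dla patient number.
W zbiorze danych patient number nie byl unikalnym kodem (ten sam pacjent wystepuje w wielu wierszach), dlatego statystyki liczone per pacjent wnosza dodatkowa informacje.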

Ponizej znajduje sie zestaw zagregowanych danych, ktory dal moj najlepszy wynik.
Zauwazylem, ze model czasami uzywa mean, a czasami median, wiec postanowilem pozostawic obie funkcje agregacji.

## Feature Engineering

In [None]:
## Extra interaction features used by the model
df_all['imp_diag'] = df_all['number_inpatient'] * df_all['number_diagnoses']
df_all['imp_emer'] = df_all['number_inpatient'] * df_all['number_emergency']

## Per-patient aggregates: for every listed column, the mean and the median over
## all rows sharing the same patient_nbr are attached to each row.
## Each entry is (source column, name of the mean feature, name of the median feature).
## The output names are kept exactly as they were, including the shortened
## `_mea` / `_med` suffixes.
patient_aggregates = [
    ('imp_diag', 'patient_imp_diag_mean', 'patient_imp_diag_median'),
    ('imp_emer', 'patient_imp_emer_mean', 'patient_imp_emer_median'),
    ('time_in_hospital', 'patient_nbrtime_mean', 'patient_nbrtime_median'),
    ('num_lab_procedures', 'patient_nbrnum_lab_procedures_mea', 'patient_nbrnum_lab_procedures_med'),
    ('num_procedures', 'patient_nbrnum_procedures_mea', 'patient_nbrnum_procedures_med'),
    ('num_medications', 'patient_nbrnum_medications_mea', 'patient_nbrnum_medications_med'),
    ('number_outpatient', 'patient_nbrnumber_outpatient_mea', 'patient_nbrnumber_outpatient_med'),
    ('number_emergency', 'patient_nbrnumber_emergency_mea', 'patient_nbrnumber_emergency_med'),
    ('number_inpatient', 'patient_nbrnumber_inpatient_mea', 'patient_nbrnumber_inpatient_med'),
    ('number_diagnoses', 'patient_number_diagnoses_mea', 'patient_number_diagnoses_med'),
]

for source_col, mean_name, median_name in patient_aggregates:
    # The aggregations are requested by name ('mean', 'median') rather than by
    # passing np.mean / np.median, which newer pandas versions deprecate.
    # Renaming before the merge means df_all never carries generic
    # 'mean' / 'median' columns that a later merge could collide with.
    patient_stats = (
        df_all.groupby('patient_nbr')[source_col]
        .agg(['mean', 'median'])
        .rename(columns={'mean': mean_name, 'median': median_name})
    )
    # patient_stats is indexed by patient_nbr; the left join keeps every row of df_all.
    df_all = pd.merge(df_all, patient_stats, how='left', on='patient_nbr')

### Additional per-patient aggregates for the id-like columns
## Fixes two problems of the previous copy-pasted version:
##  * three of the four renames were assigned to `f_all` instead of `df_all`,
##    so the generic 'mean' / 'median' columns stayed in df_all and the following
##    merges collided on them (suffixed `mean_x` / `mean_y` duplicates, which
##    newer pandas versions refuse to create);
##  * two median columns were labelled `..._mean`.
## Every source column now yields `<column>_pat_mea` and `<column>_pat_med`.
## NOTE(review): time_in_hospital is also aggregated earlier in this cell as
## patient_nbrtime_mean / patient_nbrtime_median, so those two features are
## duplicated here; kept because the original feature set contained them.
id_aggregate_columns = [
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'time_in_hospital',
]

for source_col in id_aggregate_columns:
    patient_stats = df_all.groupby('patient_nbr')[source_col].agg(['mean', 'median'])
    # Name the columns before merging so nothing generic ever lands in df_all.
    patient_stats.columns = [
        '{0}_pat_mea'.format(source_col),
        '{0}_pat_med'.format(source_col),
    ]
    df_all = pd.merge(df_all, patient_stats, how='left', on='patient_nbr')

In [None]:
## Factorize the non-numeric columns: each one gets an integer-coded `<name>_cat` twin.
## The dtype is selected with 'object' instead of `np.object`; that alias was
## deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
## `readmitted` is the target, so it is excluded and never becomes a feature.
cat_feats = df_all.select_dtypes(include=['object']).drop(columns='readmitted').columns
for cat_feat in cat_feats:
    # pd.factorize returns (codes, uniques); only the integer codes are stored.
    df_all['{0}_cat'.format(cat_feat)] = pd.factorize(df_all[cat_feat])[0]

## Model

In [None]:
## Standard setup: every numeric column is a candidate feature, except identifiers.
feats = df_all.select_dtypes(include=np.number).columns
black_list = ['id', 'encounter_id']
feats = [feat for feat in feats if feat not in black_list]

## Rows with an empty `readmitted` form the test set, the rest is training data.
## .copy() makes both frames independent of df_all, so assigning columns to them
## is well defined and does not trigger SettingWithCopyWarning.
train = df_all[df_all.readmitted != ''].copy()
test = df_all[df_all.readmitted == ''].copy()
X_test = test[feats].values
train['readmitted'] = train['readmitted'].values.astype(np.int8)

X_train = train[feats].values
y_train = train['readmitted'].values

## Replace negative values with their absolute value; chi2 in SelectKBest
## requires non-negative features.
## NOTE(review): pd.factorize encodes missing values as -1, which this maps to 1,
## the same code as the second category. Confirm whether the `_cat` columns
## contain missing values.
X_train = np.absolute(X_train)
X_test = np.absolute(X_test)

## Keep the 50 "best" features according to the chi2 score. The number 50 was
## chosen deliberately: with more features the model did not improve.
## chi2: chi-squared stats of non-negative features for classification tasks
## (sklearn documentation).
## NOTE(review): the selector is fitted on the whole training set before the
## cross-validation split, so the CV score may be slightly optimistic.
best_features = SelectKBest(score_func=chi2, k=50)
fit = best_features.fit(X_train, y_train)
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

## Hyperparameters reported by best_params after 1000 max_evals.
## The best trial was "best loss: 1.2763671875".
xgb_params = dict(
    n_estimators=300,
    max_depth=5,
    min_child_weight=10,
    random_state=4400,
    learning_rate=0.06205850583029093,
    colsample_bytree=0.8465094069349387,
    subsample=0.9406416711919477,
    reg_alpha=2.374486413700263,
    reg_lambda=0.9544047687147013,
)
model = xgb.XGBClassifier(**xgb_params)

## Standard stratified 4-fold cross-validation; each fold is scored with rmsle.
cv = StratifiedKFold(n_splits=4)
scores = []
for train_idx, test_idx in cv.split(X_train, y_train):
    X_fold_train, X_fold_test = X_train[train_idx], X_train[test_idx]
    y_fold_train, y_fold_test = y_train[train_idx], y_train[test_idx]

    model.fit(X_fold_train, y_fold_train)
    y_pred = model.predict(X_fold_test)

    score = rmsle(y_fold_test, y_pred)
    scores.append( score )

print('score:' ,np.mean(scores), 'std: ', np.std(scores))

## After the loop `model` holds the fit from the last fold only, i.e. it has seen
## 3/4 of the training rows. Refit on the full training set so that whatever
## uses `model` afterwards predicts with a model trained on all labelled data.
model.fit(X_train, y_train)

## Save results

In [None]:
# Predict the test set and save the submission file (id + predicted readmitted).
# `test` is derived from df_all by boolean filtering; assign() returns a new
# frame with the `readmitted` column replaced, instead of writing into that
# selection, which avoids SettingWithCopyWarning.
test = test.assign(readmitted=model.predict(X_test))
test[['id', 'readmitted']].to_csv('../output/xz9.csv', index=False)