In [1]:
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math
import datetime
from datetime import datetime,date

from sklearn.preprocessing import Binarizer,LabelEncoder
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor,VotingRegressor
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the five CDM tables from ../input/cdm-data, in the same order as before.
person, visit_occurrence, condition_occurrence, drug_exposure, death = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/cdm-data/person.csv',
        '../input/cdm-data/visit_occurrence.csv',
        '../input/cdm-data/condition_occurrence.csv',
        '../input/cdm-data/drug_exposure.csv',
        '../input/cdm-data/death.csv',
    )
)


In [3]:
# Print (rows, columns) for each loaded table; the order matches the original output.
for table in (person, visit_occurrence, condition_occurrence, death, drug_exposure):
    print(table.shape)

In [4]:
# Keep the first three columns of condition_occurrence and append the
# visit_occurrence_id column, which is the key used to join onto visits.
visit_occurrence_id = condition_occurrence['visit_occurrence_id']
condition = pd.concat(
    [condition_occurrence.iloc[:, :3], visit_occurrence_id],
    axis=1,
)
condition

In [5]:
# Keep only visits whose visit_concept_id equals 9201, then the first six columns.
# NOTE(review): 9201 is presumably the OMOP "Inpatient Visit" concept — confirm.
is_selected_visit = visit_occurrence['visit_concept_id'] == 9201
visit_enter = visit_occurrence.loc[is_selected_visit].iloc[:, :6]

In [6]:
# Display the filtered visits table.
visit_enter

In [7]:
# Inner-join visits with conditions on visit_occurrence_id (used as the index
# on both sides, so the key ends up as the index of the result).
visits_by_id = visit_enter.set_index('visit_occurrence_id')
conditions_by_id = condition.set_index('visit_occurrence_id')
df = visits_by_id.merge(conditions_by_id, how='inner',
                        left_index=True, right_index=True)
df

In [8]:
# Move visit_occurrence_id back to a column, keep the first row per visit,
# and renumber the rows 0..n-1.
df = (
    df.reset_index()
      .drop_duplicates(subset=['visit_occurrence_id'])
      .reset_index(drop=True)
)
df

In [9]:
# Target: length of stay in whole days for every visit row.
# Both date columns are parsed in one vectorized pass (same strict '%Y-%m-%d'
# format as before) instead of calling datetime.strptime row by row.
visit_start = pd.to_datetime(df['visit_start_date'], format='%Y-%m-%d')
visit_end = pd.to_datetime(df['visit_end_date'], format='%Y-%m-%d')
# abs() keeps the duration non-negative even if start and end are swapped.
y = (visit_start - visit_end).abs().dt.days.tolist()
len(y)

In [10]:
# Attach the length-of-stay values as the target column `y`.
df = df.assign(y=y)
df

In [11]:
# Display the raw drug_exposure table.
drug_exposure

In [12]:
# Keep the first three columns of drug_exposure and append the
# visit_occurrence_id column, which is the key used to join onto visits.
visit = drug_exposure['visit_occurrence_id']
drug = pd.concat(
    [drug_exposure.iloc[:, :3], visit],
    axis=1,
)
drug

In [13]:
# Inner-join the visit/condition frame with drug exposures on visit_occurrence_id.
visit_rows_by_id = df.set_index('visit_occurrence_id')
drugs_by_id = drug.set_index('visit_occurrence_id')
df = visit_rows_by_id.merge(drugs_by_id, how='inner',
                            left_index=True, right_index=True)

In [14]:
# Display the frame after joining drug exposures.
df

In [15]:
# Move visit_occurrence_id back to a column and keep the first row per visit.
with_visit_column = df.reset_index()
df = with_visit_column.drop_duplicates(subset=['visit_occurrence_id'])
df

In [16]:
# Only the first two columns of the death table are used from here on
# (person_id and death_date are the ones referenced later in the notebook).
death = death.iloc[:, :2]

In [17]:
# Outer-join visits with death records on person_id (index on both sides);
# people without a death record get NaN in the death columns.
visits_by_person = df.set_index('person_id')
deaths_by_person = death.set_index('person_id')
final_df = visits_by_person.merge(deaths_by_person, how='outer',
                                  left_index=True, right_index=True)

In [18]:
# Display the frame after joining death records.
final_df

In [20]:
# Binary death indicator: 1 when a death_date is recorded, otherwise 0.
# The flag comes straight from the null mask (vectorized) instead of filling an
# attribute-accessed column in place and looping over it; whether such an
# in-place fill reaches the parent frame depends on pandas' copy semantics.
final_df['death'] = final_df['death_date'].notna().astype(int)
# Write the filled column back explicitly so the dropna() in the next cell does
# not discard rows merely because there is no death record.
death_num = final_df['death_date'].fillna(0)
final_df['death_date'] = death_num
final_df

In [21]:
# Discard every row that still has a missing value in any column.
final_df = final_df.dropna()

In [22]:
# Display the frame after rows with missing values were dropped.
final_df

In [23]:
# Remove the person columns that are not used in the rest of the notebook.
unused_person_columns = [
    'gender_concept_id', 'month_of_birth', 'birth_datetime', 'race_concept_id',
    'day_of_birth', 'ethnicity_concept_id', 'location_id', 'provider_id',
    'care_site_id', 'person_source_value', 'gender_source_concept_id',
    'race_source_value', 'race_source_concept_id', 'ethnicity_source_concept_id',
]
person = person.drop(columns=unused_person_columns)

In [24]:
# Remove identifier and date columns: the dates are already summarized by `y`
# and death_date by the `death` flag.
columns_to_remove = [
    'death_date', 'person_id_y', 'visit_end_date', 'visit_start_date',
    'visit_concept_id', 'person_id_x', 'visit_occurrence_id',
]
final_df = final_df.drop(columns=columns_to_remove)

In [71]:
# Outer-join the remaining person attributes by person_id (index on both
# sides), then keep only rows with no missing values.
person_by_id = person.set_index('person_id')
ex = final_df.merge(person_by_id, how='outer',
                    left_index=True, right_index=True).dropna()
ex

In [72]:
# Adopt the person-enriched frame as the working dataset.
final_df = ex

In [73]:
# Display only: reset_index() returns a new frame and the result is not
# assigned, so final_df itself keeps its current index.
final_df.reset_index()

In [74]:
# Plot histograms of final_df's columns on a 15x10 figure.
final_df.hist(figsize=(15,10))

In [29]:
# Print column names, dtypes and non-null counts.
final_df.info()

In [30]:
# Summary statistics of final_df.
final_df.describe()

In [31]:
# Replace each categorical column with integer codes. One encoder object is
# re-fitted per column, so after this cell `label` only holds the mapping of
# the last column (gender_source_value).
label = LabelEncoder()
for column in ('condition_concept_id', 'drug_concept_id', 'gender_source_value'):
    final_df[column] = label.fit_transform(final_df[column])

In [32]:
# Heatmap of the (up to) 10 columns most correlated with the target `y`.
# Only numeric columns are correlated: the frame may still hold non-numeric
# columns at this point (e.g. visit_start_datetime is only dropped in a later
# cell), and DataFrame.corr() raises on those in pandas >= 2.0.
corr_data = final_df.select_dtypes(include='number')
color_map = plt.cm.PuBu
cols = corr_data.corr().nlargest(10, 'y')['y'].index
# Correlation matrix restricted to the selected columns, as a plain ndarray.
cm = np.corrcoef(corr_data[cols].values.T)
f, ax = plt.subplots(figsize=(15, 10))
heatmap = sns.heatmap(cm, vmax=1, linewidths=0.1, square=True, annot=True,
                      cmap=color_map, linecolor="white",
                      xticklabels=cols.values, yticklabels=cols.values, ax=ax)

In [33]:
# Remove the remaining identifier/text columns that are not used as features.
non_feature_columns = ['ethnicity_source_value', 'condition_occurrence_id',
                       'drug_exposure_id', 'visit_start_datetime']
final_df = final_df.drop(columns=non_feature_columns)
final_df

In [35]:
# Hold out the last 20% of rows as the test set and train on the rows before it.
# The training slice used to start at the 20% mark and run to the end, so it
# contained every test row and leaked the test targets into training.
n_test = math.ceil(len(final_df) * 0.2)
split_at = len(final_df) - n_test
train, test = final_df.iloc[:split_at], final_df.iloc[split_at:]
print(train.shape,test.shape)

In [36]:
# Separate the training target `y` from the feature columns.
y_train = train['y']
x_train = train.drop(columns='y')
print(x_train.shape,y_train.shape)

In [37]:
# Separate the test target `y` from the feature columns.
y_test = test['y']
x_test = test.drop(columns='y')
print(x_test.shape,y_test.shape)

In [38]:
# 10-fold shuffled CV splitter shared by the fold loops below. The shuffle is
# seeded so fold membership, and therefore every per-fold score, is
# reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

In [39]:
def MAPELoss(output,target):
    """Return the mean absolute percentage error of ``output`` against ``target``.

    Parameters
    ----------
    output : array-like
        Predicted values.
    target : array-like
        True values, same shape as ``output``.

    Returns
    -------
    float
        ``mean(|target - output| / |target|)`` as a fraction, not a percent.
    """
    # Accept lists and Series as well as ndarrays.
    output = np.asarray(output, dtype=float)
    target = np.asarray(target, dtype=float)
    # A zero target would otherwise divide by zero and turn the whole mean into
    # inf/nan. Clamping the denominator to machine epsilon leaves non-zero
    # targets unchanged and keeps the result finite.
    denominator = np.maximum(np.abs(target), np.finfo(np.float64).eps)
    return np.mean(np.abs(target - output) / denominator)

In [41]:
# Baseline: an averaging ensemble of four regressors, all with default
# hyperparameters, fitted on the training arrays and scored on the test set.
Cat, XGB = CatBoostRegressor(), XGBRegressor()
LGB, RF = LGBMRegressor(), RandomForestRegressor()

named_estimators = [
    ('catboost', Cat),
    ('xgboost', XGB),
    ('LGB', LGB),
    ('RandomForest', RF),
]
voting = VotingRegressor(estimators=named_estimators)
voting_model = voting.fit(x_train.values, y_train.values)
pred = voting_model.predict(x_test.values)

In [42]:
# Report the test-set MAPE of the voting ensemble.
score = MAPELoss(pred, y_test.values)
print('voting', 'Test set result MAPE:{:.3f}'.format(score), sep='\n')

In [52]:
# NOTE(review): plot_result is defined in a later cell of this file, so on a
# fresh kernel with Run All this call raises NameError; the defining cell must
# run first. Its legend also labels the curve 'RandomForest_pred' although
# `pred` holds the voting-ensemble predictions here.
plot_result(pred)

In [43]:
def RF_model(n_estimators,n_jobs):
    """Fit one random forest per fold of ``kf`` and return the fitted models.

    For every fold the forest is trained on the fold's training rows of the
    notebook-level ``x_train`` / ``y_train`` and its MAPE on the held-out rows
    is printed.

    Parameters
    ----------
    n_estimators : int
        Number of trees in each forest.
    n_jobs : int
        Parallel jobs passed to ``RandomForestRegressor``.

    Returns
    -------
    list
        The fitted ``RandomForestRegressor`` of each fold, in fold order.
    """
    features = x_train.values
    targets = y_train.values
    fitted = []
    for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(x_train, y_train), start=1):
        print(f'Fold:{fold_no}')
        forest = RandomForestRegressor(n_estimators=n_estimators, n_jobs=n_jobs, random_state=0)
        forest.fit(features[fit_rows], targets[fit_rows])
        fold_mape = MAPELoss(forest.predict(features[holdout_rows]), targets[holdout_rows])
        print(f'MAPE:{fold_mape}')
        fitted.append(forest)
    return fitted
    

In [44]:
# Cross-validated forests with 300 and then 1000 trees, 8 parallel jobs each.
estimator_300, estimator_1000 = (RF_model(n_trees, 8) for n_trees in (300, 1000))

In [45]:
def predict(models, features=None):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : iterable
        Non-empty collection of fitted estimators exposing ``predict``.
    features : array-like or DataFrame, optional
        Rows to predict on. Defaults to the notebook-level ``x_test``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the individual model predictions.

    Raises
    ------
    ValueError
        If ``models`` is empty (the mean would otherwise silently be NaN).
    """
    models = list(models)
    if not models:
        raise ValueError('predict() needs at least one fitted model')
    if features is None:
        features = x_test
    # The fold models in this notebook are fitted on plain arrays
    # (``x_train.values``), so hand them a plain array too, not a DataFrame.
    features = getattr(features, 'values', features)
    return np.mean([model.predict(features) for model in models], axis=0)

In [46]:
# Fold-averaged test predictions for the 300-tree and the 1000-tree forests.
estimator_300_result, estimator_1000_result = (
    predict(fold_models) for fold_models in (estimator_300, estimator_1000)
)

In [47]:
def plot_result(pred, label='RandomForest_pred'):
    """Plot predictions against the true test targets on one figure.

    Parameters
    ----------
    pred : array-like
        Predicted values, in the same row order as the notebook-level ``y_test``.
    label : str, optional
        Legend label for the prediction curve. The default keeps the previous
        hard-coded label; pass another name when plotting a different model.

    Returns
    -------
    None
        The value of ``plt.show()``.
    """
    plt.figure(figsize=(20,10))
    plt.plot(pred,label=label)
    plt.plot(y_test.values,label='real_hospitalization')
    plt.title('Compare real_hospitalization with prediction')
    plt.legend()
    plt.grid()
    return plt.show()

In [48]:
# One figure per forest size: 300 trees first, then 1000 trees.
for averaged_pred in (estimator_300_result, estimator_1000_result):
    plot_result(averaged_pred)

In [49]:
# Report the test-set MAPE of each forest size.
for heading, averaged_pred in (('n_estimator:300', estimator_300_result),
                               ('n_estimator:1000', estimator_1000_result)):
    print(heading)
    score = MAPELoss(averaged_pred, y_test.values)
    print('Test set result MAPE:{:.3f}'.format(score))

In [67]:
# Candidate values for the randomized LightGBM hyperparameter search.
# linspace gives 10 evenly spaced values; astype(int) truncates them like int().
n_estimators = np.linspace(1000, 10000, num=10).astype(int).tolist()
max_depth = np.linspace(10, 110, num=10).astype(int).tolist()
num_leaves = [10, 20, 30, 40]
learning_rate = [1e-1, 1e-2, 1e-3, 1e-4]

random_grid_LGB = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    num_leaves=num_leaves,
    learning_rate=learning_rate,
)

In [75]:
# Randomized search over random_grid_LGB: 100 sampled settings, 5-fold CV each,
# run on all available cores with a fixed seed for the sampling.
LGB = LGBMRegressor(objective='regression')
search_options = dict(param_distributions=random_grid_LGB, n_iter=100, cv=5,
                      verbose=2, random_state=0, n_jobs=-1)
LGB_random = RandomizedSearchCV(estimator=LGB, **search_options)
LGB_random.fit(x_train.values, y_train.values)

In [76]:
# Show the best hyperparameter combination found by the randomized search.
LGB_random.best_params_

In [77]:
# Train one LightGBM regressor per fold of `kf` with fixed hyperparameters,
# monitoring MAPE on the fold's held-out rows.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are only
# accepted by older LightGBM releases — confirm the installed version.
models = []
fold_features, fold_targets = x_train.values, y_train.values
lgb_params = dict(objective='regression', max_depth=65, n_estimators=2000,
                  learning_rate=0.001, num_leaves=20)
for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(x_train, y_train), start=1):
    print(f'Fold:{fold_no}')
    model = LGBMRegressor(**lgb_params)
    model.fit(fold_features[fit_rows], fold_targets[fit_rows],
              eval_set=[(fold_features[holdout_rows], fold_targets[holdout_rows])],
              eval_metric=['mape'],
              early_stopping_rounds=1000,
              verbose=500)
    models.append(model)
    
    

In [78]:
# Fold-averaged LightGBM test predictions. This reuses the predict() helper
# defined earlier in the notebook instead of repeating its loop-and-average
# logic for a second model family.
LGB_pred = predict(models)

In [80]:
# Plot the averaged LightGBM predictions against the true test targets.
# NOTE: when called with a single argument, plot_result labels the prediction
# curve 'RandomForest_pred', although these are LightGBM predictions.
plot_result(LGB_pred)

In [81]:
# Report the test-set MAPE of the fold-averaged LightGBM predictions.
score = MAPELoss(LGB_pred, y_test.values)
print('LGB best_params', 'Test set result MAPE:{:.3f}'.format(score), sep='\n')