In [1]:
import pandas_profiling
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter(action = "ignore") 


In [2]:
df = pd.read_csv('diabetic_data.csv')

In [3]:
df.shape

(101766, 50)

In [4]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
df = df.drop(['encounter_id','weight', 'patient_nbr', 'race', 'admission_type_id', 'time_in_hospital', 'num_procedures', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'number_outpatient', 'number_emergency'
, 'number_inpatient', 'number_diagnoses', 'change', 'readmitted', 'examide', 'citoglipton'], axis=1)

In [6]:
df.columns

Index(['gender', 'age', 'num_lab_procedures', 'num_medications', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'diabetesMed'],
      dtype='object')

In [7]:
df.dtypes

gender                       object
age                          object
num_lab_procedures            int64
num_medications               int64
diag_1                      float64
diag_2                      float64
diag_3                      float64
max_glu_serum                object
A1Cresult                    object
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide               object
glimepiride                  object
acetohexamide                object
glipizide                    object
glyburide                    object
tolbutamide                  object
pioglitazone                 object
rosiglitazone                object
acarbose                     object
miglitol                     object
troglitazone                 object
tolazamide                   object
insulin                      object
glyburide_metformin          object
glipizide_metformin          object
glimepiride_pioglitazone    

In [8]:
df.isnull().sum()

gender                         3
age                            0
num_lab_procedures             0
num_medications                0
diag_1                      1666
diag_2                      2894
diag_3                      6481
max_glu_serum                  0
A1Cresult                      0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazone                   0
rosiglitazone                  0
acarbose                       0
miglitol                       0
troglitazone                   0
tolazamide                     0
insulin                        0
glyburide_metformin            0
glipizide_metformin            0
glimepiride_pioglitazone       0
metformin_rosiglitazone        0
metformin_pioglitazone         0
diabetesMe

In [9]:
df['age'] = df['age'].replace(to_replace= {
'[0-10)': '5',
'[10-20)': '15',
'[20-30)': '25',
'[30-40)': '35',
'[40-50)': '45',
'[50-60)': '55',
'[60-70)': '65',
'[70-80)': '75',
'[80-90)': '85',
'[90-100)': '95'
})

In [10]:
df['age']=df['age'].astype(int)

In [11]:
df['diag_1'] = df['diag_1'].fillna(df['diag_1'].mean())
df['diag_2'] = df['diag_2'].fillna(df['diag_2'].mean())
df['diag_3'] = df['diag_3'].fillna(df['diag_3'].mean())

In [12]:
df['race'] = df['gender'].dropna()

In [13]:
df['diabetesMed'] = df['diabetesMed'].apply(lambda x: 1 if x == 'Yes' else 0)
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Female' else 0)

In [14]:
df['race'] = df['gender'].dropna()

In [15]:
df.isnull().sum()

gender                      0
age                         0
num_lab_procedures          0
num_medications             0
diag_1                      0
diag_2                      0
diag_3                      0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
insulin                     0
glyburide_metformin         0
glipizide_metformin         0
glimepiride_pioglitazone    0
metformin_rosiglitazone     0
metformin_pioglitazone      0
diabetesMed                 0
race                        0
dtype: int64

In [16]:
pandas_profiling.ProfileReport(df)

Summarize dataset: 100%|██████████| 81/81 [04:17<00:00,  3.18s/it, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [01:07<00:00, 67.99s/it]
Render HTML: 100%|██████████| 1/1 [00:21<00:00, 21.46s/it]




In [17]:
df.columns

Index(['gender', 'age', 'num_lab_procedures', 'num_medications', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'diabetesMed', 'race'],
      dtype='object')

In [18]:
a = pd.get_dummies(df.max_glu_serum, prefix='max_glu_serum')
b = pd.get_dummies(df.A1Cresult, prefix='A1Cresult')
c = pd.get_dummies(df.metformin, prefix='metformin')
d = pd.get_dummies(df.repaglinide, prefix='repaglinide')
e = pd.get_dummies(df.nateglinide, prefix='nateglinide')
f = pd.get_dummies(df.chlorpropamide, prefix='chlorpropamide')
g = pd.get_dummies(df.glimepiride, prefix='glimepiride')
h = pd.get_dummies(df.acetohexamide, prefix='acetohexamide')
i = pd.get_dummies(df.glipizide, prefix='glipizide')
j = pd.get_dummies(df.glyburide, prefix='glyburide')
k = pd.get_dummies(df.tolbutamide, prefix='tolbutamide')
l = pd.get_dummies(df.pioglitazone, prefix='pioglitazone')
m = pd.get_dummies(df.rosiglitazone, prefix='rosiglitazone')
n = pd.get_dummies(df.acarbose, prefix='acarbose')
o = pd.get_dummies(df.miglitol, prefix='miglitol')
p = pd.get_dummies(df.troglitazone, prefix='troglitazone')
q = pd.get_dummies(df.tolazamide, prefix='tolazamide')
r = pd.get_dummies(df.insulin, prefix='insulin')
s = pd.get_dummies(df.glyburide_metformin, prefix='glyburide_metformin')
t = pd.get_dummies(df.glipizide_metformin, prefix='glipizide_metformin')
u = pd.get_dummies(df.glimepiride_pioglitazone, prefix='glimepiride_pioglitazone')
v = pd.get_dummies(df.metformin_rosiglitazone, prefix='metformin_rosiglitazone')
w = pd.get_dummies(df.metformin_pioglitazone, prefix='metformin_pioglitazone')

In [19]:
one_hot=pd.concat([a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w], axis=1)

In [20]:
one_hot

Unnamed: 0,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_Down,metformin_No,...,glyburide_metformin_Steady,glyburide_metformin_Up,glipizide_metformin_No,glipizide_metformin_Steady,glimepiride_pioglitazone_No,glimepiride_pioglitazone_Steady,metformin_rosiglitazone_No,metformin_rosiglitazone_Steady,metformin_pioglitazone_No,metformin_pioglitazone_Steady
0,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
1,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
2,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
3,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
4,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0
101762,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0
101763,0,0,1,0,0,0,1,0,0,0,...,0,0,1,0,1,0,1,0,1,0
101764,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,1,0,1,0


In [21]:
df2= df.drop(['max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 
'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone', 'metformin_rosiglitazone', 
'metformin_pioglitazone'],axis=1)

In [22]:
df2

Unnamed: 0,gender,age,num_lab_procedures,num_medications,diag_1,diag_2,diag_3,diabetesMed,race
0,1,5,41,1,250.83,438.674883,418.179285,0,1
1,1,15,59,18,276.00,250.010000,255.000000,1,1
2,1,25,11,13,648.00,250.000000,418.179285,1,1
3,0,35,44,16,8.00,250.430000,403.000000,1,0
4,0,45,51,8,197.00,157.000000,250.000000,1,0
...,...,...,...,...,...,...,...,...,...
101761,0,75,51,16,250.13,291.000000,458.000000,1,0
101762,1,85,33,18,560.00,276.000000,787.000000,1,1
101763,0,75,53,9,38.00,590.000000,296.000000,1,0
101764,1,85,45,21,996.00,285.000000,998.000000,1,1


In [23]:
df_concat=pd.concat([df2,one_hot], axis=1)

In [24]:
df_concat

Unnamed: 0,gender,age,num_lab_procedures,num_medications,diag_1,diag_2,diag_3,diabetesMed,race,max_glu_serum_>200,...,glyburide_metformin_Steady,glyburide_metformin_Up,glipizide_metformin_No,glipizide_metformin_Steady,glimepiride_pioglitazone_No,glimepiride_pioglitazone_Steady,metformin_rosiglitazone_No,metformin_rosiglitazone_Steady,metformin_pioglitazone_No,metformin_pioglitazone_Steady
0,1,5,41,1,250.83,438.674883,418.179285,0,1,0,...,0,0,1,0,1,0,1,0,1,0
1,1,15,59,18,276.00,250.010000,255.000000,1,1,0,...,0,0,1,0,1,0,1,0,1,0
2,1,25,11,13,648.00,250.000000,418.179285,1,1,0,...,0,0,1,0,1,0,1,0,1,0
3,0,35,44,16,8.00,250.430000,403.000000,1,0,0,...,0,0,1,0,1,0,1,0,1,0
4,0,45,51,8,197.00,157.000000,250.000000,1,0,0,...,0,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0,75,51,16,250.13,291.000000,458.000000,1,0,0,...,0,0,1,0,1,0,1,0,1,0
101762,1,85,33,18,560.00,276.000000,787.000000,1,1,0,...,0,0,1,0,1,0,1,0,1,0
101763,0,75,53,9,38.00,590.000000,296.000000,1,0,0,...,0,0,1,0,1,0,1,0,1,0
101764,1,85,45,21,996.00,285.000000,998.000000,1,1,0,...,0,0,1,0,1,0,1,0,1,0


In [25]:
#Regresion Logistica
df2.columns

Index(['gender', 'age', 'num_lab_procedures', 'num_medications', 'diag_1',
       'diag_2', 'diag_3', 'diabetesMed', 'race'],
      dtype='object')

In [26]:
#Aqui elegiremos 2 campos: Porcentaje de personas de estatos bajos, y cantidad promedio de habitaciones por vivienda.
X = df2[['age', 'num_lab_procedures','num_medications', 'diag_1', 'diag_2', 'diag_3']]
Y = df2['diabetesMed']

# Dividamos la muestra en el subconjunto de entrenamiento y el subconjunto de prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(71236, 6)
(30530, 6)
(71236,)
(30530,)


In [27]:
lin_model = LinearRegression() #generar una instancia de regresion lineal (asignada a un objeto)
lin_model.fit(X_train, Y_train)

LinearRegression()

In [28]:
print(lin_model.coef_)
print(lin_model.intercept_)

[-7.26234543e-04 -4.39288848e-04  1.00668080e-02 -6.27499542e-05
 -3.93398716e-05 -4.94739997e-05]
0.7441941385445053


In [29]:
y_train_predict = lin_model.predict(X_train)
MSE = mean_squared_error(Y_train,y_train_predict)
print("Entrenamiento: MSE ="+str(MSE))

y_test_predict = lin_model.predict(X_test)
MSE = (mean_squared_error(Y_test, y_test_predict))
print("Pruebas: MSE ="+str(MSE))

Entrenamiento: MSE =0.17046415282221547
Pruebas: MSE =0.17037386698258428


In [30]:
df_predicciones = pd.DataFrame({'valor_real':Y_test, 'prediccion':y_test_predict})
df_predicciones = df_predicciones.reset_index(drop = True)
df_predicciones.head(10)

Unnamed: 0,valor_real,prediccion
0,0,0.738935
1,1,0.843732
2,0,0.713915
3,1,0.815835
4,1,0.706377
5,0,0.736898
6,1,0.762099
7,1,0.765994
8,1,0.79198
9,0,0.783531


In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [32]:

X = df2[['age', 'num_lab_procedures','num_medications', 'diag_1', 'diag_2', 'diag_3']]
Y = df2['diabetesMed']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=5)

In [33]:
logreg = LogisticRegression(random_state = 42)
logreg.fit(X_train, Y_train)

LogisticRegression(random_state=42)

In [34]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

X_axis = list(range(1, 31))
acc = pd.Series()
x = range(1,31)

for i in list(range(1, 31)):
    knn_model = KNeighborsClassifier(n_neighbors = i) 
    knn_model.fit(X_train, Y_train)
    prediction = knn_model.predict(X_test)
    acc = acc.append(pd.Series(metrics.accuracy_score(prediction, Y_test)))
plt.plot(X_axis, acc)
plt.xticks(x)
plt.title("Finding best value for n_estimators")
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.grid()
plt.show()
print('Highest value: ',acc.values.max())

Highest value:  0.7692761218473633


In [35]:
# K nearest neighbors Algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 24, metric = 'minkowski', p = 2)
knn.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=24)

In [36]:
# Support Vector Classifier Algorithm
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', random_state = 42)
svc.fit(X_train, Y_train)

SVC(kernel='linear', random_state=42)

In [37]:
# Naive Bayes Algorithm
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, Y_train)

GaussianNB()

In [38]:
# Decision tree Algorithm
from sklearn.tree import DecisionTreeClassifier
dectree = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
dectree.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [39]:
# Random forest Algorithm
from sklearn.ensemble import RandomForestClassifier
ranfor = RandomForestClassifier(n_estimators = 11, criterion = 'entropy', random_state = 42)
ranfor.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', n_estimators=11, random_state=42)

In [40]:
# Making predictions on test dataset
Y_pred_logreg = logreg.predict(X_test)
Y_pred_knn = knn.predict(X_test)
Y_pred_svc = svc.predict(X_test)
Y_pred_nb = nb.predict(X_test)
Y_pred_dectree = dectree.predict(X_test)
Y_pred_ranfor = ranfor.predict(X_test)

In [41]:
# Evaluating using accuracy_score metric
from sklearn.metrics import accuracy_score
accuracy_logreg = accuracy_score(Y_test, Y_pred_logreg)
accuracy_knn = accuracy_score(Y_test, Y_pred_knn)
accuracy_svc = accuracy_score(Y_test, Y_pred_svc)
accuracy_nb = accuracy_score(Y_test, Y_pred_nb)
accuracy_dectree = accuracy_score(Y_test, Y_pred_dectree)
accuracy_ranfor = accuracy_score(Y_test, Y_pred_ranfor)

In [42]:
# Accuracy on test set
print("Logistic Regression: " + str(accuracy_logreg * 100))
print("K Nearest neighbors: " + str(accuracy_knn * 100))
print("Support Vector Classifier: " + str(accuracy_svc * 100))
print("Naive Bayes: " + str(accuracy_nb * 100))
print("Decision tree: " + str(accuracy_dectree * 100))
print("Random Forest: " + str(accuracy_ranfor * 100))

Logistic Regression: 77.00622338683263
K Nearest neighbors: 76.67867671143138
Support Vector Classifier: 77.05535538814281
Naive Bayes: 76.98984605306256
Decision tree: 66.6852276449394
Random Forest: 75.40779561087454


In [43]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred_knn)
cm

array([[  195,  6830],
       [  290, 23215]])

In [44]:
df_concat.columns

Index(['gender', 'age', 'num_lab_procedures', 'num_medications', 'diag_1',
       'diag_2', 'diag_3', 'diabetesMed', 'race', 'max_glu_serum_>200',
       'max_glu_serum_>300', 'max_glu_serum_None', 'max_glu_serum_Norm',
       'A1Cresult_>7', 'A1Cresult_>8', 'A1Cresult_None', 'A1Cresult_Norm',
       'metformin_Down', 'metformin_No', 'metformin_Steady', 'metformin_Up',
       'repaglinide_Down', 'repaglinide_No', 'repaglinide_Steady',
       'repaglinide_Up', 'nateglinide_Down', 'nateglinide_No',
       'nateglinide_Steady', 'nateglinide_Up', 'chlorpropamide_Down',
       'chlorpropamide_No', 'chlorpropamide_Steady', 'chlorpropamide_Up',
       'glimepiride_Down', 'glimepiride_No', 'glimepiride_Steady',
       'glimepiride_Up', 'acetohexamide_No', 'acetohexamide_Steady',
       'glipizide_Down', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up',
       'glyburide_Down', 'glyburide_No', 'glyburide_Steady', 'glyburide_Up',
       'tolbutamide_No', 'tolbutamide_Steady', 'pioglitazone_Dow

In [45]:
#Random Forest

In [46]:
X = df_concat.drop(['diabetesMed'], axis=1)
y = df_concat['diabetesMed']

In [47]:
rf_params = {"n_estimators" :[100,200,500,1000], 
             "max_features": [3,5,7], 
             "min_samples_split": [2,5,10,30],
            "max_depth": [3,5,8,None]}

In [48]:
rf_model = RandomForestClassifier(random_state = 12345)

In [49]:
gs_cv = GridSearchCV(rf_model, 
                    rf_params,
                    cv = 10,
                    n_jobs = -1,
                    verbose = 2).fit(X, y)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


In [50]:
gs_cv.best_params_

{'max_depth': None,
 'max_features': 7,
 'min_samples_split': 2,
 'n_estimators': 100}

In [51]:
rf_tuned = RandomForestClassifier(**gs_cv.best_params_)


In [52]:
rf_tuned = rf_tuned.fit(X,y)


In [53]:
cross_val_score(rf_tuned, X, y, cv = 10).mean()


0.9998820851278285

In [54]:
feature_imp = pd.Series(rf_tuned.feature_importances_,
                        index=X.columns).sort_values(ascending=False)

sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Significance Score Of Variables')
plt.ylabel('Variables')
plt.title("Variable Severity Levels")
plt.show()

In [55]:
#2) LightGBM Tuning

In [56]:
lgbm = LGBMClassifier(random_state = 12345)


In [57]:
lgbm_params = {"learning_rate": [0.01, 0.03, 0.05, 0.1, 0.5],
              "n_estimators": [500, 1000, 1500],
              "max_depth":[3,5,8]}

In [58]:
gs_cv = GridSearchCV(lgbm, 
                     lgbm_params, 
                     cv = 10, 
                     n_jobs = -1, 
                     verbose = 2).fit(X, y)

Fitting 10 folds for each of 45 candidates, totalling 450 fits


In [59]:
gs_cv.best_params_

{'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1000}

In [60]:
lgbm_tuned = LGBMClassifier(**gs_cv.best_params_).fit(X,y)

In [61]:
cross_val_score(lgbm_tuned, X, y, cv = 10).mean()

0.999911564328678

In [62]:
feature_imp = pd.Series(lgbm_tuned.feature_importances_,
                        index=X.columns).sort_values(ascending=False)

sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Significance Score Of Variables')
plt.ylabel('Variables')
plt.title("Variable Severity Levels")
plt.show()

In [63]:
#XGBoost Tuning

In [64]:
xgb = GradientBoostingClassifier(random_state = 12345)

In [65]:
xgb_params = {
    "learning_rate": [0.01, 0.1, 0.2, 1],
    "min_samples_split": np.linspace(0.1, 0.5, 10),
    "max_depth":[3,5,8],
    "subsample":[0.5, 0.9, 1.0],
    "n_estimators": [100,1000]}

In [66]:
xgb_cv_model  = GridSearchCV(xgb,xgb_params, cv = 10, n_jobs = -1, verbose = 2).fit(X, y)

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


In [None]:
xgb_cv_model.best_params_

In [None]:
xgb_tuned = GradientBoostingClassifier(**xgb_cv_model.best_params_).fit(X,y)

In [None]:
cross_val_score(xgb_tuned, X, y, cv = 10).mean()


In [None]:
feature_imp = pd.Series(xgb_tuned.feature_importances_,
                        index=X.columns).sort_values(ascending=False)

sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Significance Score Of Variables')
plt.ylabel('Variables')
plt.title("Variable Severity Levels")
plt.show()

In [None]:
#Comparacion de modelos

In [None]:
models = []

models.append(('RF', RandomForestClassifier(random_state = 12345, max_depth = 8, max_features = 7, min_samples_split = 2, n_estimators = 500)))
models.append(('XGB', GradientBoostingClassifier(random_state = 12345, learning_rate = 0.1, max_depth = 5, min_samples_split = 0.1, n_estimators = 100, subsample = 1.0)))
models.append(("LightGBM", LGBMClassifier(random_state = 12345, learning_rate = 0.01,  max_depth = 3, n_estimators = 1000)))

# evaluate each model in turn
results = []
names = []

In [None]:
for name, model in models:
    
        kfold = KFold(n_splits = 10, random_state = 12345)
        cv_results = cross_val_score(model, X, y, cv = 10, scoring= "accuracy")
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        
# boxplot algorithm comparison
fig = plt.figure(figsize=(15,10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()