# Model Training to make Pickle File 

Classification: predicting whether an individual has diabetes based on socioeconomic status (SES).

In [1]:
from sqlalchemy import create_engine
import psycopg2
from config import db_password

import pandas as pd
import numpy as np
from collections import Counter
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import pickle

In [2]:
# Connection string for the local PostgreSQL server (host 127.0.0.1, port 5433,
# database "Drops_of_Jupyter"); the password comes from the local config module
# so it is never written into the notebook itself.
# NOTE(review): the password is interpolated verbatim - presumably it contains no
# URL-reserved characters such as '@' or '/'; confirm against config.
dbEngine = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Drops_of_Jupyter"

# create the database engine and open the connection that the read_sql cells reuse
# NOTE(review): no cell visible in this notebook closes `conn`.
engine = create_engine(dbEngine)
conn = engine.connect()

In [3]:
# Pull every row of the general_health_status table into a DataFrame
ghs_df = pd.read_sql(sql="select * from general_health_status", con=conn)
ghs_df

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI
0,H056808,Very_Good,no,yes,199.0,Overweight
1,H018779,Very_Good,yes,yes,205.0,Overweight
2,H049265,Very_Good,no,no,160.0,Overweight
3,H007699,Fair,no,no,190.0,Obese
4,H066034,Good,no,no,250.0,Obese
...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight
29478,H052160,Fair,yes,yes,220.0,Obese
29479,H051563,Very_Good,no,no,130.0,Overweight
29480,H058432,Good,no,no,168.0,Healthy_Weight


In [4]:
# Pull every row of the individual (demographics) table into a DataFrame
ind_df = pd.read_sql(sql="select * from individual", con=conn)
ind_df

Unnamed: 0,ID,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,South,56,Male,Bachelor,White_Only,5.94
3,H007699,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...
29477,H012375,West,70,Female,Masters,White_Only,5.11
29478,H052160,West,35,Female,Associates_Academic_Program,,3.03
29479,H051563,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,West,58,Male,Some_College_no_degree,White_Only,2.05


In [5]:
# Join health status and demographics on the shared respondent ID;
# the inner join keeps only IDs present in both tables
clean_df2 = pd.merge(ghs_df, ind_df, on='ID', how='inner')
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29478,H052160,Fair,yes,yes,220.0,Obese,West,35,Female,Associates_Academic_Program,,3.03
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [6]:
# List the merged frame's column names to confirm both tables contributed their fields
clean_df2.columns

Index(['ID', 'General_Health_Status', 'Diabetes', 'Prediabetes', 'Weight_Lbs',
       'Categorical_BMI', 'Region', 'Age', 'Gender', 'Education', 'Race',
       'Poverty_Ratio'],
      dtype='object')

In [7]:
# Drop every row that has a missing value in any column.
# NOTE: this mutates clean_df2 in place, so the merged frame displayed earlier
# can only be recovered by re-running the merge cell.
clean_df2.dropna(inplace=True)
clean_df2

Unnamed: 0,ID,General_Health_Status,Diabetes,Prediabetes,Weight_Lbs,Categorical_BMI,Region,Age,Gender,Education,Race,Poverty_Ratio
0,H056808,Very_Good,no,yes,199.0,Overweight,South,50,Male,Grade_1-11,White_Only,1.93
1,H018779,Very_Good,yes,yes,205.0,Overweight,South,53,Male,Associates_Academic_Program,African_American_Only,4.45
2,H049265,Very_Good,no,no,160.0,Overweight,South,56,Male,Bachelor,White_Only,5.94
3,H007699,Fair,no,no,190.0,Obese,South,57,Female,Some_College_no_degree,White_Only,3.70
4,H066034,Good,no,no,250.0,Obese,South,25,Male,High_School_Graduate,African_American_Only,1.66
...,...,...,...,...,...,...,...,...,...,...,...,...
29476,H044531,Good,yes,yes,160.0,Overweight,West,69,Female,Masters,White_Only,7.67
29477,H012375,Very_Good,no,no,140.0,Overweight,West,70,Female,Masters,White_Only,5.11
29479,H051563,Very_Good,no,no,130.0,Overweight,West,72,Female,High_School_Graduate,White_Only,2.07
29480,H058432,Good,no,no,168.0,Healthy_Weight,West,58,Male,Some_College_no_degree,White_Only,2.05


In [8]:
# Remove the identifier and the columns that are not used as model features.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone from the in-place frame.
clean_df2.drop(columns=['ID', 'General_Health_Status', 'Region'], inplace=True, errors='ignore')

In [9]:
# Columns left after dropping ID, General_Health_Status and Region
clean_df2.columns

Index(['Diabetes', 'Prediabetes', 'Weight_Lbs', 'Categorical_BMI', 'Age',
       'Gender', 'Education', 'Race', 'Poverty_Ratio'],
      dtype='object')

In [10]:
# Class balance of the raw target before encoding (yes/no counts)
clean_df2['Diabetes'].value_counts()

no     22773
yes     2565
Name: Diabetes, dtype: int64

### Split data into train and test

In [11]:
# One-hot encode every categorical column, the target included.
# All levels are kept (no drop_first), which yields the 27-column feature layout
# that the hand-written sample vectors in the pickle test cells rely on.
# dtype=int pins the indicator columns to numeric 0/1: pandas >= 2.0 defaults to
# bool, which would alter the describe()/Counter output of the later cells.
encoded_df = pd.get_dummies(
    clean_df2,
    columns=['Diabetes', 'Prediabetes', 'Categorical_BMI', 'Gender', 'Education', 'Race'],
    dtype=int,
)

# 'Diabetes_yes' alone carries the label, so its complement column is redundant
encoded_df = encoded_df.drop(columns='Diabetes_no')

# target: 1 = diabetes, 0 = no diabetes
y = encoded_df['Diabetes_yes']

# features: every remaining column except the target
X = encoded_df.drop(columns='Diabetes_yes')

In [12]:
# Summary statistics per feature; for the 0/1 dummy columns the mean is the
# share of rows that fall in that category
X.describe()

Unnamed: 0,Weight_Lbs,Age,Poverty_Ratio,Prediabetes_no,Prediabetes_yes,Categorical_BMI_Healthy_Weight,Categorical_BMI_Obese,Categorical_BMI_Overweight,Categorical_BMI_Underweight,Gender_Female,...,Education_Greater_Than_Master,Education_High_School_Graduate,Education_Masters,Education_Some_College_no_degree,Race_AIAN_AND_other,Race_AIAN_Only,Race_African_American_Only,Race_Asian_Only,Race_Other,Race_White_Only
count,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,...,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0,25338.0
mean,176.791302,53.17385,4.394485,0.849672,0.150328,0.330452,0.300458,0.359302,0.009788,0.541637,...,0.04215,0.217855,0.119149,0.151551,0.009354,0.008012,0.113269,0.064725,0.015629,0.789013
std,39.682977,18.419904,3.020485,0.357399,0.357399,0.470385,0.458466,0.479805,0.098449,0.498273,...,0.200936,0.412796,0.32397,0.358592,0.096262,0.08915,0.316927,0.246045,0.124036,0.408018
min,100.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,146.0,37.0,1.98,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,173.0,54.0,3.71,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,200.0,68.0,6.04,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,299.0,99.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Feature names in the column order the scaler and the models are fitted on
X.columns

Index(['Weight_Lbs', 'Age', 'Poverty_Ratio', 'Prediabetes_no',
       'Prediabetes_yes', 'Categorical_BMI_Healthy_Weight',
       'Categorical_BMI_Obese', 'Categorical_BMI_Overweight',
       'Categorical_BMI_Underweight', 'Gender_Female', 'Gender_Male',
       'Education_12th_Grade_no_diploma',
       'Education_Associates_Academic_Program',
       'Education_Associates_Occupational_Technical_Vocational',
       'Education_Bachelor', 'Education_GED_Equivalent',
       'Education_Grade_1-11', 'Education_Greater_Than_Master',
       'Education_High_School_Graduate', 'Education_Masters',
       'Education_Some_College_no_degree', 'Race_AIAN_AND_other',
       'Race_AIAN_Only', 'Race_African_American_Only', 'Race_Asian_Only',
       'Race_Other', 'Race_White_Only'],
      dtype='object')

In [14]:
# Class balance of the encoded target (1 = diabetes, 0 = no diabetes);
# the counts should match the yes/no tally taken before encoding
y.value_counts()

0    22773
1     2565
Name: Diabetes_yes, dtype: int64

In [15]:
# Hold-out split with scikit-learn's default test size; stratifying on y keeps
# the diabetes / no-diabetes ratio the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# class counts in the training part
Counter(y_train)

Counter({0: 17079, 1: 1924})

### Scale the data

In [16]:
# Standardising scaler, built from the class imported at the top of the notebook
X_scaler = StandardScaler()

In [17]:
# Learn the scaling parameters from the training split only, so the test split
# does not influence them
X_scaler.fit(X_train)

In [18]:
# Apply the fitted scaler to both splits (train first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split_part) for split_part in (X_train, X_test)
)

In [19]:
# Pickle the fitted scaler so new input can be scaled the same way later.
# The context manager guarantees the file is flushed and closed; the original
# bare open() left the handle open until garbage collection.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(X_scaler, scaler_file)

In [20]:
# Shapes of the scaled feature matrices: 75% train first, then 25% test
for scaled_part in (X_train_scaled, X_test_scaled):
    print(scaled_part.shape)

(19003, 27)
(6335, 27)


In [21]:
# Shapes of the target vectors: 75% train first, then 25% test
print(y_train.shape, y_test.shape, sep='\n')

(19003,)
(6335,)


### Random oversampling

Upsample the rare class: randomly duplicate diabetes-positive training records until both classes are equally represented in the data the classifier is trained on.

In [22]:
# Balance the training classes by randomly oversampling the minority class;
# only the training split is resampled, the test split is left untouched
ros = RandomOverSampler(random_state=1)
resampled = ros.fit_resample(X_train_scaled, y_train)
X_resampled, y_resampled = resampled

# both classes should now show the same count
Counter(y_resampled)

Counter({0: 17079, 1: 17079})

### Logistic regression

In [23]:
# Two candidates: a plain logistic regression, and a 5-fold cross-validated
# L1-penalised variant fitted with the saga solver
model1 = LogisticRegression(random_state=1, solver='lbfgs')
model2 = LogisticRegressionCV(
    penalty='l1',
    solver='saga',
    cv=5,
    max_iter=10000,
    random_state=1,
)
for candidate in (model1, model2):
    print(candidate)

LogisticRegression(random_state=1)
LogisticRegressionCV(cv=5, max_iter=10000, penalty='l1', random_state=1,
                     solver='saga')


In [24]:
# Train the plain logistic regression on the oversampled, scaled training data
model1.fit(X=X_resampled, y=y_resampled)

In [25]:
# Train the cross-validated logistic regression on the same oversampled data
model2.fit(X=X_resampled, y=y_resampled)

### Make predictions, measure accuracy

In [26]:
# Predict on the held-out test set and score model1.
# The metric is balanced accuracy (the average of the per-class recalls), even
# though the printed label just says "Accuracy score".
y_pred1 = model1.predict(X_test_scaled)
score1 = balanced_accuracy_score(y_true=y_test, y_pred=y_pred1)

print('Accuracy score: ', score1)

Accuracy score:  0.8247883339991133


In [27]:
# Predict on the held-out test set and score model2.
# The metric is balanced accuracy (the average of the per-class recalls), even
# though the printed label just says "Accuracy score".
y_pred2 = model2.predict(X_test_scaled)
score2 = balanced_accuracy_score(y_true=y_test, y_pred=y_pred2)

print('Accuracy score: ', score2)

Accuracy score:  0.8252273926573501


### Confusion matrix
Each row in a confusion matrix represents an actual class, while each column represents a predicted class.

In [28]:
# Confusion matrix for model1: rows are actual classes, columns are predictions
matrix1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
print(matrix1)

[[4738  956]
 [ 117  524]]


In [29]:
# Confusion matrix for model2: rows are actual classes, columns are predictions
matrix2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
print(matrix2)

[[4743  951]
 [ 117  524]]


### Precision and Recall

In [30]:
# model1 precision for the positive (diabetes) class
precision_score(y_true=y_test, y_pred=y_pred1)

0.35405405405405405

In [31]:
# model2 precision for the positive (diabetes) class
precision_score(y_true=y_test, y_pred=y_pred2)

0.3552542372881356

In [32]:
# model1 recall (sensitivity) for the positive (diabetes) class
recall_score(y_true=y_test, y_pred=y_pred1)

0.8174726989079563

In [33]:
# model2 recall (sensitivity) for the positive (diabetes) class
recall_score(y_true=y_test, y_pred=y_pred2)

0.8174726989079563

### Classification Report  

In [34]:
# Per-class precision, recall and F1 for model1 on the test set
report1 = classification_report(y_true=y_test, y_pred=y_pred1)
print(report1)

              precision    recall  f1-score   support

           0       0.98      0.83      0.90      5694
           1       0.35      0.82      0.49       641

    accuracy                           0.83      6335
   macro avg       0.66      0.82      0.70      6335
weighted avg       0.91      0.83      0.86      6335



In [35]:
# Per-class precision, recall and F1 for model2 on the test set
report2 = classification_report(y_true=y_test, y_pred=y_pred2)
print(report2)

              precision    recall  f1-score   support

           0       0.98      0.83      0.90      5694
           1       0.36      0.82      0.50       641

    accuracy                           0.83      6335
   macro avg       0.67      0.83      0.70      6335
weighted avg       0.91      0.83      0.86      6335



### Make Pickle Files 

#### After comparing the two models, we selected LogisticRegressionCV because of its slightly higher balanced accuracy (0.8252 vs. 0.8248).

In [36]:
# Save the selected model (model2, the LogisticRegressionCV) to the current directory.
# Pickle serializes objects so they can be saved to a file and loaded in a program again later on.
# The context manager guarantees the file is flushed and closed; the original
# bare open() left the handle open until garbage collection.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model2, model_file)

### Test Pickles

#### Our app feeds user input to the model as a DataFrame with feature names. Here I am testing with a plain array, which is why the warning below appears.
#### The warning does not appear when the pickles are used in the app, because the app passes a DataFrame.

In [37]:
# Load the saved model and scaler back to check that they round-trip.
# Context managers close both files; the original bare open() calls left the
# handles open. These pickles were written by this notebook - only unpickle
# files you trust, since unpickling can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    testmodel = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    testscaler = pickle.load(scaler_file)

# One hand-written sample; the 27 values follow the X.columns order
# (Weight_Lbs, Age, Poverty_Ratio, then the one-hot indicator columns)
scaled_test1 = testscaler.transform([[165, 39, 3.77, 1, 0, 1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0]])
print(testmodel.predict(scaled_test1))


[0]




In [38]:
# Second hand-written sample: same one-hot flags as the first, lower weight and age
sample_1_2 = [[125, 29, 3.77, 1, 0, 1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0]]
scaled_test1_2 = testscaler.transform(sample_1_2)
print(testmodel.predict(scaled_test1_2))


[0]




In [39]:
# Third hand-written sample: age 20, with the last (Race_White_Only) flag set
sample_1_3 = [[125, 20, 3.77, 1, 0, 1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1]]
scaled_test1_3 = testscaler.transform(sample_1_3)
print(testmodel.predict(scaled_test1_3))

[0]


