<a href="https://colab.research.google.com/github/MoseAIML/Moses-/blob/main/Model_Quality_and_Improvements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

You will be required to submit: ● A GitHub repository with your project written in Python or R. Instructions Problem Statement As a data professional working for a pharmaceutical company, you need to develop a model that predicts whether a patient will be diagnosed with diabetes. The model needs to have an accuracy score greater than 0.85. You will be required to document the following steps:

Data Importation
Data Cleaning
Data Modeling (Using Decision Trees, Random Forest and Logistic Regression)
Model Evaluation
Hyparameter Tuning
Hyparameter Tuning

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To preview all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
     

In [2]:
# Load dataset
diabetes_df = pd.read_csv('https://bit.ly/DiabetesDS')

# Preview first 5 records
diabetes_df.head()
     

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:

#standardization of column names
diabetes_df.columns = diabetes_df.columns.str.strip().str.lower().str.replace(')','').str.replace('?','')
diabetes_df.head(2)
     

  diabetes_df.columns = diabetes_df.columns.str.strip().str.lower().str.replace(')','').str.replace('?','')


Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [4]:

# Data shape 
diabetes_df.shape

(768, 9)

In [5]:

# Looking for duplicates
diabetes_df.duplicated().sum()
     

0

In [6]:

diabetes_df.dtypes

pregnancies                   int64
glucose                       int64
bloodpressure                 int64
skinthickness                 int64
insulin                       int64
bmi                         float64
diabetespedigreefunction    float64
age                           int64
outcome                       int64
dtype: object

In [7]:

# Search for null values 

diabetes_df.isnull().sum()

pregnancies                 0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64

Machine Learning

In [8]:

#preparing data 
x = diabetes_df.drop(['outcome'], axis = 1)
y = diabetes_df['outcome']  

#spliting the dataset (ratio 3:1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

#confirm size of datasets
print(diabetes_df.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(768, 9)
(576, 8)
(192, 8)
(576,)
(192,)


In [9]:

from sklearn.tree import DecisionTreeClassifier

# Model Creation with random state -> 12345
dc_model = DecisionTreeClassifier(random_state=12345, max_depth=5)

#train a model
dc_model.fit(X_train,y_train)

#predict answers 
dec_y_pred = dc_model.predict(X_test) 

from sklearn.metrics import accuracy_score 

#Classifier report
from sklearn.metrics import classification_report 
print(classification_report(y_test, dec_y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       130
           1       0.69      0.55      0.61        62

    accuracy                           0.78       192
   macro avg       0.75      0.72      0.73       192
weighted avg       0.77      0.78      0.77       192



In [10]:

for depth in range (1,20):
  model = DecisionTreeClassifier(max_depth=depth)
  #training the model
  model.fit(X_train,y_train)
  #find predictions using validation set 
  dec_y_pred=model.predict(X_test)

  print("max_depth =", depth, ": ", end =' ')
  print(accuracy_score(dec_y_pred, y_test))

max_depth = 1 :  0.75
max_depth = 2 :  0.75
max_depth = 3 :  0.734375
max_depth = 4 :  0.7447916666666666
max_depth = 5 :  0.78125
max_depth = 6 :  0.734375
max_depth = 7 :  0.7604166666666666
max_depth = 8 :  0.7708333333333334
max_depth = 9 :  0.796875
max_depth = 10 :  0.7708333333333334
max_depth = 11 :  0.7552083333333334
max_depth = 12 :  0.7447916666666666
max_depth = 13 :  0.7239583333333334
max_depth = 14 :  0.703125
max_depth = 15 :  0.7447916666666666
max_depth = 16 :  0.7604166666666666
max_depth = 17 :  0.7239583333333334
max_depth = 18 :  0.71875
max_depth = 19 :  0.7239583333333334


In [11]:
from sklearn.ensemble import RandomForestClassifier

#create a model and assign it to a variable 
Forest_model = RandomForestClassifier(random_state=42, max_depth=6,max_features=5,n_jobs=-1, 
                                      min_samples_split=5,bootstrap = True,n_estimators = 459)


#train a model by calling the fit() method 
Forest_model.fit(X_train,y_train)

#predict answers 
forest_y_pred = Forest_model.predict(X_test) 

#Model Evaluation
#evaluation metric in sklearn library
from sklearn.metrics import accuracy_score 

#Classifier report
from sklearn.metrics import classification_report 
print(classification_report(y_test, forest_y_pred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       130
           1       0.73      0.56      0.64        62

    accuracy                           0.79       192
   macro avg       0.77      0.73      0.75       192
weighted avg       0.79      0.79      0.78       192



In [12]:
for depth in range (1,10):
  Forest_model = RandomForestClassifier(max_depth=depth)
  #training the model
  Forest_model.fit(X_train,y_train)
  #find predictions using validation set 
  forest_y_pred=Forest_model.predict(X_test)

  print("max_depth =", depth, ": ", end =' ')
  print(accuracy_score(forest_y_pred, y_test))

max_depth = 1 :  0.7135416666666666
max_depth = 2 :  0.71875
max_depth = 3 :  0.7708333333333334
max_depth = 4 :  0.7864583333333334
max_depth = 5 :  0.7864583333333334
max_depth = 6 :  0.7760416666666666
max_depth = 7 :  0.796875
max_depth = 8 :  0.7916666666666666
max_depth = 9 :  0.7864583333333334


In [13]:
for job in range (1,10):
  Forest_model = RandomForestClassifier(n_jobs=job)
  #training the model
  Forest_model.fit(X_train,y_train)
  #find predictions using validation set 
  forest_y_pred=Forest_model.predict(X_test)

  print("n_jobs =", job, ": ", end =' ')
  print(accuracy_score(forest_y_pred, y_test))
     

n_jobs = 1 :  0.796875
n_jobs = 2 :  0.7864583333333334
n_jobs = 3 :  0.78125
n_jobs = 4 :  0.7760416666666666
n_jobs = 5 :  0.796875
n_jobs = 6 :  0.8177083333333334
n_jobs = 7 :  0.78125
n_jobs = 8 :  0.7916666666666666
n_jobs = 9 :  0.78125


In [14]:
from sklearn.ensemble import RandomForestRegressor 
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

#code to create the parameter distribution.

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, X_train.shape[1]),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "n_estimators": sp_randint(100, 500)}

random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=10, cv=5, random_state=42)
random_search.fit(X_train,y_train)   

print(random_search.best_params_)

{'bootstrap': True, 'max_depth': 3, 'max_features': 5, 'min_samples_split': 5, 'n_estimators': 459}


In [15]:
#Logistic Regression Classifer
#import decision tree from sklearn library
from sklearn.linear_model import LogisticRegression

#create a model and assign it to a variable 
LogisticRegression_model = LogisticRegression(random_state=12345, solver='liblinear', 
                                              C=1.0)

#train a model by calling the fit() method 
LogisticRegression_model.fit(X_train,y_train)

#predict answers 
logistic_y_pred = LogisticRegression_model.predict(X_test) 

#Model Evaluation
#evaluation metric in sklearn library
from sklearn.metrics import accuracy_score 

#Classifier report
from sklearn.metrics import classification_report 
print('Logistic Regression classifier:')
print(classification_report(y_test, logistic_y_pred))

Logistic Regression classifier:
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       130
           1       0.77      0.58      0.66        62

    accuracy                           0.81       192
   macro avg       0.79      0.75      0.76       192
weighted avg       0.80      0.81      0.80       192

