In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

: 

In [None]:
# Read the diabetes CSV from the Kaggle input directory, then display the frame
data = pd.read_csv(
    '/kaggle/input/diabetes-dataset/diabetes.csv'
)
data

: 

# **Exploring Data**

In [None]:
from time import time # to calculate the time taken by different machine learning models to train for different algorithms
from IPython.display import display # Allows the use of display() for dataframes
import matplotlib.pyplot as plt
import sklearn

: 

# **Preparing data**

In [None]:
# Missing values: this notebook treats a zero in these columns as "missing",
# so report how many zeros each column contains.

# Zeros in the skin thickness column
print('Number of missing values in skin thickness coloumn: ', data['SkinThickness'].eq(0).sum())

# Zeros in the insulin column
print('Number of missing values in insulin coloumn: ', data['Insulin'].eq(0).sum())

: 

In [None]:
# Replace the zeros in both columns with the column mean.
# The mean is taken over the non-zero entries only: the zeros are the values
# being treated as missing, so including them would bias the imputed value low.
for column in ['SkinThickness', 'Insulin']:
    is_missing = data[column] == 0
    observed = data.loc[~is_missing, column]
    # Nothing to impute, or nothing to impute from: leave the column untouched.
    if not is_missing.any() or observed.empty:
        continue
    # Truncate the mean to an integer, as the original astype(np.int64) did.
    data.loc[is_missing, column] = int(observed.mean())

: 

In [None]:
# Verify the replacement: both counts below should now be zero.

# Remaining zeros in the skin thickness column
print('Number of missing values in skin thickness coloumn: ', data['SkinThickness'].eq(0).sum())

# Remaining zeros in the insulin column
print('Number of missing values in insulin coloumn: ', data['Insulin'].eq(0).sum())

: 

In [None]:
# Preview the first five rows after the zero replacement
data.head(n=5)

: 

In [None]:
# Class balance of the Outcome column

# Total number of records
n_records = data.shape[0]

# Number of records in each outcome class (1 first, then 0)
n_1 = len(data[data['Outcome'] == 1])
n_0 = len(data[data['Outcome'] == 0])

# Percentage of records where outcome = 1
n1_perc = 100 * (n_1 / n_records)

print(n1_perc)

: 

In [None]:
# Separate the target column from the feature columns
features_r = data.drop(columns = 'Outcome')
outcome_r = data['Outcome']

# Histograms of the two skewed continuous features, drawn side by side
fig, axes = plt.subplots(1, 2, figsize = (11,5))

for ax, feature in zip(axes, ['SkinThickness','Insulin']):
    ax.hist(data[feature], bins = 25, color = '#00A0A0')
    ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
    ax.set_xlabel("value")
    ax.set_ylabel("number of records")

: 

In [None]:
# Log transformation of the skewed features.
# Work on an explicit copy: pd.DataFrame(data = features_r) can share its
# underlying data with features_r (this depends on the pandas version), in
# which case the assignment below would also change features_r and re-running
# this cell would apply the logarithm a second time.
skewed = ['SkinThickness','Insulin']
features_log_transformed = features_r.copy()
# log(x + 1): the +1 shift keeps the result finite for zero values.
features_log_transformed[skewed] = np.log(features_r[skewed] + 1)

: 

In [None]:
# Scaling the data
from sklearn.preprocessing import MinMaxScaler
# initialise a scaler then apply it to the features
# Min-max scale the selected numerical columns (MinMaxScaler with default settings).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
# Consider fitting on the training split only.
scaler = MinMaxScaler()
numerical = ['Insulin','SkinThickness','DiabetesPedigreeFunction']

# Copy instead of wrapping: pd.DataFrame(data = ...) can share data with its
# source frame (pandas-version dependent), which would let the assignment
# below silently overwrite features_log_transformed as well.
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

features_final = features_log_minmax_transform
features_final.head()

: 

# **Training and Testing Data**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the records for testing; random_state pins the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    features_final,
    outcome_r,
    test_size = 0.2,
    random_state = 42,
)

# Number of rows in the training and testing sets
print(len(X_train))
print(len(X_test))

: 

In [None]:
from time import time
from sklearn.metrics import fbeta_score,accuracy_score
def train_predict(learner,sample_size,X_train,Y_train,X_test,Y_test,train_eval_size = 300):
    """Fit a learner on a slice of the training data and score it.

    Parameters
    ----------
    learner : object
        Estimator exposing ``fit`` and ``predict``.
    sample_size : int
        Number of leading training samples used for fitting.
    X_train, Y_train :
        Training features and labels.
    X_test, Y_test :
        Testing features and labels.
    train_eval_size : int, default 300
        Number of leading training samples used to compute the training
        scores (previously hard-coded to 300).

    Returns
    -------
    dict
        Keys: 'train_time', 'pred_time' (seconds), 'acc_train', 'acc_test'
        (accuracy), 'f_train', 'f_test' (F-beta score with beta = 0.5).
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed intervals; time() follows the wall clock and can jump.
    from time import perf_counter

    results = {}

    # Fit on the first `sample_size` training samples and time the fit
    start = perf_counter()
    learner = learner.fit(X_train[:sample_size],Y_train[:sample_size])
    results['train_time'] = perf_counter() - start

    # Predict on the test set and on the leading training samples, timing both together
    start = perf_counter()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:train_eval_size])
    results['pred_time'] = perf_counter() - start

    Y_train_eval = Y_train[:train_eval_size]

    # Accuracy on the training subset and on the test set
    results['acc_train'] = accuracy_score(Y_train_eval,predictions_train)
    results['acc_test'] = accuracy_score(Y_test,predictions_test)

    # F-beta score (beta = 0.5) on the training subset and on the test set
    results['f_train'] = fbeta_score(Y_train_eval,predictions_train,pos_label = 1,average = 'binary',beta = 0.5)
    results['f_test'] = fbeta_score(Y_test,predictions_test,pos_label = 1,average = 'binary',beta = 0.5)

    print("{} trained on {} samples".format(learner.__class__.__name__,sample_size))

    return results

: 

In [None]:
# now passing algorithms one by one into the pipeline made above

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize the three models. Every model gets the same seed so that a
# Restart & Run All reproduces the same scores (the random forest was
# previously unseeded).
clf_a = LogisticRegression(random_state = 42)
clf_b = RandomForestClassifier(random_state = 42)
clf_c = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 5),random_state = 42)

# Number of samples for 1%, 10% and 100% of the training set
samples_100 = len(Y_train)
samples_10 = int(0.1*samples_100)
samples_1 = int(0.01*samples_100)

# Train and score each model on each training-set size.
# results[<model class name>][<size index 0..2>] holds the dict returned by train_predict.
results = {}
for clf in [clf_a, clf_b, clf_c]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i,samples in enumerate ([samples_1,samples_10,samples_100]):
        results[clf_name][i] = train_predict(clf,samples,X_train,Y_train,X_test,Y_test)

: 

In [None]:
import matplotlib.patches as mpatches
def evaluate(results, accuracy, f1):
    """Plot timing, accuracy and F-score of each learner in a 2x3 grid of panels.

    Parameters
    ----------
    results : dict
        Maps a learner name to a mapping indexed by 0, 1 and 2 (one entry per
        training-set size). Each entry holds the metrics 'train_time',
        'acc_train', 'f_train', 'pred_time', 'acc_test' and 'f_test'.
        At most three learners are supported (one colour each).
    accuracy : float
        Reference accuracy drawn as a dashed line on the accuracy panels.
    f1 : float
        Reference F-score drawn as a dashed line on the F-score panels.
    """
    # Create figure
    fig, ax = plt.subplots(2, 3, figsize = (11,7))

    # Constants
    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000']
    metrics = ['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']

    # One panel per metric (six in total); within a panel, one group of bars
    # per training-set size and one bar per learner.
    for j, metric in enumerate(metrics):
        panel = ax[j//3, j%3]
        for k, learner in enumerate(results.keys()):
            for i in range(3):
                # align='edge' makes each group span [i, i + 3*bar_width], which is
                # what the tick positions (group centres at i + 0.45) and the
                # x-limits below assume. With the default centre alignment the
                # first bar is cut off at x = -0.1 and the ticks sit off-centre.
                panel.bar(i+k*bar_width, results[learner][i][metric], width = bar_width,
                          color = colors[k], align = 'edge')

        # Axis decoration is identical for every bar, so apply it once per panel
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["1%", "10%", "100%"])
        panel.set_xlabel("Training Set Size")
        panel.set_xlim((-0.1, 3.0))

    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F-score")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F-score on Testing Set")

    for row in (0, 1):
        # Horizontal reference lines for the naive predictor. axhline's xmin/xmax
        # are fractions of the axes width (0 to 1), not data coordinates, so the
        # defaults are used to span the whole panel.
        ax[row, 1].axhline(y = accuracy, linewidth = 1, color = 'k', linestyle = 'dashed')
        ax[row, 2].axhline(y = f1, linewidth = 1, color = 'k', linestyle = 'dashed')

        # Scores live in [0, 1]; fix the y-limits so panels are comparable
        ax[row, 1].set_ylim((0, 1))
        ax[row, 2].set_ylim((0, 1))

    # Create patches for the legend
    patches = [mpatches.Patch(color = colors[i], label = learner)
               for i, learner in enumerate(results.keys())]
    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'large')

    # Aesthetics
    plt.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    plt.tight_layout()
    plt.show()
    

: 

In [None]:
# Baseline ("naive predictor") scores: accuracy and precision are both the
# fraction of records with outcome = 1; recall is the positive count divided by itself.
accuracy = precision = n_1/n_records
recall = np.sum(outcome_r)/np.sum(outcome_r)

# F-beta score with beta = 0.5, i.e. beta squared = 0.25
fscore = 1.25*precision*recall/((0.25*precision)+recall)

print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

: 

In [None]:
# Plot every model's metrics against the naive-predictor baselines
evaluate(results = results, accuracy = accuracy, f1 = fscore)

: 

# **Model Tuning**

Model tuning involves adjusting the hyperparameters of the machine learning model to improve its performance.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score

# Untuned random forest: the estimator handed to the grid search and the baseline for comparison
clf = RandomForestClassifier(random_state=42)

# Hyperparameter grid to search
parameters = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, 20],
}

# Scorers: the F-beta scorer (beta = 0.5) drives the search; an accuracy scorer is created as well
scorer = make_scorer(fbeta_score, beta=0.5)
acc_scorer = make_scorer(accuracy_score)

# Run the grid search on the training data
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, Y_train)

# Best estimator found by the search
best_clf = grid_fit.best_estimator_

# Test-set predictions from the untuned and the tuned classifier
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)
best_predictions = best_clf.predict(X_test)

print("Random Forest")
print("Unoptimized model accuracy: {:.4f}".format(accuracy_score(Y_test, predictions)))
print("Optimized model accuracy: {:.4f}".format(accuracy_score(Y_test, best_predictions)))
print("Unoptimized model F-score: {:.4f}".format(fbeta_score(Y_test, predictions, beta=0.5)))
print("Optimized model F-score: {:.4f}".format(fbeta_score(Y_test, best_predictions, beta=0.5)))

: 

In [None]:
def feature_plot(importances, X_train, Y_train):
    """Bar-plot the weights of the (up to) five most important features.

    Parameters
    ----------
    importances : array-like
        One importance weight per column of ``X_train``.
    X_train : DataFrame
        Only its column names are used, to label the bars.
    Y_train :
        Unused; kept so existing calls keep working.
    """
    importances = np.asarray(importances)

    # Rank the features from most to least important and keep at most five.
    # Capping at the number of available features avoids a shape mismatch in
    # plt.bar when fewer than five features exist.
    indices = np.argsort(importances)[::-1]
    n_top = min(5, len(indices))
    columns = X_train.columns.values[indices[:n_top]]
    values = importances[indices][:n_top]
    positions = np.arange(n_top)

    # Create the plot
    plt.figure(figsize = (5,5))
    plt.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
    plt.bar(positions, values, width = 0.6, align="center", color = '#00A000', \
          label = "Feature Weight")
    # Narrow bars, shifted left, showing the running total of the weights
    plt.bar(positions - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
          label = "Cumulative Feature Weight")
    plt.xticks(positions, columns)
    plt.xlim((-0.5, n_top - 0.5))
    plt.ylabel("Weight", fontsize = 12)
    plt.xlabel("Feature", fontsize = 12)

    plt.legend(loc = 'upper center')
    plt.tight_layout()
    plt.show()

: 

In [None]:
# Read the feature importances from the tuned model and plot the top-ranked ones
model = best_clf
importances = best_clf.feature_importances_
feature_plot(importances = importances, X_train = X_train, Y_train = Y_train)

: 

# **Feature Selection**
Using a subset of the available features in the data can significantly reduce the training and prediction time of a model, but it may also affect the model's performance.

In [None]:
from sklearn.base import clone

# Names of the three features with the highest importance
top_features = X_train.columns.values[np.argsort(importances)[::-1][:3]]

# Keep only those columns in the training and testing sets
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

# Train a fresh clone of the tuned model on the reduced feature set
clf = clone(best_clf)
clf.fit(X_train_reduced, Y_train)

reduced_predictions = clf.predict(X_test_reduced)

# Compare the full-feature model with the reduced-feature model on the test set
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(Y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(Y_test, best_predictions, beta = 0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(Y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(Y_test, reduced_predictions, beta = 0.5)))

: 

In [None]:
import pickle
# Save the tuned model to disk. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open('trained_model.sav','wb') as pickle_out:
    pickle.dump(best_clf, pickle_out)

: 

: 