In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
df = pd.read_csv('./diabetes.csv')
df.shape

In [None]:
df.head()

All patients here are females at least 21 years old.

- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 or 1) 
    - 1 -> Tested positive for Diabetes

In [None]:
df.info()

In [None]:
df.describe().transpose()

In [None]:
print(f"Total NULL values present in this are {df.isna().sum().sum()}")
df.isna().sum()

### Is the Outcome data imbalanced?

A classification data set with skewed class proportions is called imbalanced. Classes that make up a large proportion of the data set are called majority classes. Those that make up a smaller proportion are minority classes.

|Degree of imbalance|	Proportion of Minority Class|
|---|----|
|Mild | 	20-40% of the data set|
|Moderate	 | 1-20% of the data set|
|Extreme | 	<1% of the data set|

In [None]:
# Name the column by keyword: recent seaborn releases take `data` as the first
# positional argument, so a bare Series is not interpreted as the x variable.
sns.countplot(data=df, x='Outcome')
plt.title('Outcome class counts')
plt.show()

In [None]:
percentage_of_minority_data = (df[df['Outcome']==1].shape[0]/df['Outcome'].shape[0]*100)

percentage_of_minority_data

Observation:
- Data is mildly Imbalanced

### Distribution of Each Column

In [None]:
from scipy.stats import skew
for col in df.drop('Outcome', axis = 1).columns:
    print("Skewness for the column {} is {}".format(col, df[col].skew()))

In [None]:
plt.figure(figsize=(20,25), facecolor='white', dpi=150)
plotnumber = 1

for column in df.columns[:-1]:
    if plotnumber<=9 :     # as there are 9 columns in the data
        ax = plt.subplot(3,3,plotnumber)
        sns.kdeplot(df[column], fill=True)
        plt.xlabel(column,fontsize=20)
        #plt.ylabel('Salary',fontsize=20)
    plotnumber+=1
plt.show()

In [None]:
df.sample(4)

In [None]:
df['Age'].describe()

### Adding Age Groups

In [None]:
df_copy = df.copy()



def ageGroup(age):
    """
    Map an age to a one-letter bucket label.

    Upper bounds are inclusive: 'A' for ages up to 30, 'B' up to 40,
    'C' up to 50, and 'D' for anything else.
    """
    # Ordered (upper bound, label) pairs; the first bound that is not exceeded wins.
    for upper_bound, label in ((30, 'A'), (40, 'B'), (50, 'C')):
        if age <= upper_bound:
            return label
    return 'D'

In [None]:
df_copy['AgeGroup'] = df_copy['Age'].apply(ageGroup)

In [None]:
df_copy['AgeGroup'].value_counts()

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.countplot(x='AgeGroup', data=df_copy)

In [None]:
plt.figure(figsize=(8,4), dpi=150)
sns.boxplot(data=df, y='Age', x='Outcome')
plt.title('Age vs Outcome')
plt.show()

In [None]:
group_wise_positive = df_copy.groupby(['AgeGroup','Outcome']).count().reset_index()

group_wise_positive

In [None]:


# Share of positive outcomes (Outcome == 1) within each age group, in percent.
# Outcome is coded 0/1, so its per-group mean is the positive fraction.
positive_rate_by_group = df_copy.groupby('AgeGroup')['Outcome'].mean() * 100

# Print one plain number per group (formatting a Series inside an f-string
# would dump its index and dtype along with the value).
for grp, rate in positive_rate_by_group.items():
    print(f"GROUP {grp}:  {rate:.2f}%")
  

In [None]:
# if we have less features, we can use 3d scatter plot
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(dpi=150)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_copy['Age'][df_copy['AgeGroup']=='A'],df_copy['BloodPressure'][df_copy['AgeGroup']=='A'],df_copy['Pregnancies'][df_copy['AgeGroup']=='A'], c=df_copy['Outcome'][df_copy['AgeGroup']=='A'])


In [None]:
fig = plt.figure(dpi=150)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_copy['Age'][df_copy['AgeGroup']=='B'],df_copy['BloodPressure'][df_copy['AgeGroup']=='B'],df_copy['Pregnancies'][df_copy['AgeGroup']=='B'], c=df_copy['Outcome'][df_copy['AgeGroup']=='B'])


In [None]:
fig = plt.figure(dpi=150)
ax = fig.add_subplot(111, projection='3d')

ax.scatter(df_copy['Age'][df_copy['AgeGroup']=='C'],df_copy['BloodPressure'][df_copy['AgeGroup']=='C'],df_copy['Pregnancies'][df_copy['AgeGroup']=='C'], c=df_copy['Outcome'][df_copy['AgeGroup']=='C'])

Observation:
- Most of the patients are in the 21-30 age group. In fact, this group is about double the size of all the other age groups combined.
- Younger people (21-30) have a lower chance of having diabetes than the other age groups.
- The median of the age of diabetic people is greater than that of non-diabetic people.

In [None]:
import warnings
warnings.filterwarnings("ignore")


def distplot(col_name, data=None):
    """
    A function that will plot the distribution of column 'col_name' for diabetic and non-diabetic people separately

    Parameters
    ----------
    col_name : str
        Name of the column whose distribution is drawn.
    data : pandas.DataFrame, optional
        Frame holding 'col_name' and an 'Outcome' column coded 0/1. Defaults to
        the notebook-level `df`, so existing one-argument calls keep working.
    """
    frame = df if data is None else data
    plt.figure(figsize=(8,4), dpi=150)

    # sns.distplot is deprecated; histplot(kde=True) plus rugplot draws the same
    # histogram / density curve / rug combination through the supported API.
    # Each label is attached to its own artist: a positional label list given to
    # plt.legend() is matched to whichever artists come first on the axes, which
    # is not necessarily one artist per class.
    for outcome, color, label in ((1, "red", "Diabetes"), (0, "lightblue", "No Diabetes")):
        values = frame.loc[frame['Outcome'] == outcome, col_name]
        sns.histplot(x=values, stat="density", kde=True, color=color, alpha=0.4, label=label)
        sns.rugplot(x=values, color=color)

    plt.xlabel(col_name)
    plt.legend()


In [None]:
def mean_target(var):
    """
    Compare the average of column 'var' between the two Outcome classes.

    Returns a one-column DataFrame named after 'var': row 0 holds the mean over
    rows with Outcome == 0 and row 1 the mean over rows with Outcome == 1.
    Reads the notebook-level `df`.
    """
    # Evaluate the class means in Outcome order (0 first, then 1).
    class_means = [df.loc[df['Outcome'] == flag, var].mean() for flag in (0, 1)]
    return pd.DataFrame({var: class_means})

### BMI

In [None]:
df.head()

In [None]:
avg_bmi_for_1 = df[df['Outcome']==1]['BMI'].mean()

avg_bmi_for_1

In [None]:
avg_bmi_for_0 = df[df['Outcome']==0]['BMI'].mean()

avg_bmi_for_0

In [None]:
mean_target('BMI')

In [None]:
distplot('BMI')

In [None]:
plt.figure(figsize=(8,4), dpi=150)

my_pal = {0: "lightyellow", 1: "lightpink"}
sns.boxplot(x = 'Outcome', y = 'BMI', data = df, palette = my_pal)
plt.title('BMI vs Outcome')
plt.show()

Observation:
- Average BMI is higher for patients having Diabetes

### Glucose

- Average Sugar level for adults should be from 90 to 130 mg/dL (5.0 to 7.2 mmol/L) to be considered as normal.

In [None]:
df['Glucose'].mean()

In [None]:
# Mean glucose over the rows with Outcome == 1, computed once and reported once
# (a bare expression that is not the last line of a cell is never displayed).
glucose_mean_positive = df.loc[df['Outcome'] == 1, 'Glucose'].mean()

print(f'Average Glucose level for Patients having Diabetes is {glucose_mean_positive:.2f}')

In [None]:
# Mean glucose over the rows with Outcome == 0, computed once and reported once
# (a bare expression that is not the last line of a cell is never displayed).
glucose_mean_negative = df.loc[df['Outcome'] == 0, 'Glucose'].mean()

print(f'Average Glucose level for Patients NOT having Diabetes is {glucose_mean_negative:.2f}')

In [None]:
plt.figure(figsize=(8,4), dpi=150)

# sns.displot(data=df, x='Glucose', kde=True)

plt.hist(df['Glucose'], 100)
 
# plotting mean line
plt.axvline(df['Glucose'].mean(), color='k', linestyle='dashed', linewidth=2)
 
# showing the plot
plt.show()

In [None]:
distplot('Glucose')

In [None]:
mean_target('Glucose')

In [None]:
plt.figure(figsize=(8,4), dpi=150)
my_pal = {0: "lightgrey", 1: "lightyellow"}

sns.boxplot(data=df, x='Outcome', y='Glucose', palette=my_pal)

plt.title('Glucose vs Outcome')
plt.show()

Observation:
- Mean of Sugar levels in diabetic patients is high when compared to Non-diabetic
- Median of the Glucose level of Diabetic People is greater than the 75th Percentile of the glucose level of non-diabetic people. Therefore having a high glucose level does increase the chances of having diabetes.

### DiabetesPedigreeFunction: 
- Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)

In [None]:
df.head()

In [None]:
df['DiabetesPedigreeFunction'].describe()

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.scatterplot(data=df, x='DiabetesPedigreeFunction',y='Outcome', hue='Outcome')

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.boxplot(data=df, x='Outcome', y='DiabetesPedigreeFunction', palette={0: "lightgreen", 1: "lightpink"})

plt.title('DPF vs Outcome')
plt.show()

Observation:
- From above plot we cannot conclusively decide result by using 'Pedigree' function value

### BLOOD PRESSURE

- High blood pressure is twice as likely to strike a person with diabetes than a person without diabetes.

In [None]:
df.sample(5)

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.kdeplot(data=df, x='BloodPressure')

In [None]:
df['BloodPressure'].describe()

In [None]:
# df[(df['BloodPressure']==0) & (df['Outcome']==0)]['BloodPressure'] = 'a'

# Attempted class-wise fill: replace BloodPressure zeros with the mean BloodPressure
# of the rows sharing the same Outcome (the mean is taken over a column that still
# contains the zeros).
# NOTE: `df[mask]['BloodPressure']` is chained indexing. The boolean filter builds a
# new object, so the in-place replace lands on that temporary and `df` is left unchanged.
df[df['Outcome']==0]['BloodPressure'].replace(0,df[df['Outcome']==0]['BloodPressure'].mean(axis=0), inplace=True)
df[df['Outcome']==1]['BloodPressure'].replace(0,df[df['Outcome']==1]['BloodPressure'].mean(axis=0), inplace=True)

## The above lines give a warning and do not change the values as intended.
## Writing through `df.loc[mask, 'BloodPressure'] = ...` would be required for the change
## to reach df; note that a fill conditioned on Outcome builds the feature from the target label.

In [None]:
# This notebook treats a BloodPressure of 0 as a missing reading. Fill it with the
# mean of the non-zero readings: a mean taken over the whole column would include
# the very zeros being replaced and pull the fill value down.
# The result is assigned back instead of using inplace=True on a column selection,
# which pandas does not guarantee will write through to the parent frame.
bp_nonzero_mean = df.loc[df['BloodPressure'] != 0, 'BloodPressure'].mean()
df['BloodPressure'] = df['BloodPressure'].replace(0, bp_nonzero_mean)

In [None]:
df['BloodPressure'].describe()

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.kdeplot(data=df, x='BloodPressure', fill=True)


In [None]:
distplot('BloodPressure')

In [None]:
mean_target('BloodPressure')

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.boxplot(x = 'Outcome', y = 'BloodPressure', data = df, palette = 'Blues')
plt.title('BP vs Outcome')
plt.show()

Relationship between __Age__ and __BloodPressure__

In [None]:
# jointplot is a figure-level function that builds its own figure, so a preceding
# plt.figure() call would only leave an empty canvas in the cell output.
sns.jointplot(data=df, x='Age', y='BloodPressure', kind='reg', color='blue')
plt.show()

Observation :
- The mean of the blood pressure is greater for diabetic people as compared to the non-diabetic people
- As the age increases, generally the Blood Pressure also increases

### PREGNANCIES

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.kdeplot(data=df, x='Pregnancies', fill=True)

In [None]:
df['Pregnancies'].describe()

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.boxplot(data=df, x='Pregnancies')

In [None]:
outliers_index_pregnancies = df[df['Pregnancies']>13].index

outliers_index_pregnancies

In [None]:
df.drop(outliers_index_pregnancies, axis=0, inplace=True)

In [None]:
plt.figure(figsize=(8,4), dpi=150)

sns.boxplot(data=df, x='Pregnancies')

In [None]:
fig, ax = plt.subplots()

labels = ['Diabetic', 
         'Non-Diabetic']
# Derive the class shares from the current frame rather than hard-coding them, so
# the chart stays in step with the data after rows have been dropped.
outcome_share = df['Outcome'].value_counts(normalize=True) * 100
percentages = [outcome_share.get(1, 0.0), outcome_share.get(0, 0.0)]
explode=(0.1,0)
ax.pie(percentages, explode=explode, labels=labels, autopct='%1.0f%%', 
       shadow=False, startangle=0,   
       pctdistance=1.2,labeldistance=1.4)
ax.legend(frameon=False, bbox_to_anchor=(1.5,0.8))
plt.show()

In [None]:
def msv_1(data, thresh = 20, color = 'black', edgecolor = 'black', height = 3, width = 15):
    """
    Draw a bar chart of the percentage of missing values in each column of `data`.

    Bars are sorted from the highest percentage to the lowest. A red horizontal
    line is drawn at `thresh`, with one caption just above it and one just below.

    Parameters
    ----------
    data : DataFrame whose columns are inspected with isnull().
    thresh : y position (in percent) of the reference line.
    color, edgecolor : forwarded to the bar plot.
    height, width : figure size.

    Returns the result of plt.show().
    """
    plt.figure(figsize = (width, height))

    missing_pct = data.isnull().mean() * 100
    missing_pct.sort_values(ascending = False).plot.bar(color = color, edgecolor = edgecolor)
    plt.axhline(y = thresh, color = 'r', linestyle = '-')

    plt.title('Missing values percentage per column', fontsize=20, weight='bold' )

    # Both captions share one x position: the number of columns divided by 1.7.
    caption_x = len(data.isnull().sum()) / 1.7
    captions = (
        (thresh + 2.5, f'Columns with more than {thresh}% missing values', 'crimson'),
        (thresh - 0.5, f'Columns with less than {thresh}% missing values', 'green'),
    )
    for caption_y, message, text_color in captions:
        plt.text(caption_x, caption_y, message, fontsize=12, color=text_color,
                 ha='left', va='top')

    plt.xlabel('Columns', size=15, weight='bold')
    plt.ylabel('Missing values percentage')
    plt.yticks(weight ='bold')

    return plt.show()
msv_1(df, 20, color=sns.color_palette('Reds',15))

In [None]:
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)

In [None]:
msv_1(df, 20, color=sns.color_palette('Reds',15))

- Since Glucose, BP, SkinThickness and BMI are approximately normally distributed, we can replace their missing values with the __mean__
- For columns like Insulin and DiabetesPedigreeFunction, we will have to replace them with the median due to the effect of skewness.

In [None]:
df['Insulin'] = df['Insulin'].fillna(df['Insulin'].median()) 

for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
msv_1(df, 10, color=sns.color_palette('Greens',15))

Observation:
- Insulin has close to 50% missing values
- SkinThickness has almost 30% missing values

In [None]:
plt.figure(figsize=(8,4), dpi=150)

corr = df.corr()
# print(corr)
sns.heatmap(corr, 
         xticklabels=corr.columns, 
         yticklabels=corr.columns,
         annot=True)

In [None]:
corr = df.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure. This is the only figure the cell needs: an extra
# plt.figure() call beforehand would just leave an empty figure in the output.
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, explicitly on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot = True, ax=ax)

- From above Heatmap, it is clear that there is NO __Multicollinearity__ between the independent features

## MODEL BUILDING

### - STANDARDIZING THE DATA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

### Standardizing the data. Why?


- Standardizing a dataset involves rescaling the distribution of values so that the mean of observed values is 0 and the standard deviation is 1.

- This can be thought of as subtracting the mean value (centering the data) and then rescaling it. Scaling the features is of utmost importance because different features are on different scales. For example, Age takes double-digit values whereas DPF is a small fractional number, so without scaling the Age feature would have a larger effect on the model than DPF.

- Best practice is to use only the training set to figure out how to scale / normalize, then blindly apply the same transform to the test set.

- For example, say you're going to normalize the data by removing the mean and dividing by the standard deviation. If you use the whole dataset to figure out the feature mean and standard deviation, you're using knowledge about the distribution of the test set to set the scale of the training set - 'leaking' information.

- The right way to do this is to use only the training set to calculate the mean and standard deviation, normalize the training set, and then at test time, use that same (training) mean and standard deviation to normalize the test set.

In [None]:
scaler = StandardScaler()
sclaed_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()

logistic_model.fit(sclaed_X_train,y_train)

In [None]:
logistic_model.coef_

In [None]:
y_pred = logistic_model.predict(scaled_X_test)

y_pred

In [None]:
y_pred_probability = logistic_model.predict_proba(scaled_X_test)

y_pred_probability

## PERFORMANCE METRICS

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

In [None]:
confusion_mat = confusion_matrix(y_test,y_pred)
confusion_mat

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Draw on an explicit Axes so the dpi setting applies to the figure that is
# actually used (a separate plt.figure() call would stay empty).
fig, ax = plt.subplots(dpi=150)
ConfusionMatrixDisplay.from_estimator(logistic_model, scaled_X_test, y_test, ax=ax)
plt.show()

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
logistic_model_cv = LogisticRegressionCV()

logistic_model_cv.fit(sclaed_X_train,y_train)

In [None]:
logistic_model_cv.C_

In [None]:
logistic_model_cv.get_params()

In [None]:
logistic_model_cv.coef_

In [None]:
y_pred_cv = logistic_model_cv.predict(scaled_X_test)

In [None]:
coeffs = pd.Series(index=X.columns,data=logistic_model_cv.coef_[0])

coeffs

In [None]:
plt.figure(figsize=(8,4),dpi=150)
coeffs = coeffs.sort_values()
sns.barplot(x=coeffs.index,y=coeffs.values)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# plot_confusion_matrix no longer exists from scikit-learn 1.2 onward, and importing
# it unconditionally would abort this cell before the matrix below is computed.
# Import it only where it is still available, so older environments keep the name.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

confusion_matrix(y_test,y_pred_cv)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(logistic_model_cv, scaled_X_test, y_test)
plt.show()

In [None]:
print(classification_report(y_test,y_pred_cv))