## 1. Load required library packages

In [2]:
!pip install matplotlib
#Load packages
import pandas as pd
import numpy as np
import sklearn.model_selection as mod
import sklearn.neighbors as nei
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, roc_auc_score, average_precision_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection  import cross_val_score
from sklearn.feature_selection import RFECV
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Collecting matplotlib
  Downloading matplotlib-3.10.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp311-cp311-win_amd64.whl.metadata (103 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.1.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.0-cp311-cp311-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------- ----------------------- 3.4/8.0 MB 18.4 MB/s eta 0:00:01


## 2. Data Exploration:

In [3]:
#Load dataset (CSV file) using Pandas
url="diabetes.csv"
pima = pd.read_csv(url)

In [4]:
# Check dimension of the DataFrame
pima.shape

(768, 9)

In [5]:
# Check the type of 'pima'
type(pima)

pandas.core.frame.DataFrame

In [6]:
# Get row indices 
pima_row_idx = pima.index
pima_row_idx

RangeIndex(start=0, stop=768, step=1)

In [7]:
# Get the column names 
pima_col_idx = pima.columns
pima_col_idx

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [8]:
# Get data type for each attribute 
pima.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [9]:
# Check the first 5 rows
pima.head (5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Check missing values
pima.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
# Create Nutritional status column 
Nutritional_status = pd.Series([])

# 3. Feature Engineering : (Data Preprocessing)

In [13]:
# Handling missing values
# Replace missing values (0s) with NaN for numeric columns only
numeric_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']
pima[numeric_cols] = pima[numeric_cols].replace(0, pd.NA)

In [14]:
# Imputation - using mean imputation for numeric columns
numeric_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']
pima[numeric_cols] = pima[numeric_cols].fillna(pima[numeric_cols].mean())

In [15]:
# Nutritional status based on BMI
for i in range(len(pima)): 
    if pima['BMI'][i] == 0.0: 
        Nutritional_status[i]="NA"
    
    elif pima['BMI'][i] < 18.5: 
        Nutritional_status[i]="Underweight"
        
    elif pima['BMI'][i] < 25: 
        Nutritional_status[i]="Normal"
  
    elif 25 <= pima['BMI'][i] < 30: 
        Nutritional_status[i]="Overweight"
  
    elif pima['BMI'][i] >= 30: 
        Nutritional_status[i]="Obese"
        
    else: 
        Nutritional_status[i]= pima['BMI'][i]

In [16]:
# Check if the column "Nutritional Status" already exists
if "Nutritional Status" not in pima.columns:
    # Define Nutritional_status data here, as a list, Series, or another appropriate structure
    Nutritional_status = [0] * len(pima)
    # Insert the "Nutritional Status" column at index 6
    pima.insert(6, "Nutritional Status", Nutritional_status)

In [17]:
# Check df containing new column
pima.head (5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Nutritional Status,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,0,2.288,33,1


In [18]:
pima['Nutritional Status'].value_counts()

Nutritional Status
0    768
Name: count, dtype: int64

In [20]:
# Create OGTT_Interpretation (Interpretation of Glucose level) column 
OGTT_Interpretation = pd.Series([]) 

In [21]:
# Interpretation of OGTT (The oral glucose tolerance test) - using OGTT levels recommended by DIABETES UK (2019)
for i in range(len(pima)): 
    if pima['Glucose'][i] == 0.0: 
        OGTT_Interpretation [i]="NA"
    elif pima['Glucose'][i] <= 140: 
        OGTT_Interpretation [i]="Normal"
    elif pima['Glucose'][i] > 198: 
        OGTT_Interpretation[i]="Diabetic Level"
    else: 
        OGTT_Interpretation [i]= pima['Glucose'][i]

In [22]:
# Insert new column - Glucose Result
pima.insert(2, "Glucose Result", OGTT_Interpretation)

In [23]:
pima['Glucose Result'].value_counts()

Glucose Result
Normal            576
146.0               9
158.0               8
144.0               7
147.0               7
143.0               6
151.0               6
154.0               6
173.0               6
162.0               6
155.0               5
145.0               5
142.0               5
181.0               5
179.0               5
180.0               5
141.0               5
148.0               4
168.0               4
152.0               4
165.0               4
187.0               4
189.0               4
197.0               4
166.0               3
183.0               3
184.0               3
164.0               3
194.0               3
163.0               3
171.0               3
150.0               3
196.0               3
156.0               3
167.0               3
161.0               3
159.0               2
176.0               2
174.0               2
153.0               2
188.0               2
170.0               2
195.0               2
193.0               2
175.0            

In [24]:
Impaired_Glucose_Tolerance_Diabetic = ((pima ['Glucose'] > 140 ) & (pima ['Glucose'] <= 198) & (pima ['Outcome'] == 1)).sum()
Impaired_Glucose_Tolerance_Diabetic

np.int64(131)

In [25]:
Normal_Glucose_Diabetic = ((pima ['Glucose'] != 0 ) & (pima ['Glucose'] <= 140) & (pima ['Outcome'] == 1)).sum()
Normal_Glucose_Diabetic

np.int64(136)

In [26]:
# Create Percentile of skin thickness column
Percentile_skin_thickness = pd.Series([]) 

In [27]:
# Check how many women are 80 or older (the Percentile skin thickness depend on skin fold and age)
pima['Age'].value_counts()

Age
22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
36    16
38    16
32    16
45    15
34    14
46    13
40    13
43    13
39    12
35    10
44     8
50     8
51     8
52     8
58     7
54     6
47     6
49     5
60     5
53     5
57     5
48     5
63     4
66     4
55     4
62     4
59     3
56     3
65     3
67     3
61     2
69     2
72     1
81     1
64     1
70     1
68     1
Name: count, dtype: int64

In [28]:
#  Check skin fold thickness Percentile
for i in range(len(pima)):
    if 20.0 <= pima["Age"][i] <= 79.0: 
        
        if pima["SkinThickness"][i] == 0.0: 
            Percentile_skin_thickness[i]=" 0 NA"
    
        elif pima["SkinThickness"][i] < 11.9: 
            Percentile_skin_thickness[i]="1 <P5th"
        
        elif pima["SkinThickness"][i] == 11.9: 
            Percentile_skin_thickness[i]="2 P5th"
        
        elif 11.9 < pima["SkinThickness"][i] < 14.0: 
            Percentile_skin_thickness[i]="3 P5th - P10th"
        
        elif pima["SkinThickness"][i] == 14.0: 
            Percentile_skin_thickness[i]="4 P10th"
        
        elif 14.0 < pima["SkinThickness"][i] < 15.8: 
            Percentile_skin_thickness[i]="5 P10th - P15th"
        
        elif pima["SkinThickness"][i] == 15.8: 
            Percentile_skin_thickness[i]="6 P15th"
    
        elif 15.8 < pima["SkinThickness"][i] < 18.0: 
            Percentile_skin_thickness[i]="7 P15th - P25th"
    
        elif pima["SkinThickness"][i] == 18.0: 
            Percentile_skin_thickness[i]="8 P25th"
        
        elif 18.0 < pima["SkinThickness"][i] < 23.5: 
            Percentile_skin_thickness[i]="9 P25th - P50th"
   
        elif pima["SkinThickness"][i] == 23.5: 
            Percentile_skin_thickness[i]="10 P50th"
        
        elif 23.5 < pima["SkinThickness"][i] < 29.0: 
            Percentile_skin_thickness[i]="11 P50th - P75th"
        
        elif pima["SkinThickness"][i] == 29.0: 
            Percentile_skin_thickness[i]="12 P75th"
        
        elif 29.0 < pima["SkinThickness"][i] < 31.9: 
            Percentile_skin_thickness[i]="13 P75th - P85th"
        
        elif pima["SkinThickness"][i] == 31.9: 
            Percentile_skin_thickness[i]="14 P85th"
        
        elif 31.9 < pima["SkinThickness"][i] < 33.7: 
            Percentile_skin_thickness[i]="15 P85th - P90th"
    
        elif pima["SkinThickness"][i] == 33.7: 
            Percentile_skin_thickness[i]="16 P90th"
        
        elif 33.7 < pima["SkinThickness"][i] < 35.9: 
            Percentile_skin_thickness[i]="17 P90th - P95th"
    
        elif pima["SkinThickness"][i] == 35.9: 
            Percentile_skin_thickness[i]="18 P95th"
        
        elif pima["SkinThickness"][i] > 35.9: 
            Percentile_skin_thickness[i]="19 >P95th"
            
    elif pima["Age"][i] >= 80.0:  #Only 1 woman is 81 years old
        if  pima["SkinThickness"][i] > 31.7: 
            Percentile_skin_thickness[i]="20 >P95th"

In [29]:
# Insert new column - Percentile of skin thickness
pima.insert(4, "Percentile skin thickness", Percentile_skin_thickness)

In [30]:
# Check the first 5 rows
pima.head(5)

Unnamed: 0,Pregnancies,Glucose,Glucose Result,BloodPressure,Percentile skin thickness,SkinThickness,Insulin,BMI,Nutritional Status,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,148.0,72.0,17 P90th - P95th,35.0,155.548223,33.6,0,0.627,50,1
1,1,85.0,Normal,66.0,12 P75th,29.0,155.548223,26.6,0,0.351,31,0
2,8,183.0,183.0,64.0,13 P75th - P85th,29.15342,155.548223,23.3,0,0.672,32,1
3,1,89.0,Normal,66.0,9 P25th - P50th,23.0,94.0,28.1,0,0.167,21,0
4,0,137.0,Normal,40.0,17 P90th - P95th,35.0,168.0,43.1,0,2.288,33,1


In [31]:
diabetic_malnourished_st = ((pima ['SkinThickness'] < 15.8) & (pima ['Outcome'] == 1)).sum()
diabetic_malnourished_st

np.int64(6)

In [366]:
diabetic_malnourished_bmi = ((pima ['BMI'] < 18.5) & (pima ['Outcome'] == 1)).sum()
diabetic_malnourished_bmi

0

In [32]:
diabetic_malnourished_bmi_st = ((pima ['BMI'] < 18.5) & (pima ['SkinThickness'] < 15.8) & (pima ['Outcome'] == 1)).sum()
diabetic_malnourished_bmi_st

np.int64(0)

In [36]:
# Minimum
print(pima.dtypes)
pima = pima.apply(pd.to_numeric, errors='coerce')  # Convert all possible columns to numbers
pima.min()

Pregnancies                    int64
Glucose                      float64
Glucose Result                object
BloodPressure                float64
Percentile skin thickness     object
SkinThickness                float64
Insulin                      float64
BMI                          float64
Nutritional Status             int64
DiabetesPedigreeFunction     float64
Age                            int64
Outcome                        int64
dtype: object


Pregnancies                    0.000
Glucose                       44.000
Glucose Result               141.000
BloodPressure                 24.000
Percentile skin thickness        NaN
SkinThickness                  7.000
Insulin                       14.000
BMI                            0.000
Nutritional Status             0.000
DiabetesPedigreeFunction       0.078
Age                           21.000
Outcome                        0.000
dtype: float64

In [37]:
# Maximum
pima.max()

Pregnancies                   17.00
Glucose                      199.00
Glucose Result               198.00
BloodPressure                122.00
Percentile skin thickness       NaN
SkinThickness                 99.00
Insulin                      846.00
BMI                           67.10
Nutritional Status             0.00
DiabetesPedigreeFunction       2.42
Age                           81.00
Outcome                        1.00
dtype: float64

In [38]:
#Check if the sample were classified as Underweight presented a BMI lower than 18.5
pima_underweight = pima[pima['Nutritional Status'] =='Underweight']
pima_underweight

Unnamed: 0,Pregnancies,Glucose,Glucose Result,BloodPressure,Percentile skin thickness,SkinThickness,Insulin,BMI,Nutritional Status,DiabetesPedigreeFunction,Age,Outcome


In [39]:
pima['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [40]:
# Another way of counting the outcome (diabetes)
count_not_diabetic = len(pima[pima['Outcome']==0])
count_not_diabetic

500

In [41]:
# Check the average of features grouped by Outcome (Diabetes)
pima.groupby('Outcome').mean(numeric_only=True)

Unnamed: 0_level_0,Pregnancies,Glucose,Glucose Result,BloodPressure,Percentile skin thickness,SkinThickness,Insulin,BMI,Nutritional Status,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3.298,110.710121,157.066667,70.935397,,27.768651,142.210761,30.3042,0.0,0.429734,31.19
1,4.865672,142.165573,167.366412,75.147324,,31.736944,180.431548,35.142537,0.0,0.5505,37.067164


In [42]:
pima.mean(numeric_only=True)

Pregnancies                    3.845052
Glucose                      121.686763
Glucose Result               164.130890
BloodPressure                 72.405184
Percentile skin thickness           NaN
SkinThickness                 29.153420
Insulin                      155.548223
BMI                           31.992578
Nutritional Status             0.000000
DiabetesPedigreeFunction       0.471876
Age                           33.240885
Outcome                        0.348958
dtype: float64

## 3. Model Selection

In [44]:
# Model Selection - Choose appropriate classification algorithms to train on the dataset (Using Logistic Regression algorithm)
pima_all = pima.loc[(pima['BMI'] != 0) & (pima['Insulin'] != 0) & (pima['BloodPressure'] != 0) & (pima['Glucose'] != 0)]
logRegression = LogisticRegression()

#1st Iteration - 8 variables
var1=["Pregnancies", "Glucose","BloodPressure","SkinThickness","Insulin", "BMI","DiabetesPedigreeFunction", "Age"]
X=pima_all[var1]
y=pima_all.Outcome

In [45]:
## Defining the model and assigning Y (Dependent) and X (Independent Variables)
logit_model=sm.Logit(y,X)

## Fitting the model and publishing the results
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.579003
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  757
Model:                          Logit   Df Residuals:                      749
Method:                           MLE   Df Model:                            7
Date:                Wed, 12 Feb 2025   Pseudo R-squ.:                  0.1069
Time:                        23:26:53   Log-Likelihood:                -438.31
converged:                       True   LL-Null:                       -490.76
Covariance Type:            nonrobust   LLR p-value:                 1.038e-19
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Pregnancies                  0.1303      0.030      4.405      0.000       0.072

In [46]:
#2nd Iteration - 3 variables
var2=["Pregnancies", "Glucose","BloodPressure"]
X=pima_all[var2]
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.582791
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  757
Model:                          Logit   Df Residuals:                      754
Method:                           MLE   Df Model:                            2
Date:                Wed, 12 Feb 2025   Pseudo R-squ.:                  0.1010
Time:                        23:26:56   Log-Likelihood:                -441.17
converged:                       True   LL-Null:                       -490.76
Covariance Type:            nonrobust   LLR p-value:                 2.903e-22
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Pregnancies       0.1191      0.025      4.774      0.000       0.070       0.168
Glucose           0.

In [48]:
var2=["Pregnancies", "Glucose","BloodPressure"]
X=pima_all[var2]
y=pima_all.Outcome
logRegression.fit(X,y)

## Defining the y_pred variable for the predicting values. 
y_pred=logRegression.predict(X)

## Calculating the precision of the model
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       491
           1       0.71      0.52      0.60       266

    accuracy                           0.76       757
   macro avg       0.74      0.70      0.71       757
weighted avg       0.75      0.76      0.75       757



In [49]:
## Confusion matrix gives the number of cases that the model is able to accurately predict the outcomes, and the number of cases the model gives false positive and false negatives
confusion_matrix = confusion_matrix(y, y_pred)
print(confusion_matrix)

[[433  58]
 [127 139]]


## 3. Model Training

In [50]:
# Model Training: Split the dataset into training and testing sets. Train the selected models on the training data.
# Predict the outcome - knn
inputs = pima_all[['Glucose', 'Insulin', 'SkinThickness', 'BMI', 'Age']]

In [51]:
outputs = pima_all['Outcome']

In [52]:
knn = nei.KNeighborsClassifier(n_neighbors=5)

In [53]:
knn.fit(inputs, outputs)

In [54]:
# Evaluate knn
(knn.predict(inputs) == outputs).sum()

np.int64(613)

In [55]:
# Split the dataset into training and testing sets - use the existing training i/p & o/p
inputs_train, inputs_test, outputs_train, outputs_test = mod.train_test_split(inputs,outputs, test_size =0.33)

In [56]:
knn = nei.KNeighborsClassifier(n_neighbors=5)
knn.fit(inputs_train, outputs_train)

In [57]:
(knn.predict(inputs_test) == outputs_test).sum()

np.int64(185)

In [58]:
#Count how many women are Positive (1) and Negative (0) for diabetes in the test set 
outputs_test.value_counts()

Outcome
0    166
1     84
Name: count, dtype: int64

## 4. Model Evaluation

In [59]:
# Model Evaluation: Evaluate the models using appropriate metrics such as MAE, MSE, RMSE, accuracy, precision, recall, and F1-score. Use techniques like cross-validation to ensure the robustness of the results.

# Mean Absolute Error (MAE)
# Train a logistic regression model
logRegression.fit(inputs_train, outputs_train)

In [60]:
# Predict the target variable on the testing set
y_pred = logRegression.predict(inputs_test)

# Calculate mean absolute error
mae = mean_absolute_error(outputs_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 0.216


In [61]:
# Mean Squared Error (MSE)

# Standardize the features
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(inputs_train)
X_test_scaled = scalar.transform(inputs_test)

In [62]:
# Train a logistic regression model
logRegression.fit(X_train_scaled, outputs_train)

In [63]:
# Predict the target variable on the testing set
y_pred = logRegression.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(outputs_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.216


In [64]:
# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 0.46475800154489


In [65]:
# Check classification accuracy with knn = 5
outputs_pred = knn.predict(inputs_test)
accuracy = metrics.accuracy_score(outputs_test, outputs_pred)
accuracy

0.74

In [66]:
# 10-fold cross-validation with knn = 5
scores = cross_val_score (knn, inputs, outputs, cv =10, scoring = 'accuracy')
scores

array([0.65789474, 0.73684211, 0.69736842, 0.64473684, 0.72368421,
       0.73684211, 0.72368421, 0.85333333, 0.72      , 0.70666667])

In [68]:
scores.mean()

np.float64(0.7201052631578947)

In [69]:
# precision
new_inputs = pima_all[['Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age']]
new_inputs_train, new_inputs_test, outputs_train, outputs_test = mod.train_test_split(new_inputs,outputs, test_size =0.33)
mnb = MultinomialNB()
mnb.fit(inputs_train, outputs_train)
out_pred = mnb.predict(inputs_test)

precision_nb = metrics.precision_score(outputs_test, out_pred)
precision_nb

0.3047619047619048

In [70]:
# Recall
recall = metrics.recall_score(outputs_test, out_pred)
recall

0.4050632911392405

In [71]:
class_rep = classification_report(outputs_test, out_pred)
class_rep

'              precision    recall  f1-score   support\n\n           0       0.68      0.57      0.62       171\n           1       0.30      0.41      0.35        79\n\n    accuracy                           0.52       250\n   macro avg       0.49      0.49      0.48       250\nweighted avg       0.56      0.52      0.53       250\n'

In [72]:
out_pred_prob = mnb.predict_proba(inputs_test)[:,1]
out_pred_prob

array([6.27089748e-01, 1.50075980e-01, 6.91150045e-02, 6.03681338e-01,
       7.51802419e-01, 4.60255903e-06, 1.13649474e-02, 3.18781788e-01,
       9.61332121e-03, 4.86286439e-01, 4.29658805e-01, 5.73075054e-02,
       7.03087401e-01, 2.37375009e-01, 1.46592297e-01, 1.62667691e-01,
       6.12353253e-09, 1.78748853e-01, 8.39655893e-01, 3.27243059e-01,
       2.35678730e-01, 3.24699135e-01, 1.44887104e-01, 2.46129102e-01,
       9.80455508e-01, 5.00667597e-01, 9.24799670e-01, 9.85310293e-01,
       1.71993763e-01, 6.79653650e-01, 5.84113044e-01, 6.53637454e-02,
       3.62152593e-01, 1.25313353e-02, 1.33379924e-01, 7.87970003e-01,
       9.51361200e-01, 2.92892497e-01, 4.76586270e-01, 9.60900419e-01,
       6.65387541e-02, 7.61706658e-01, 9.96221478e-01, 1.17819717e-02,
       7.81215781e-01, 9.89612668e-01, 9.69040682e-01, 2.03816977e-01,
       5.81571735e-01, 6.37539036e-01, 1.09686130e-01, 1.39172620e-01,
       2.86810951e-01, 2.67116489e-02, 9.68219334e-02, 3.58644800e-01,
      

In [73]:
ras = roc_auc_score(outputs_test, out_pred_prob)
ras

np.float64(0.4736842105263158)

In [74]:
aps = average_precision_score(outputs_test, out_pred_prob)
aps

np.float64(0.3034814721915452)

In [75]:
# F1-Score
f1 = f1_score(outputs_test, y_pred)
print("F1-score:", f1)

F1-score: 0.24489795918367346


In [76]:
# Cross Validation
# Recursive feature elimination with cross-validation

kf = KFold(10)


In [77]:
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

In [78]:
lg = LogisticRegression()
rfecv = RFECV (estimator=lg,step=1, cv=skf, scoring='accuracy')
rfecv.fit(inputs, outputs)


In [80]:
feature_names = pima_all.columns[:10]
feature_names

Index(['Pregnancies', 'Glucose', 'Glucose Result', 'BloodPressure',
       'Percentile skin thickness', 'SkinThickness', 'Insulin', 'BMI',
       'Nutritional Status', 'DiabetesPedigreeFunction'],
      dtype='object')

In [82]:
X = pima_all[feature_names]

In [83]:
new_features = list(filter(lambda x: x[1],zip(feature_names, rfecv.support_)))
new_features

[('Pregnancies', np.True_),
 ('Glucose', np.True_),
 ('Glucose Result', np.True_),
 ('BloodPressure', np.True_),
 ('Percentile skin thickness', np.True_)]

In [84]:
# Cross-validation
scores = cross_val_score (lg, inputs, outputs, cv =10, scoring = 'accuracy')
scores

array([0.76315789, 0.77631579, 0.73684211, 0.72368421, 0.73684211,
       0.77631579, 0.78947368, 0.81333333, 0.74666667, 0.85333333])

## 6. Hyper parameter Tuning

In [85]:
rf = RandomForestClassifier()

# Define hyper parameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [10, 'sqrt'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [86]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the model to the training data
random_search.fit(inputs_train, outputs_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [88]:
y_pred = random_search.predict(inputs_test)
print("Accuracy:", accuracy_score(outputs_test, y_pred))

Accuracy: 0.584


## 7. Final Model Selection

In [89]:
# Evaluate the best model on the test set
precision = precision_score(outputs_test, y_pred)
recall = recall_score(outputs_test, y_pred)
f1 = f1_score(outputs_test, y_pred)
roc_auc = roc_auc_score(outputs_test, y_pred)

print(f"Accuracy: {accuracy_score(outputs_test, y_pred)}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy: 0.584
Precision: 0.12121212121212122
Recall: 0.05063291139240506
F1-score: 0.07142857142857142
ROC AUC: 0.4405211340587757


It seems like the model's performance is quite low overall, but it's essential to consider the specific requirements of the problem. In this scenario, considering the low recall and F1-score, we might prioritize models that better capture true positive cases. However, the F1-score is a commonly used metric for imbalanced datasets like the Pima Indians Diabetes Dataset, as it balances both precision and recall.