# Step 01: Loading Dataset and Providing Column Names 

In [172]:
import pandas as pd

# Define the column names
column_names = [
    'Number of times pregnant',
    'Plasma glucose concentration',
    'Diastolic blood pressure',
    'Triceps skinfold thickness',
    '2-Hour serum insulin',
    'Body mass index',
    'Diabetes pedigree function',
    'Age',
    'Class variable'
]

# Load the dataset
df = pd.read_csv('indiansDiabetes.csv', header=None, names=column_names)
print(df.head())



   Number of times pregnant  Plasma glucose concentration  \
0                         6                           148   
1                         1                            85   
2                         8                           183   
3                         1                            89   
4                         0                           137   

   Diastolic blood pressure  Triceps skinfold thickness  2-Hour serum insulin  \
0                        72                          35                     0   
1                        66                          29                     0   
2                        64                           0                     0   
3                        66                          23                    94   
4                        40                          35                   168   

   Body mass index  Diabetes pedigree function  Age  Class variable  
0             33.6                       0.627   50               1  
1     

In [173]:
df.shape

(768, 9)

# Step 02: Checking Zeros in Specific Columns

In [175]:
import numpy as np


columns_to_replace = ['Plasma glucose concentration', 'Diastolic blood pressure', 
                      'Triceps skinfold thickness', '2-Hour serum insulin', 
                      'Body mass index']

df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)

nan_check_before = df[columns_to_replace].isna().sum()
print("Count of NaN values per column before imputation:")
print(nan_check_before)

mean_values = df[columns_to_replace].mean()

# Print mean values for debugging
print("\nMean values for imputation:")
print(mean_values)

df[columns_to_replace] = df[columns_to_replace].fillna(mean_values)

nan_check_after = df[columns_to_replace].isna().sum()
print("\nCount of NaN values per column after imputation:")
print(nan_check_after)

Count of NaN values per column before imputation:
Plasma glucose concentration      5
Diastolic blood pressure         35
Triceps skinfold thickness      227
2-Hour serum insulin            374
Body mass index                  11
dtype: int64

Mean values for imputation:
Plasma glucose concentration    121.686763
Diastolic blood pressure         72.405184
Triceps skinfold thickness       29.153420
2-Hour serum insulin            155.548223
Body mass index                  32.457464
dtype: float64

Count of NaN values per column after imputation:
Plasma glucose concentration    0
Diastolic blood pressure        0
Triceps skinfold thickness      0
2-Hour serum insulin            0
Body mass index                 0
dtype: int64


# Step 03: Normalizing the features in range of 0-1

In [177]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

features = df.drop(columns=['Class variable'])

scaled_features = scaler.fit_transform(features)

scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

scaled_df['Class variable'] = df['Class variable'].values

print(scaled_df.head())

   Number of times pregnant  Plasma glucose concentration  \
0                  0.352941                      0.670968   
1                  0.058824                      0.264516   
2                  0.470588                      0.896774   
3                  0.058824                      0.290323   
4                  0.000000                      0.600000   

   Diastolic blood pressure  Triceps skinfold thickness  2-Hour serum insulin  \
0                  0.489796                    0.304348              0.170130   
1                  0.428571                    0.239130              0.170130   
2                  0.408163                    0.240798              0.170130   
3                  0.428571                    0.173913              0.096154   
4                  0.163265                    0.304348              0.185096   

   Body mass index  Diabetes pedigree function       Age  Class variable  
0         0.314928                    0.234415  0.483333               

# Step 04: Splitting the Dataset in train and test (80-20)

In [179]:
from sklearn.model_selection import train_test_split

X = scaled_df.drop(columns=['Class variable'])
y = scaled_df['Class variable']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Step 05: Training, Predicting and Calculating varoius metrics 

In [181]:
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# Initialize the models
models = {
    'Perceptron': Perceptron(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
}

# Dictionary to store results (accuracy, precision, recall, F1-score)
results = {}

# Train the models, get metrics, and print classification report
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate various metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Store the results in the dictionary
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }
    
    # Print the classification report for each model
    print(f'{name} Classification Report:\n')
    print(classification_report(y_test, y_pred))
    print('---' * 20)  # Separator between models


Perceptron Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       100
           1       0.72      0.39      0.51        54

    accuracy                           0.73       154
   macro avg       0.73      0.65      0.66       154
weighted avg       0.73      0.73      0.71       154

------------------------------------------------------------
LogisticRegression Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.83      0.79       100
           1       0.60      0.48      0.54        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.66       154
weighted avg       0.70      0.71      0.70       154

------------------------------------------------------------
DecisionTree Classification Report:

              precision    recall  f1-score   support

           0       0.71      0.79      0.75       100
