# Import required libraries

In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV


In [140]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import pickle

In [121]:
df = pd.read_csv('D:\\Codes\\PIMA diabetes - End to End ML Proj\\notebook\\diabetes.csv')

In [122]:
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


# X , y Split

In [148]:
X = df.drop(columns = ['Outcome'],axis = 1)
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [149]:
y = df['Outcome']
y


0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

# ColumnTransformer - Standardisation

In [150]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [151]:
num_features = X.select_dtypes(exclude="object").columns
num_processor = StandardScaler()

In [152]:
preprocessor = ColumnTransformer([("Standard Scaler",num_processor,num_features)]) # for  numerical columns

In [153]:
X = preprocessor.fit_transform(X)

In [154]:
X.shape

(768, 8)

# Train - Test Split

In [158]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=65)
X_train.shape, X_test.shape

((537, 8), (231, 8))

In [159]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [132]:
# def evaluation(true,predicted):
#     mae = mean_absolute_error(true,predicted)
#     # mse = mean_squared_error(true,predicted)
#     rmse = np.sqrt(mse)
#     r2_value = r2_score(true,predicted)
#     return mae,mse,rmse,r2_value

In [86]:
# a,b,c,d = evaluation([20,30,40],[30,40,40])

In [87]:
# print(a," ",b," ",c," ",d," ")

6.666666666666667   66.66666666666667   8.16496580927726   0.0  


#### - Logistic Regression
#### - Decision Trees
#### - Random Forest
#### - Support Vector Machines (SVM)
#### - K-Nearest Neighbors (KNN)
#### - Naive Bayes
#### - Gradient Boosting Machines (GBM)
#### - Neural Networks
#### - AdaBoost
#### - XGBoost
#### - CatBoost

In [157]:
# # Classification models

# models = {
#     "Logistic Regression":LogisticRegression(),
#     "Decision Trees": DecisionTreeClassifier(),
#     "Random Forest": RandomForestClassifier(),
#     "Support Vector Machines (SVM)": SVC(),
#     "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
#     "Naive Bayes": GaussianNB(),
#     "Gradient Boosting Machines (GBM)": GradientBoostingClassifier(),
#     "Neural Networks": MLPClassifier(),
#     "AdaBoost": AdaBoostClassifier(),
 
# }

In [160]:

models = {
    'SVC': SVC(),
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'KNeighbors': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoosting':GradientBoostingClassifier(),
    'NeuralNetwork': MLPClassifier(),
}

from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'SVC': {'C':[0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf']},
#     'LogisticRegression': {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 100]},
#     'RandomForest': {'n_estimators': [10,20,50, 100, 200], 'max_depth': [None,3,2, 10, 20]},
#     'DecisionTree': {'max_depth':[3, 5, 7, 9, 11, 13]},
#     'KNeighbors': {'n_neighbors': [3, 20, 2]},
#     'AdaBoost':{'n_estimators': [10,20,50, 100, 200]}
    
# }

param_grid = {
    'SVC': {'C':[0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf']},
    'LogisticRegression': {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 100]},
    'RandomForest': {'n_estimators': [10,20,50, 100, 200], 'max_depth': [None,3,2, 10, 20]},
    'DecisionTree': {'max_depth':[3, 5, 7, 9, 11, 13]},
    'KNeighbors': {'n_neighbors': [3, 20, 2]},
    'AdaBoost':{'n_estimators': [10,20,50, 100, 200]},
    'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.05, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'NeuralNetwork': {'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)], 'activation': ['logistic', 'relu'], 'alpha': [0.0001, 0.001, 0.01]}
}


results_list = []

for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_ # gives the best parameter
    accuracy = grid_search.best_score_ # gives the best score obtained by selecting the best parameter
    results_list.append({'Model': model_name, 'Best_Params': best_params, 'Accuracy': accuracy}) # appending as dict into results_list list

results_df = pd.DataFrame(results_list) # converting the list of dicts into dataframe(Structured format)

results_df



Unnamed: 0,Model,Best_Params,Accuracy
0,SVC,"{'C': 1, 'kernel': 'linear'}",0.772828
1,LogisticRegression,{'C': 1},0.759761
2,RandomForest,"{'max_depth': 10, 'n_estimators': 200}",0.767307
3,DecisionTree,{'max_depth': 11},0.720613
4,KNeighbors,{'n_neighbors': 20},0.741173
5,AdaBoost,{'n_estimators': 20},0.741121
6,GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.759796
7,NeuralNetwork,"{'activation': 'relu', 'alpha': 0.001, 'hidden...",0.774697


In [162]:
results_df.sort_values(by=["Accuracy"],ascending=False)

Unnamed: 0,Model,Best_Params,Accuracy
7,NeuralNetwork,"{'activation': 'relu', 'alpha': 0.001, 'hidden...",0.774697
0,SVC,"{'C': 1, 'kernel': 'linear'}",0.772828
2,RandomForest,"{'max_depth': 10, 'n_estimators': 200}",0.767307
6,GradientBoosting,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.759796
1,LogisticRegression,{'C': 1},0.759761
4,KNeighbors,{'n_neighbors': 20},0.741173
5,AdaBoost,{'n_estimators': 20},0.741121
3,DecisionTree,{'max_depth': 11},0.720613
