# Features Selection

In [2]:
# importing the important libraries
import pandas as pd
import numpy as np

# train test split
from sklearn.model_selection import train_test_split

# model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Serialization
import pickle

### Dataset loading

In [10]:
from pathlib import Path

# Dataset path, relative to the directory that contains `notebooks/`.
dataset_path = "./notebooks/cleaned_dataset.csv"

# If that path does not resolve from the current working directory (for
# example when the kernel is started inside `notebooks/` itself), fall back to
# a file with the same name in the working directory. When neither exists the
# original path is kept, so the resulting error message is unchanged.
if not Path(dataset_path).is_file():
    sibling_path = Path(dataset_path).name
    if Path(sibling_path).is_file():
        dataset_path = sibling_path

# Load the CSV into a pandas DataFrame.
df = pd.read_csv(dataset_path)

FileNotFoundError: [Errno 2] No such file or directory: './notebooks/cleaned_dataset.csv'

### Data preview

In [None]:
# Show the first three rows as a quick sanity check of the loaded data.
df.iloc[:3]

# Selecting Target and features

In [None]:
# Predictor columns passed to every classifier below.
features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Label column, kept as a one-element list.
target = ["Outcome"]

### The target is known and categorical, so this is a supervised classification problem

In [None]:
# Feature matrix: one column per predictor.
X = df[features]

# Label vector. `target` is a one-element list, so `df[target]` is an
# (n_samples, 1) DataFrame; squeeze it to a 1-D Series, which is the shape
# scikit-learn estimators expect for `y` (a column vector triggers a
# DataConversionWarning on fit).
y = df[target].squeeze(axis="columns")

# Train Test Split
The dataset is split into training and testing sets using train_test_split(). Here, we use 80% of the data for training and 20% for testing.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dimensions of the training split: (feature matrix shape, label shape).
(X_train.shape, y_train.shape)

In [None]:
# Dimensions of the test split: (feature matrix shape, label shape).
(X_test.shape, y_test.shape)

# Modeling

# 1. Logistic Regression
* The model is trained on the training data using fit().

In [None]:
# Baseline linear classifier. scikit-learn's default limit of 100 solver
# iterations can stop before the coefficients converge (ConvergenceWarning),
# and no feature scaling is applied in this notebook, so allow more iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Prediction using Logistic Regression
* We make predictions on the test set using predict()

In [None]:
# Predicted class labels for the held-out rows.
lr_model_y_pred = lr_model.predict(X=X_test)

# Evaluation
* Finally, we evaluate the model using f1_score, classification report, and confusion matrix. These metrics provide insights into how well the model performs.

### a. F1 score

In [None]:
# F1 score of the logistic-regression predictions on the test split.
lr_model_f1_score = f1_score(y_test, lr_model_y_pred)
print(
    f"The f1 score of logistic regression is {round(lr_model_f1_score, 3)}."
)

### b. Classification report

In [None]:
# Per-class precision / recall / F1 summary for logistic regression.
lr_model_report = classification_report(y_test, lr_model_y_pred)
print(lr_model_report)

### c. Confusion matrix

In [None]:
# Plot the confusion matrix of the fitted logistic-regression model on the test split.
ConfusionMatrixDisplay.from_estimator(
    estimator=lr_model,
    X=X_test,
    y=y_test,
)

# 2. Decision Tree Classification
* The model is trained on the training data using fit().

In [None]:
# Decision tree with default hyperparameters. Tree construction involves
# randomness, so seed it (same seed as the split and the random forest) to
# keep the reported score reproducible between runs.
dtc_model = DecisionTreeClassifier(random_state=42)
dtc_model.fit(X_train, y_train)

# Prediction using decision tree classification
* We make predictions on the test set using predict()

In [None]:
# Predicted class labels from the decision tree for the held-out rows.
dtc_model_y_pred = dtc_model.predict(X=X_test)

# Evaluation
* Finally, we evaluate the model using f1_score and classification report. These metrics provide insights into how well the model performs.

### a. F1 Score

In [None]:
# F1 score of the decision-tree predictions on the test split.
dtc_model_f1_score = f1_score(y_true=y_test, y_pred=dtc_model_y_pred)
# Message text corrected: "decision" was misspelled and had a doubled space.
print(f"The f1 score of decision tree classification is {round(dtc_model_f1_score, 3)}.")

### b. Classification report

In [None]:
# Per-class precision / recall / F1 summary for the decision tree.
dtc_model_report = classification_report(y_test, dtc_model_y_pred)
print(dtc_model_report)

### 3. SVM
* The model is trained on the training data using fit().

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svc_model = SVC()
svc_model.fit(X=X_train, y=y_train)

# Prediction using SVM
* We make predictions on the test set using predict()

In [None]:
# Predicted class labels from the SVM for the held-out rows.
svc_model_y_pred = svc_model.predict(X=X_test)

# Evaluation
* Finally, we evaluate the model using f1_score. This metric provides insight into how well the model performs.

In [None]:
# F1 score of the SVM predictions on the test split.
svc_model_f1_score = f1_score(y_test, svc_model_y_pred)
print(
    f"The f1 score of SVM is {round(svc_model_f1_score, 3)}."
)

### 4. KNN
* The model is trained on the training data using fit().

In [None]:
# k-nearest-neighbours classifier; `n_neighbors` is a hyperparameter to be tuned.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X=X_train, y=y_train)

# prediction using KNN
* We make predictions on the test set using predict()

In [None]:
# Predicted class labels from KNN for the held-out rows.
knn_model_y_pred = knn_model.predict(X=X_test)

# Evaluation
* Finally, we evaluate the model using f1_score. This metric provides insight into how well the model performs.

In [None]:
# F1 score of the KNN predictions on the test split.
knn_model_f1_score = f1_score(y_test, knn_model_y_pred)
print(
    f"The f1 score of KNN is {round(knn_model_f1_score, 3)}."
)

# Hyperparameter tuning

In [None]:
# Candidate forest sizes: six evenly spaced values from 100 to 1200, truncated to int.
n_estimators = list(map(int, np.linspace(100, 1200, 6)))

# Candidate tree depths: four evenly spaced values from 5 to 30, truncated to int.
max_depth = list(map(int, np.linspace(5, 30, 4)))

In [None]:
# Hyperparameter candidates sampled by the randomized search.
random_grid = {
    "n_estimators": n_estimators,
    # "auto" is no longer accepted for `max_features` in scikit-learn >= 1.3
    # (for classifiers it was an alias of "sqrt"), so use valid options only.
    "max_features": ["sqrt", "log2"],
    "max_depth": max_depth,
    # Duplicate value removed so no candidate is evaluated twice.
    "min_samples_split": [5, 10, 100],
}

# `max_depth` is supplied by the search, so it is not fixed on the base estimator.
random_forest_model = RandomForestClassifier(random_state=42)

# 3-fold randomized search. Candidates are ranked by F1, the metric used to
# compare models in this notebook, and the sampling is seeded so that the
# selected configuration is reproducible.
rf_random = RandomizedSearchCV(
    estimator=random_forest_model,
    param_distributions=random_grid,
    scoring="f1",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized hyperparameter search on the training split.
rf_random.fit(X=X_train, y=y_train)

# prediction using random Forest classifier
* We make predictions on the test set using predict()

In [None]:
# Predicted class labels from the fitted search object for the held-out rows.
rf_random_y_pred = rf_random.predict(X=X_test)

# Evaluation
* Finally, we evaluate the model using f1_score. This metric provides insight into how well the model performs.

In [None]:
# F1 score of the random forest on the test split, computed from the
# predictions made by the fitted search object (`rf_random_y_pred`).
rf_random_f1_score = f1_score(y_true=y_test, y_pred=rf_random_y_pred)
print(f"The f1 score of random forest classifier is {round(rf_random_f1_score, 3)}.")

# Comparison of F1 score of different algorithms

In [None]:
# Column data for the comparison table: one row per algorithm, in the same
# order as the F1 scores computed above. The decision-tree label is spelled
# to match the scikit-learn class name.
algorithms = {
    "Algorithms": [
        "LogisticRegression",
        "DecisionTreeClassifier",
        "SVC",
        "KNeighborsClassifier",
        "RandomForestClassifier",
    ],
    "F1 Score": [
        lr_model_f1_score,
        dtc_model_f1_score,
        svc_model_f1_score,
        knn_model_f1_score,
        rf_random_f1_score,
    ],
}


In [None]:
# Tabulate the algorithm names and their F1 scores.
f1_score_datafram = pd.DataFrame(data=algorithms)

In [None]:
# Bare expression: evaluates the comparison table (a notebook renders it as the cell output).
f1_score_datafram

### Logistic regression has the highest F1 score among all the algorithms, so LogisticRegression is the best choice for this data

# Serialization

In [None]:
# Persist the fitted logistic-regression model to disk in binary pickle format.
with open("final_model.pickle", "bw") as model_file:
    pickle.dump(lr_model, model_file)