## 1. Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import warnings 
warnings.filterwarnings('ignore')

## 2. Loading the dataset

In [2]:
# Read the cleaned dataset from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine without editing it.
DATA_PATH = r"C:\Users\vijayram\Data Science Intern\cleaned_data.csv"
data = pd.read_csv(DATA_PATH)

## 3. Splitting the dataset

In [19]:
# Separate the feature matrix from the target.
# Every column named below is removed from the features; 'Outcome' is the
# target itself, the remaining names are columns deliberately left out of X.
excluded_columns = [
    'Outcome',
    'NewBMI_Obesity class I',
    'NewBMI_Obesity class II',
    'NewBMI_Obesity class III',
    'NewBMI_Overweight',
    'NewBMI_Underweight',
    'NewInsulin_Normal',
    'NewGlucose_Low',
    'NewGlucose_Normal',
    'NewGlucose_Pre_diabetes',
]
X = data.drop(columns=excluded_columns)  # Independent variables
y = data['Outcome']  # Dependent variable

In [20]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# class proportions should be preserved in both subsets.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## 4. Data Scaling

In [21]:
# Standardise the features.
# The scaler is fitted on the training split only, and the same fitted scaler
# is then applied to both splits so no test-set statistics leak into training.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# on its own re-scales already scaled data.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)



## 5. Model Building

In [22]:
# Base estimator handed to the hyper-parameter search below.
# class_weight='balanced' asks scikit-learn to weight the classes by their
# frequency in y; random_state pins the forest's randomness.
classifier = RandomForestClassifier(class_weight='balanced', random_state=2)

In [23]:
# Candidate values keyed by 'C', 'gamma' and 'kernel'.
# NOTE(review): these are not RandomForestClassifier parameters (they look like
# SVC settings), and this dict is not referenced by any other cell shown in
# this notebook -- the search uses param_dist_rf instead. Probably a leftover.
param_dist = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
)

In [24]:
# Candidate hyper-parameter values sampled by RandomizedSearchCV for the
# Random Forest.
# max_depth must be None (unlimited) or an integer >= 1. The previous grid
# contained 0, which RandomForestClassifier rejects: every sampled candidate
# using it failed to fit and was scored as NaN, silently wasting search
# iterations because warnings are suppressed in this notebook. It is replaced
# by 5 so the grid keeps the same number of options.
param_dist_rf = {
    'n_estimators': [50, 100, 150, 200],      # Number of trees in the forest
    'max_depth': [None, 5, 10, 20],           # Maximum depth of each tree (None = no limit)
    'min_samples_split': [10, 20, 30, 40],    # Minimum number of samples required to split a node
    'min_samples_leaf': [2, 4, 10, 15],       # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]                # Whether to use bootstrap samples when building trees
}


In [25]:
# Configure a randomised hyper-parameter search around the base classifier.
# No scoring argument is given, so the estimator's own default score is used.
search_settings = {
    'param_distributions': param_dist_rf,  # value lists to sample from
    'n_iter': 10,                          # number of parameter settings that are sampled
    'cv': 5,                               # 5-fold cross-validation
    'verbose': 0,                          # 0 = no progress messages
    'random_state': 2,                     # for reproducible sampling
    'n_jobs': -1,                          # use all available cores
}
random_search = RandomizedSearchCV(classifier, **search_settings)


In [26]:
# Run the search: each sampled parameter setting is cross-validated on the
# (scaled) training split only; the test split is not touched here.
# As the last expression of the cell, the fitted search object is displayed.
random_search.fit(X_train, y_train)

In [27]:
# Retrieve and print the parameter setting that obtained the best
# cross-validation score during the search.
best_params = random_search.best_params_
print("Best parameters for Random Forest: ", best_params)

Best parameters for Random Forest:  {'n_estimators': 200, 'min_samples_split': 40, 'min_samples_leaf': 4, 'max_depth': 10, 'bootstrap': False}


In [28]:
# Take the estimator exposed by the search as best_estimator_ (available
# because the search was created with its default refit behaviour) and
# predict labels for the held-out test split.
best_rf_model = random_search.best_estimator_
y_pred = best_rf_model.predict(X_test)


In [29]:
# Share of test rows whose predicted label matches the true label,
# reported as a percentage.
accuracy_rf = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy_rf * 100
print("Random Forest accuracy (%):", accuracy_pct)

Random Forest accuracy (%): 79.22077922077922


In [30]:
# Confusion matrix of the tuned model on the test split.
# scikit-learn lays it out with true labels as rows and predicted labels as
# columns, so off-diagonal entries are the misclassified test rows.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[83 24]
 [ 8 39]]


In [31]:
# Compare accuracy on the training split with accuracy on the held-out test
# split. These are two accuracy figures, not the model's bias and variance;
# the gap between them is only a rough indicator of over-fitting, so they are
# labelled for what they actually are.
train_accuracy = best_rf_model.score(X_train, y_train)
test_accuracy = accuracy_rf  # computed earlier from y_test / y_pred

# Display the results
print(f'Training accuracy: {train_accuracy*100:.2f} %')
print(f'Test accuracy: {test_accuracy*100:.2f} %')

Bias: 85.99 %
Variance: 79.22 %


In [32]:
# Precision, recall and F1 on the test split, using the scorers' default
# settings, followed by the per-class breakdown.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print each headline metric rounded to two decimals.
headline_metrics = (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
)
for metric_name, metric_value in headline_metrics:
    print(f'{metric_name}: {metric_value:.2f}')

# Per-class precision / recall / F1 / support table.
report = classification_report(y_test, y_pred)
print(report)

Precision: 0.62
Recall: 0.83
F1 Score: 0.71
              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84       107
         1.0       0.62      0.83      0.71        47

    accuracy                           0.79       154
   macro avg       0.77      0.80      0.77       154
weighted avg       0.82      0.79      0.80       154



In [33]:
# Persist the tuned model to 'model.pkl' (relative path, so it lands in the
# current working directory).
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import joblib
joblib.dump(best_rf_model, 'model.pkl')
# The fitted scaler `sc` is written to 'scaler.pkl' by the following cell.


['model.pkl']

In [34]:
# Persist the fitted StandardScaler so the same scaling can be re-applied to
# new inputs before they are passed to the saved model.
joblib.dump(sc,'scaler.pkl' )

['scaler.pkl']

In [35]:
# Reload both artifacts from disk, replacing the in-memory model and scaler
# with the deserialised copies.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load files from a trusted source (here they were written by this notebook).
best_rf_model = joblib.load('model.pkl')
sc = joblib.load('scaler.pkl')