# ***Engr.Muhammad Javed***

## **Randomized SearchCV**

## *Import Libraries*

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder , StandardScaler , FunctionTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.model_selection import train_test_split ,GridSearchCV , RandomizedSearchCV
from sklearn.linear_model import LogisticRegression , LinearRegression
from sklearn.metrics import confusion_matrix, classification_report
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.plotting import plot_decision_regions
from sklearn.svm import SVC , SVR
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics.pairwise import linear_kernel


## *Load Dataset*

In [2]:
Dataset =  pd.read_csv(r"E:\AI and Data Science\ML with Engr.Muhammad Javed\File\diabetes.csv")

## *Show Dataset*


In [3]:
Dataset.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## *Checking Dataset Information*


In [4]:
Dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## *Viewing Statistical Summary of the Dataset*


In [5]:
Dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## *Checking for Missing Values in the Dataset*


In [6]:
Dataset.isnull().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## *Split into Features (X) and Target (y)*

In [7]:
x = Dataset.drop(["Outcome"] , axis = 1)
y = Dataset["Outcome"]

## *Train-Test Split*

In [8]:
X_train , X_test , Y_train , Y_test = train_test_split(x , y , random_state= 42 , test_size=0.2)

print("Training Data Shape : " , X_train.shape)
print("Testing Data Shape : " , X_test.shape)

Training Data Shape :  (614, 8)
Testing Data Shape :  (154, 8)


## *Train Decision Tree (Without RandomizedSearchCV)*

In [9]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train , Y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [10]:
y_pred = dt.predict(X_test)

print("Accuracy : " , accuracy_score(Y_test , y_pred)*100)
print("Classification Report : " , classification_report(Y_test , y_pred))

Accuracy :  74.67532467532467
Classification Report :                precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



## *Apply RandomizedSearchCV for Hyperparameter Tuning*

In [11]:
param_dist  = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeClassifier(random_state=42)

random_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist,
    n_iter=30,              # number of random combinations to try
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit random search
random_search.fit(X_train, Y_train)

# Display best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_ * 100)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'criterion': 'log_loss'}
Best Cross-Validation Accuracy: 75.90030654404904


In [12]:
# Step 7: Evaluate best model
best_dt_random = random_search.best_estimator_

y_pred_random = best_dt_random.predict(X_test)

print("Accuracy (With RandomizedSearchCV):", accuracy_score(Y_test, y_pred_random) * 100)
print("\nClassification Report:\n", classification_report(Y_test, y_pred_random))


Accuracy (With RandomizedSearchCV): 76.62337662337663

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154



## *Train Decision Tree (With RandomizedSearchCV)*

In [14]:
dt1 = DecisionTreeClassifier(min_samples_split =  10, min_samples_leaf = 4, max_depth = 3, criterion = 'log_loss')
dt1.fit(X_train , Y_train)

0,1,2
,criterion,'log_loss'
,splitter,'best'
,max_depth,3
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
y_pred1 = dt1.predict(X_test)

print("Accuracy : " , accuracy_score(Y_test , y_pred1)*100)
print("Classification Report : " , classification_report(Y_test , y_pred1))

Accuracy :  76.62337662337663
Classification Report :                precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154

