In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score


In [2]:
df = pd.read_csv('./breast_cancer.csv')

In [3]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [5]:
# Diagnosis - Y Variable
# Beningn - Cancerous
# Malignant - Non Cancerous

# Checking Distribution of y variable
df['diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [6]:
df.shape

(569, 33)

In [7]:
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})

In [8]:
df['diagnosis']

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [9]:
X = df.iloc[:,[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]] # 20 cols
y = df['diagnosis'].values

In [10]:
# Train Test Split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


In [11]:
len(X_train)/len(df)*100

79.96485061511423

In [12]:
len(X_test)/len(df)*100

20.035149384885763

In [13]:
# Feature Scaling - Standardisation , Normalisation

from sklearn.preprocessing import StandardScaler

sc= StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
# First Model - Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
model =RandomForestClassifier()
model.fit(X_train,y_train)


In [15]:

# Predicting the values
y_pred_rf = model.predict(X_test)


In [16]:
# Metrics

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

result = accuracy_score( y_test,y_pred_rf)
print(result)


0.9298245614035088


In [17]:
cm = confusion_matrix(y_test,y_pred_rf)
print(cm)

[[63  4]
 [ 4 43]]


In [18]:
cr = classification_report(y_test,y_pred_rf)
print(cr)

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        67
           1       0.91      0.91      0.91        47

    accuracy                           0.93       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.93      0.93      0.93       114



##### Manual HPO

In [19]:
n_estimators_list = [1,2,3,10,50,100]


for estim_list in n_estimators_list:
    model = RandomForestClassifier(n_estimators=estim_list)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    result = accuracy_score(y_test,y_pred)
    print("Estimator Value :",estim_list)
    print("Accuracy Score :",result)

Estimator Value : 1
Accuracy Score : 0.8771929824561403
Estimator Value : 2
Accuracy Score : 0.9122807017543859


Estimator Value : 3
Accuracy Score : 0.9122807017543859
Estimator Value : 10
Accuracy Score : 0.9473684210526315
Estimator Value : 50
Accuracy Score : 0.9385964912280702
Estimator Value : 100
Accuracy Score : 0.9385964912280702


In [20]:
leaf_sizes = [1,2,3,10,50,100]


for i in leaf_sizes:
    model = RandomForestClassifier(n_estimators=50, min_samples_leaf=i)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    result = accuracy_score(y_test,y_pred)
    print("Leaf Size Value :",i)
    print("Accuracy Score :",result)

Leaf Size Value : 1
Accuracy Score : 0.9298245614035088
Leaf Size Value : 2
Accuracy Score : 0.9385964912280702
Leaf Size Value : 3
Accuracy Score : 0.9385964912280702
Leaf Size Value : 10
Accuracy Score : 0.9385964912280702
Leaf Size Value : 50
Accuracy Score : 0.9298245614035088
Leaf Size Value : 100
Accuracy Score : 0.9122807017543859


##### Parameters from Manual HPO
- Leaf size = 1
- n_estimators = 10

##### Random Search CV

In [21]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000,num=10)]

In [22]:
n_estimators

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

In [23]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = [int(x) for x in np.linspace(start=100, stop=110,num=11)]
min_samples_split =[1,2,4,10,20,50,100]
min_samples_leaf =[2,3,4,5,8,10,20,50,100,200]
bootstrap =[True,False]

# Create the random grid

random_grid = {'n_estimators':n_estimators,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'bootstrap':bootstrap
            }


On each iteration the algorithm will choose a diffeent combination of the features. Altogether there are 15,400 combinations. However the benefit of random search is that we are not tryimng every combination, but selecting at random to sample a wide range of values.

In [24]:
# Use the random grid to search fro best hyper parameters
# First create base model to tune

rf = RandomForestClassifier()
# Random search parameters using 3 fold cross validation

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100 , cv=3, n_jobs=1)
# cv - cross validation - you have to run fror multiple folds - k fold cross validation

#fit the random search model
rf_random.fit(X_train,y_train)



45 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\Desktop\Interview Prep\AI Engineer\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP\Desktop\Interview Prep\AI Engineer\.venv\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\HP\Desktop\Interview Prep\AI Engineer\.venv\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\HP\Desktop\Interview Prep\AI Engineer\.venv\lib\site-pa

In [25]:
def evaluate(model,test_features,test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels,predictions)
    print("Model Preformance")
    print("Accuracy Score = {:0.2f}%".format(accuracy))
    return accuracy

In [26]:
base_accuracy = evaluate(model,X_test,y_test)

Model Preformance
Accuracy Score = 0.91%


In [27]:
best_random = rf_random.best_estimator_

In [28]:
best_random

In [29]:
random_Accuracy = evaluate(best_random,X_test,y_test)

Model Preformance
Accuracy Score = 0.94%


In [30]:
# Improvement is 0.02

##### Grid Search CV

In [31]:
from sklearn.model_selection import GridSearchCV

max_depth = [75,80,85,90,95,100]
max_features= [2,3,4,5]
min_samples_split =[8,10,12]
min_samples_leaf =[1,2,3]
bootstrap =[True,False]
n_estimators =[200,250,300,350,400,450,500]

# Create the random grid

param_grid = {'n_estimators':n_estimators,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf,
               'bootstrap':bootstrap,
               'max_features':max_features
            }

rf_gd = RandomForestClassifier()
#Initiate the Grid Search Model
grid_search = GridSearchCV(estimator=rf_gd,param_grid=param_grid,cv=3,n_jobs = -1)

In [32]:
# Fit the grid search to the data

grid_search.fit(X_train,y_train)
grid_search.best_params_

KeyboardInterrupt: 

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid,X_test,y_test)