In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('C:\MLCourse\Learning\Guvi_Final_Projects\cancer.csv')
df['diagnosis'] = df['diagnosis'].map({'B':0,'M':1})
df.head(2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [3]:
df.shape

(569, 32)

In [4]:
ss = StandardScaler()
y = df['diagnosis']
X = df.drop('diagnosis', axis=1)
X_ss = ss.fit_transform(X)
x_train_ss, x_test_ss, y_train_ss, y_test_ss = train_test_split(X_ss,y,test_size=0.2, random_state=42)

In [5]:
param_grid_0 = {
    'C' : [0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0],
'kernel':['linear', 'rbf', 'sigmoid'],
'gamma' : ['scale','auto'],
'coef0' : [1.0,1.5,2.0,2.5,3.0,3.5,4.0],
'tol' : [1e-3,2e-3,3e-3,4e-3,5e-3] 
}

CV_svc = GridSearchCV(estimator=SVC(), param_grid=param_grid_0, cv=5,n_jobs=-1,)
CV_svc.fit(x_train_ss, y_train_ss)
print('For Support Vector Classifier')
print(CV_svc.best_params_)
print(CV_svc.best_score_)

For Support Vector Classifier
{'C': 3.0, 'coef0': 1.0, 'gamma': 'auto', 'kernel': 'rbf', 'tol': 0.001}
0.9780219780219781


In [6]:
param_grid_1 = { 
    'n_estimators': [100,200,300,400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_1, cv= 5,n_jobs=-1)
CV_rfc.fit(x_train_ss, y_train_ss)
print('For Random Forest Classifier')
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

For Random Forest Classifier
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 400}
0.9670329670329669


In [7]:
param_grid_2 = {'n_estimators' :[50,75,100,125,150],
               'learning_rate' :[0.5,1.0,1.5,2.0,2.5,3.0],
                 'algorithm' : ['SAMME', 'SAMME.R'] 
                }
CV_ada = GridSearchCV(estimator=AdaBoostClassifier(),param_grid=param_grid_2, cv= 5,verbose=3,n_jobs=-1)
CV_ada.fit(x_train_ss, y_train_ss)
print('For AdaBoost Classifier')
print(CV_ada.best_params_)
print(CV_ada.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
For AdaBoost Classifier
{'algorithm': 'SAMME', 'learning_rate': 1.0, 'n_estimators': 125}
0.9780219780219781


In [8]:
param_grid_3 = { 
'n_estimators' : [10,20,30,40,50,75,100],
'max_samples' : [0.5,0.6,0.7,0.8,0.9,1.0],    
'max_features' : [0.5,0.6,0.7,0.8,0.9,1.0],
    }
CV_bag = GridSearchCV(estimator=BaggingClassifier(),param_grid=param_grid_3, cv= 5,verbose=3,n_jobs=-1)
CV_bag.fit(x_train_ss, y_train_ss)
print('For Bagging Classifier')
print(CV_bag.best_params_)
print(CV_bag.best_score_)    

Fitting 5 folds for each of 252 candidates, totalling 1260 fits
For Bagging Classifier
{'max_features': 0.8, 'max_samples': 0.7, 'n_estimators': 50}
0.9670329670329672


In [9]:
df_1 = df[df['diagnosis'] == 1]
df_0 = df[df['diagnosis'] == 0]
df_11 = df_1.sample(100)
df_00 = df_0.sample(100)
df_new = pd.concat([df_11,df_00], axis=0)
df_new.drop(columns='id', axis=1)


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
509,1,15.460,23.95,103.80,731.3,0.11830,0.18700,0.20300,0.08520,0.1807,...,17.11,36.33,117.70,909.4,0.1732,0.49670,0.5911,0.21630,0.3013,0.10670
87,1,19.020,24.59,122.00,1076.0,0.09029,0.12060,0.14680,0.08271,0.1953,...,24.56,30.41,152.90,1623.0,0.1249,0.32060,0.5755,0.19560,0.3956,0.09288
9,1,12.460,24.04,83.97,475.9,0.11860,0.23960,0.22730,0.08543,0.2030,...,15.09,40.68,97.65,711.4,0.1853,1.05800,1.1050,0.22100,0.4366,0.20750
297,1,11.760,18.14,75.00,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,...,13.36,23.39,85.10,553.6,0.1137,0.07974,0.0612,0.07160,0.1978,0.06915
393,1,21.610,22.28,144.40,1407.0,0.11670,0.20870,0.28100,0.15620,0.2162,...,26.23,28.74,172.00,2081.0,0.1502,0.57170,0.7053,0.24220,0.3828,0.10070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,0,12.990,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.02098,0.1652,...,13.72,16.91,87.38,576.0,0.1142,0.19750,0.1450,0.05850,0.2432,0.10090
48,0,12.050,14.63,78.04,449.3,0.10310,0.09092,0.06592,0.02749,0.1675,...,13.76,20.70,89.88,582.6,0.1494,0.21560,0.3050,0.06548,0.2747,0.08301
380,0,11.270,12.96,73.16,386.3,0.12370,0.11110,0.07900,0.05550,0.2018,...,12.84,20.53,84.93,476.1,0.1610,0.24290,0.2247,0.13180,0.3343,0.09215
453,0,14.530,13.98,93.86,644.2,0.10990,0.09242,0.06895,0.06495,0.1650,...,15.80,16.93,103.10,749.9,0.1347,0.14780,0.1373,0.10690,0.2606,0.07810


In [10]:
param_grid_4 = {'loss' : ['deviance', 'exponential'],
'learning_rate' : [0.5,1.0,1.5,2.0],
'n_estimators' : [50,100,150],
'criterion' :['friedman_mse', 'squared_error', 'mse', 'mae'],
 'min_samples_split' : [0.2,0.4,0.6,0.8],
'min_samples_leaf' : [3,4],
 'max_depth' : [3,5,7],
'min_impurity_decrease' : [0.05,0.1,0.15],
'max_features' :['auto', 'sqrt', 'log2'] 
}
CV_gb= GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=param_grid_4, cv= 5,verbose=3,n_jobs=-1)
CV_gb.fit(x_train_ss, y_train_ss)
print('For Gradient Boosting Classifier')
print(CV_gb.best_params_)
print(CV_gb.best_score_)    

Fitting 5 folds for each of 20736 candidates, totalling 103680 fits
For Gradient Boosting Classifier
{'criterion': 'friedman_mse', 'learning_rate': 0.5, 'loss': 'exponential', 'max_depth': 7, 'max_features': 'sqrt', 'min_impurity_decrease': 0.15, 'min_samples_leaf': 4, 'min_samples_split': 0.8, 'n_estimators': 50}
0.9780219780219781


In [11]:
param_grid_5 ={'n_estimators' : [10,20,30,40,50],
'max_depth' : [5,10,15],
'learning_rate' : [0.05,0.1,0.15,0.2],
'gamma' : [0.02,0.05,0.1,0.15,0.2],
'num_parallel_tree': [10,20,30,40,50]  
}
CV_xgb = GridSearchCV(estimator=xgb.XGBClassifier(),param_grid=param_grid_5, cv= 5,verbose=3,n_jobs=-1)
CV_xgb.fit(x_train_ss, y_train_ss)
print('For Extreme Gradient Boosting Classifier')
print(CV_xgb.best_params_)
print(CV_xgb.best_score_)            

Fitting 5 folds for each of 1500 candidates, totalling 7500 fits
For Extreme Gradient Boosting Classifier
{'gamma': 0.05, 'learning_rate': 0.15, 'max_depth': 10, 'n_estimators': 30, 'num_parallel_tree': 10}
0.9670329670329672
