# Building a Machine Learning Model to Predict Whether the Cancer Is Benign or Malignant on the Breast Cancer Wisconsin Data Set: Part 4

#### Hyperparameter Optimization: Grid Search with Cross-Validation for Tuning Parameters

#### Logistic Regression : Hyperparameter Tuning and Visualization

In [None]:
# Logistic Regression: base estimator and the hyperparameter grid to search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

# Candidate values for each tuned parameter
# (2 penalties * 8 C values * 2 solvers = 32 combinations per CV fold).
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['liblinear', 'saga']
# Class weights are currently left out of the search; candidates kept for reference:
# class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]

param_grid = {
    'penalty': penalty,
    'C': C,
    # 'class_weight': class_weight,
    'solver': solver,
}

In [None]:
# Exhaustive search over `param_grid` for the Logistic Regression estimator,
# scoring every candidate by ROC AUC on the folds produced by `skf`.
# `skf`, `X_train` and `y_train` are not defined in this section; they are
# expected to already exist in the kernel.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf.split(X_train, y_train),
    n_jobs=4,
    verbose=3,
)

# Optional wall-clock timing, currently disabled
# (`timer` is not defined in this section):
#   start_time = timer(None)
model.fit(X_train, y_train)
#   timer(start_time)

# `best_mdl` is defined outside this section; presumably it returns the best
# estimator found by `model` -- TODO confirm.
best_model = best_mdl()

#### XGBoost : Hyperparameter Tuning and Visualization

In [None]:
# XGBoost: base estimator and the hyperparameter grid to search
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

clf = XGBClassifier()

# One list of candidate values per tuned XGBClassifier parameter
# (3 * 5 * 3 * 3 * 3 = 405 combinations per CV fold).
param_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [None]:
# Exhaustive search over `param_grid` for the XGBoost estimator,
# scoring every candidate by ROC AUC on the folds produced by `skf`.
# `skf`, `X_train` and `y_train` are not defined in this section; they are
# expected to already exist in the kernel.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf.split(X_train, y_train),
    n_jobs=4,
    verbose=3,
)

# Optional wall-clock timing, currently disabled
# (`timer` is not defined in this section):
#   start_time = timer(None)
model.fit(X_train, y_train)
#   timer(start_time)

# `best_mdl` is defined outside this section; presumably it returns the best
# estimator found by `model` -- TODO confirm.
best_model = best_mdl()

#### Gradient Boosting Classifier : Hyperparameter Tuning and Visualization

In [None]:
# Gradient Boosting: base estimator and the hyperparameter grid to search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Tree-shape and sampling settings are held fixed; random_state pins the seed.
clf = GradientBoostingClassifier(
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)

# Only the learning rate and the number of boosting stages are tuned
# (6 * 8 = 48 combinations per CV fold).
param_grid = {
    'learning_rate': [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

In [None]:
# Exhaustive search over `param_grid` for the Gradient Boosting estimator,
# scoring every candidate by ROC AUC on the folds produced by `skf`.
# `skf`, `X_train` and `y_train` are not defined in this section; they are
# expected to already exist in the kernel.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf.split(X_train, y_train),
    n_jobs=4,
    verbose=3,
)

# Optional wall-clock timing, currently disabled
# (`timer` is not defined in this section):
#   start_time = timer(None)
model.fit(X_train, y_train)
#   timer(start_time)

# `best_mdl` is defined outside this section; presumably it returns the best
# estimator found by `model` -- TODO confirm.
best_model = best_mdl()

#### Random Forest : Hyperparameter Tuning and Visualization

In [None]:
# Random Forest: base estimator and the hyperparameter grid to search
from sklearn.model_selection import GridSearchCV
# RandomForestClassifier is provided by sklearn.ensemble. The previous import
# (`from xgboost import GradientBoostingClassifier`) raised ImportError and
# left RandomForestClassifier undefined.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# A parameter grid for RandomForestClassifier
# (2 * 11 * 2 * 3 * 3 * 10 = 3960 combinations per CV fold -- expensive).
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 'auto' was an alias of 'sqrt' for classifiers (so it duplicated work) and
    # is rejected by newer scikit-learn releases; 'log2' is searched instead.
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [None]:
# Exhaustive search over `param_grid` for the Random Forest estimator,
# scoring every candidate by ROC AUC on the folds produced by `skf`.
# `skf`, `X_train` and `y_train` are not defined in this section; they are
# expected to already exist in the kernel.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf.split(X_train, y_train),
    n_jobs=4,
    verbose=3,
)

# Optional wall-clock timing, currently disabled
# (`timer` is not defined in this section):
#   start_time = timer(None)
model.fit(X_train, y_train)
#   timer(start_time)

# `best_mdl` is defined outside this section; presumably it returns the best
# estimator found by `model` -- TODO confirm.
best_model = best_mdl()

#### AUC, confusion matrix

In [None]:
# Evaluate `best_model` on the held-out test set.
# `best_model`, `X_test`, `y_test` and the sklearn metric functions other than
# average_precision_score are expected to be defined/imported earlier in the notebook.
y_predicted_test  = best_model.predict(X_test)          # hard class labels
y_probabilities_test = best_model.predict_proba(X_test) # one probability column per class
# Second probability column; presumably the positive (malignant) class -- TODO confirm
y_probabilities_success = y_probabilities_test[:, 1]

from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_probabilities_success)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_probabilities_success)

# Label-based metrics use the hard predictions.
mse        = mean_squared_error(y_test, y_predicted_test)
accuracy   = accuracy_score(y_test, y_predicted_test)
precision  = precision_score(y_test, y_predicted_test, average='binary')
recall     = recall_score(y_test, y_predicted_test, average='binary')
F1         = f1_score(y_test, y_predicted_test)
r2         = r2_score(y_test, y_predicted_test)
cm         = confusion_matrix(y_test, y_predicted_test)

# Score-based metrics must be given probabilities, not hard 0/1 labels:
# - log loss on hard labels degenerates into a clipped penalty per misclassification;
# - ROC AUC on hard labels evaluates a single threshold and no longer matches
#   the ROC curve computed above from `y_probabilities_success`.
logloss    = log_loss(y_test, y_probabilities_success)
auc        = roc_auc_score(y_test, y_probabilities_success)

In [None]:
# Refresh `best_model` and render the evaluation reports.
# All helpers called here are defined outside this section.
best_model = best_mdl()

# Argument-free reports, run in their original order
for report in (Print_Model_Metrics, Plot_ROC_Precision_Recall, Plot_Confusion_Matrix):
    report()

Plot_Predictor_Importance(True)
#results_df = pd.DataFrame(data={'Observed':y_test, 'Predicted':y_predicted_test[:,1]})
#results_df.to_csv('grid-search--outcome.csv', index=False)