In [1]:
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the digits dataset that ships with scikit-learn and list the
# fields the returned container exposes.
raw_data = datasets.load_digits()
print(raw_data.keys())
# Feature matrix and label vector used by the k-NN cells below.
X, y = raw_data.data, raw_data.target

In [None]:
# k-nearest-neighbours classifier configured with n_neighbors=7.
knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
# Hold out 40% of the samples for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the classifier on the training portion only.
knn.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test samples.
y_pred = knn.predict(X_test)

In [None]:
# Accuracy derived by hand from the confusion matrix: correctly classified
# samples lie on the diagonal, so accuracy = trace / total number of samples.
cm = confusion_matrix(y_test, y_pred)
np.trace(cm) / cm.sum()

In [None]:
# Text summary of per-class metrics for the test-set predictions.
print(classification_report(y_test,y_pred))

In [None]:
# Same accuracy as the manual confusion-matrix calculation, via scikit-learn.
metrics.accuracy_score(y_test,y_pred)  # fraction of test samples classified correctly

# Logistic regression and the Receiver operating characteristic (ROC)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve , roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# ROC analysis is used for binary classification, so switch to the
# breast-cancer dataset for this section.
raw_data = datasets.load_breast_cancer()
X, y = raw_data.data, raw_data.target

In [None]:
# Stratified 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42, stratify=y)
# The default solver stops after max_iter=100 iterations, which is not enough
# on these unscaled features: the fit ends with a ConvergenceWarning and
# non-converged coefficients. Raise the limit so the ROC/AUC results below
# come from a converged model.
reg = LogisticRegression(max_iter=10000)

In [None]:
# Fit the logistic-regression model on the training portion only.
reg.fit(X_train,y_train)

In [None]:
# Keep only column 1 of predict_proba: the predicted probability of the
# second class in reg.classes_ (presumably label 1 — verify against reg.classes_).
y_pred_proba = reg.predict_proba(X_test)[:,1]

In [None]:
# False-positive rates, true-positive rates and the decision thresholds that
# produce them, computed from the test labels and predicted probabilities.
fpr , tpr , thresholds = roc_curve(y_test,y_pred_proba)

In [None]:
# Dashed black diagonal from (0, 0) to (1, 1).
# NOTE(review): this call lives in a different cell from the ROC curve plot —
# confirm both end up on the same figure with the backend in use.
plt.plot([0,1],[0,1],'k--')

In [None]:
# Draw the ROC curve and the chance-level diagonal on one explicit figure.
# With the notebook inline backend every cell renders its own figure, so a
# diagonal plotted in another cell would not appear next to the curve.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--', label='Chance level')
ax.plot(fpr, tpr, label='Logistic regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Logistic Regression ROC Curve')
# Without a legend the label= arguments above are never displayed.
ax.legend()
plt.show()

In [None]:
# Area under the ROC curve for the test-set probabilities.
roc_auc_score(y_test,y_pred_proba) # area under the curve using sklearn

In [None]:
# AUC can also be estimated with cross-validation: cv=5 folds over the full
# X, y, scored with 'roc_auc'.
cross_val_score(reg,X,y,cv=5,scoring='roc_auc')

# Hyperparameter Tuning with GridSearchCV

In [None]:
# using GridSearch
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbourhood sizes for the k-NN search: k = 1, 2, ..., 9.
param_grid = dict(n_neighbors=np.arange(1, 10))
param_grid

In [None]:
# Grid search over param_grid for the knn estimator, using cv=5 folds.
knn_cv = GridSearchCV(knn,param_grid,cv=5)

In [None]:
# Run the grid search on the current training split.
# NOTE(review): X_train / y_train were last assigned from the breast-cancer
# split, not from the digits data knn was first fitted on — confirm intended.
knn_cv.fit(X_train,y_train)

In [None]:
# Parameter setting selected by the grid search.
knn_cv.best_params_

In [None]:
# Cross-validated score obtained with the selected parameter setting.
knn_cv.best_score_

In [None]:
# Tune the regularisation strength C of a logistic-regression classifier
# with an exhaustive, 5-fold cross-validated grid search.

# Import necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Setup the hyperparameter grid: 15 values of C, log-spaced from 1e-5 to 1e8
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
# The default limit of 100 solver iterations is too low for weakly
# regularised fits (large C): the solver stops early with a
# ConvergenceWarning, so candidates would be compared on non-converged
# models. Raise the limit.
logreg = LogisticRegression(max_iter=10000)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the data (the whole X, y; no hold-out set is used in this cell)
logreg_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))


# Hyperparameter tuning with RandomizedSearchCV

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Data for the decision-tree *classifier* search below.
# scikit-learn's load_diabetes() is a regression dataset with a continuous
# target, so a classifier fitted on it would treat every distinct target
# value as its own class. Use a dataset with real class labels instead.
raw_data = datasets.load_breast_cancer()
X = raw_data.data
y = raw_data.target

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the target array.
y.shape

In [None]:
# Hold out 30% of the samples for testing, with a fixed seed (no stratification).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:

# Randomised hyperparameter search for a decision-tree classifier.
# NOTE(review): assumes y_train holds class labels — confirm against the
# dataset loaded for this section.

# Setup the parameters and distributions to sample from: param_dist
# randint(1, 9) draws integers from 1 to 8 inclusive.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
# Seeded because max_features < n_features makes the tree itself random.
tree = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV object: tree_cv
# Seeded so the sampled candidates, and therefore the reported best
# parameters, are the same on every run.
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

# Fit it to the data
tree_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


# Hold-out set reasoning
1. How well can the model perform on never-before-seen data?
2. Using all the data for cross-validation is not ideal, because no data is left to give an unbiased estimate of performance.
3. Split the data into a training set and a hold-out set at the beginning.
4. Perform grid-search cross-validation on the training set.
5. Choose the best hyperparameters and evaluate them on the hold-out set.


# Hold-out set in practice I: Classification

In [None]:
# Disabled exploratory check: one distribution plot per feature column of X.
# Author's observation when it was last run: all features looked normally
# distributed.
#for i in range(len(raw_data.feature_names)):
#    sns.distplot(X[:,i])
#    plt.show()

In [None]:
# Hold-out evaluation of a tuned logistic-regression classifier.
#
# This cell is self-contained: it loads a binary classification dataset,
# sets aside a hold-out set, tunes C and the penalty type on the training
# part only, and finally scores the tuned model on the hold-out part.
# (scikit-learn's load_diabetes() has a continuous regression target, so it
# is not suitable for a classifier.)
clf_data = datasets.load_breast_cancer()
X_clf, y_clf = clf_data.data, clf_data.target

# Create the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg
# The default lbfgs solver does not support the 'l1' penalty, so every 'l1'
# candidate would fail to fit; liblinear supports both 'l1' and 'l2'.
logreg = LogisticRegression(solver='liblinear')

# Create train and hold-out sets (stratified, fixed seed)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(
    X_clf, y_clf, test_size=.4, random_state=42, stratify=y_clf)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the training data only, so the hold-out set stays unseen
logreg_cv.fit(X_clf_train, y_clf_train)

# Print the optimal parameters and best cross-validated score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

# Evaluate the refitted best model on the hold-out set
print("Hold-out set accuracy: {}".format(logreg_cv.score(X_clf_test, y_clf_test)))


# Hold-out set in practice II: Regression

Remember lasso and ridge regression from the previous chapter? Lasso used the L1 penalty to regularize, while ridge used the L2 penalty. There is another type of regularized regression known as the elastic net. In elastic net regularization, the penalty term is a linear combination of the L1 and L2 penalties:

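$$\text{elastic net penalty} = a \cdot L_1 + b \cdot L_2$$

In scikit-learn's `ElasticNet` the mix between the two penalties is controlled by `l1_ratio`: `l1_ratio = 1` gives a pure L1 (lasso) penalty and `l1_ratio = 0` a pure L2 (ridge) penalty.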

In [14]:
# Tune the l1_ratio of an ElasticNet regressor with 5-fold grid search on the
# training set, then report R^2 and MSE on the hold-out set.

# Import necessary modules
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import warnings

# Create train and test sets (60/40 split, fixed seed)
raw_data = datasets.load_diabetes()
X = raw_data.data
y = raw_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# Create the hyperparameter grid: 30 evenly spaced l1_ratio values in [0, 1]
l1_space = np.linspace(0, 1, 30)
param_grid = {'l1_ratio': l1_space}

# Instantiate the ElasticNet regressor: elastic_net
elastic_net = ElasticNet()

# Setup the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)

# Fit it to the training data
# Warnings are silenced only for the duration of the search instead of
# through a global filter, which would also hide warnings from every cell
# run afterwards. NOTE(review): presumably the noisy candidates are those
# with l1_ratio at or near 0 — confirm.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    gm_cv.fit(X_train, y_train)

# Predict on the test set and compute metrics
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))


Tuned ElasticNet l1 ratio: {'l1_ratio': 1.0}
Tuned ElasticNet R squared: 0.3421656776375358
Tuned ElasticNet MSE: 3848.4618529416193
