Review Homework Code, presented in steps 1-5.


In [1]:
import numpy as np

Part 1: Write code to load a dataset and split it into X_train, y_train, X_test, y_test.

In [2]:
# Import MNIST dataset
from sklearn.datasets import fetch_openml
print("Importing dataset")
mnist = fetch_openml('mnist_784', version=1)
print("Finished importing dataset")

Importing dataset
Finished importing dataset


In [3]:
# Assigning data and labels
X, y = mnist["data"], mnist["target"]
# Already split and shuffled, just assign
print("Splitting into training and test set (NOTE: Only taking first 30k for training, 5k for testing aka half")
X_train, X_test, y_train, y_test = X[:30000], X[30000:35000], y[:30000], y[30000:35000]
print("Finished importing dataset")

Splitting into training and test set (NOTE: Only taking first 30k for training, 5k for testing akak half
Finished importing dataset


Part 2: Write a pipeline to preprocess your features. Apply it to your train and test set.

In [4]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("Creating pipeline")
pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("linear_svc", LinearSVC(dual=False, multi_class="ovr", random_state=42))
])
print("Finished creating pipeline")
print("Training pipeline on training set")
pipeline.fit(X_train, y_train)
print("Finished training")
print("Testing on test set")
pipeline_score = pipeline.score(X_test, y_test)
print("Finished testing on test set")
print("Score: ", pipeline_score)

Creating pipeline
Finished creating pipeline
Training pipeline on training set
Finished training
Testing on test set
Finished testing on test set
Score:  0.8958


Part 3: Write code to find good hyperparameters for a given model.

In [5]:
# Get keys to find which parameters to pass the classifier in the pipeline
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'linear_svc', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'linear_svc__C', 'linear_svc__class_weight', 'linear_svc__dual', 'linear_svc__fit_intercept', 'linear_svc__intercept_scaling', 'linear_svc__loss', 'linear_svc__max_iter', 'linear_svc__multi_class', 'linear_svc__penalty', 'linear_svc__random_state', 'linear_svc__tol', 'linear_svc__verbose'])

In [6]:
from sklearn.model_selection import RandomizedSearchCV
# Create parameters to check
param_rnd = [
             {'linear_svc__C': [.01, .1, 1, 10, 100]},
             {'linear_svc__tol': [0.01, 0.025, 0.1, 0.25, 1]}
]

# Search through param_rnd with cross-validation 
rnd_clf = RandomizedSearchCV(pipeline, param_rnd, n_iter=10, n_jobs=10, random_state=42)
print("Training rnd_clf")
rnd_clf.fit(X_train, y_train)
print("Finished training rnd_clf")

Training rnd_clf
Finished training rnd_clf


Part 4: Write code to evaluate your model.

In [7]:
# Print best hyperparameters and score
print("Best hyperparameters: ", rnd_clf.best_params_)
print("Best score: ", rnd_clf.best_score_)

Best hyperparameters: 
{'linear_svc__C': 0.01}
Best score: 
0.9017333333333333


(Run locally)<br>
Best hyperparameters:<br>
{'linear_svc__C': 0.01}<br>
Best score:<br>
0.9017333333333333<br><br>
NOTE: Clearly the tolerance wasn't set properly, so I need to reassign the pipeline so as not to have the svc have the tol hyperparameter. However with more restricted tolerances running time will likely increase, so hold steady until I get new results.

In [8]:
# Test and show accuracy. Need to change how model is evaluated. Will use F1 score probably.
print("Testing rnd_clf")
rnd_clf_score = rnd_clf.score(X_test, y_test)
print("Finished testing rnd_clf")
print("Score: ", rnd_clf_score)

Testing rnd_clf
Finished testing rnd_clf
Score:  0.8948


(Run locally)<br>
Testing rnd_clf<br>
Finished testing rnd_clf<br>
Score:  0.8948

Part 5: Write code to create an instance of each of the models we covered, find good hyperparameters using a subset of your data, train it using cross-validation and find its performance, and evaluate it on your test set.<br><br>
Working on it.

In [9]:
# Taking smaller split of data for speedrunning because #computerlivesmatter
print("Splitting into training and test set (NOTE: Only taking first 9k for training, 1k for testing")
X_train, X_test, y_train, y_test = X[:9000], X[9000:10000], y[:9000], y[9000:10000]

Splitting into training and test set (NOTE: Only taking first 9k for training, 1k for testing


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Classifier instantiation, as well as list for looping purposes. 
# Likely able to loop through the classifiers for grid search,
# but unsure how to loop through param_dist or how to pass those on to a pipeline.
# May simply create an individual param_dist variable for each model as they require
# creating those distributions manually anyway.

log_clf = LogisticRegression()
svc_clf = SVC()
linear_svc_clf = LinearSVC()
sgd_clf = SGDClassifier()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
knn_clf = KNeighborsClassifier()

# classifiers list
clf_list = [log_clf, svc_clf, linear_svc_clf, sgd_clf, dt_clf, rf_clf, knn_clf]

In [38]:
pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("classifier", "passthrough")
])
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'classifier', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std'])

In [35]:
# Individual param_dist variable for each model
# NEEDS DOING (possibly last thing, then note cleanup and print statement removal or on/off switch)
log_param_dist = [
             {'classifier': clf_list[0]}
]
svc_param_dist = [
             {'classifier': clf_list[1]}
]
linear_svc_param_dist = [
             {'classifier': clf_list[2]}
]
sgd_param_dist = [
             {'classifier': clf_list[3]}
]
dt_param_dist = [
             {'classifier': clf_list[4]}
]
rf_param_dist = [
             {'classifier': clf_list[5]}
]
knn_param_dist = [
             {'classifier': clf_list[6]}
]

# hyperparemeter distributions list
param_dist_list = [log_param_dist, svc_param_dist, linear_svc_param_dist, 
                   sgd_param_dist, dt_param_dist, rf_param_dist, knn_param_dist]

In [37]:
from sklearn.model_selection import RandomizedSearchCV
# Loop through each classifier, train, cross-validate, and test each.
for i in range(7):
    # Get classifier and parameters from lists
    clf = clf_list[i]
    param_dist = param_dist_list[i]
    
    print("Processing ", clf, ": ")
    
    # Perform search and cross-validation
    rnd_search = RandomizedSearchCV(pipeline, param_dist, n_iter=5, n_jobs=-1, random_state=42)
    print("Training ", clf, ": ")
    rnd_search.fit(X_train, y_train)
    print("Finished training ", clf)
    
    # Print best hyperparameters and score
    print("Best hyperparameters: ", rnd_search.best_params_)
    print("Best score: ", rnd_search.best_score_)
    
    # Test and show accuracy. Need to change how model is evaluated. Will use F1 score probably.
    print("Testing rnd_search")
    grid_search_score = rnd_search.score(X_test, y_test)
    print("Finished testing rnd_search")
    print("Score: ", rnd_search_score) 

Processing  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) : 
Training  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) : 


TypeError: ignored