# Q3 Using Scikit-Learn

In [1]:
import numpy as np
import pandas as pd
import time
import gc
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [2]:
# Change to your GA Tech ID
ga_id = 'ksims35'
# Requires a print() statement do not modify below print statement
print(ga_id)

ksims35


# Q3.1 Data Import and Cleansing Setup

In [3]:
# XXX
# TODO: Read in all the data. Replace the 'xxx' with the path to the data set. We've started this for you. 
# XXX

# -------------------------------
# ADD CODE HERE
data = pd.read_csv('pulsar_stars.csv')
# -------------------------------


# XXX
# TODO: Separate out the x_data and y_data. We've started this for you.
# XXX

# -------------------------------
# ADD CODE HERE
x_data = data.iloc[:, :-1]
y_data = data.iloc[:, -1]
# -------------------------------

In [4]:
# XXX
# TODO: Split 70% of the data into training and 30% into test sets. Call them x_train, x_test, y_train and y_test.
# Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' 
# set to 614.
# 

# -------------------------------
# ADD CODE HERE
# -------------------------------
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.30, shuffle=True, random_state=614)

# Q3.2 Linear Regression 

In [5]:
# XXX
# TODO: Create a LinearRegression classifier and train it.
# XXX

# -------------------------------
# ADD CODE HERE
# 
# -------------------------------
model = LinearRegression()
model.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
acc = accuracy_score(y_train.values, [1 if pred >= 0.5 else 0 for pred in model.predict(x_train)])
print(acc)

0.9720625798212005


In [7]:
# XXX
# TODO: Test its accuracy (on the testing set) using the accuracy_score method.
# Note: Round the output values greater than or equal to 0.5 to 1 and those less than 0.5 to 0. You can use any method that satisfies the requriements.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
acc = accuracy_score(y_test.values, [1 if pred >= 0.5 else 0 for pred in model.predict(x_test)])
print(acc)

0.9696461824953445


# Q3.3 Random Forest Classifier

In [8]:
# XXX
# TODO: Create a RandomForestClassifier and train it.
# Set 'random_state' to 614
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
clf = RandomForestClassifier(random_state=614)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=614,
                       verbose=0, warm_start=False)

In [9]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
acc = accuracy_score(y_train.values, clf.predict(x_train))
print(acc)

1.0


In [10]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
acc = accuracy_score(y_test.values, clf.predict(x_test))
print(acc)

0.9800744878957169


## Q3.3.1 Feature Importance

In [11]:
# XXX
# TODO: Determine the feature importance as evaluated by the Random Forest Classifier.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(clf.feature_importances_)

[0.2229389  0.04576524 0.31748998 0.19307467 0.07611149 0.04671938
 0.05356464 0.0443357 ]


In [12]:
# XXX
# TODO: Sort them in the descending order and print the feature numbers[0 to ...].
#       Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print([x[0] for x in sorted(zip(range(8), clf.feature_importances_), key=lambda x: x[1], reverse=True)])

[2, 0, 3, 4, 6, 5, 1, 7]


## Q3.3.2 Hyper-parameter Tuning

In [13]:
# XXX
# TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
# 'n_estimators': [4, 16, 256]
# 'max_depth': [2, 8, 16]
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
clf = RandomForestClassifier(random_state=614)
param_grid = {'n_estimators': [4,16,256],
              'max_depth': [2,8,16]}

grid = GridSearchCV(estimator=clf, param_grid=param_grid)
grid.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=614,
                              

In [14]:
# XXX
# TODO: Get the best params, using .best_params_
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(grid.best_params_)

{'max_depth': 16, 'n_estimators': 256}


In [15]:
# XXX
# TODO: Get the best score, using .best_score_.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(grid.best_score_)

0.9798049551336275


# Q3.4 Support Vector Machine

## Q3.4.1 Pre-process

In [16]:
# XXX
# TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
scaler = StandardScaler().fit(x_train)

x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

## Q3.4.2 Classification

In [17]:
# XXX
# TODO: Create a SVC classifier and train it.
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
clf = SVC()
clf.fit(x_scaled_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
# XXX
# TODO: Test its accuracy on the training set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(accuracy_score(y_train, clf.predict(x_scaled_train)))

0.9802043422733078


In [19]:
# XXX
# TODO: Test its accuracy on the test set using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(accuracy_score(y_test, clf.predict(x_scaled_test)))

0.9757914338919925


## Q3.4.3 Hyper-parameter Tuning

In [20]:
# XXX
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
# 'kernel':('linear', 'rbf') 
# 'C':[0.01, 0.1, 1.0]
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
clf = SVC()
param_grid = {'kernel':['linear', 'rbf'],
              'C': [0.01,0.1,1.0]}

grid = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, return_train_score=True)
grid.fit(x_scaled_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1.0], 'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [21]:
# XXX
# TODO: Get the best score, using .best_score_.
# Note: Set n_jobs=-1 and return_train_score=True
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(grid.best_score_)

0.9798849547513114


In [22]:
# XXX
# TODO: Calculate the training and test set accuracy values after hyperparameter tuning and standardization. 
# XXX

# -------------------------------
# ADD CODE HERE
# -------------------------------
train_pred = grid.predict(x_scaled_train)
test_pred = grid.predict(x_scaled_test)

In [23]:
# XXX
# TODO: Test its accuracy (on the training set) using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(accuracy_score(y_train, train_pred))

0.9797254150702427


In [24]:
# XXX
# TODO: Test its accuracy (on the test set) using the accuracy_score method.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(accuracy_score(y_test, test_pred))

0.9778398510242086


## Q3.4.4 Cross Validation Results 

In [25]:
# XXX
# TODO: Get the rank test score for all hyperparameter values that you obtained in Q3.4.3. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(grid.cv_results_['rank_test_score'])

[5 6 3 4 1 2]


In [26]:
# XXX
# TODO: Get the mean testing score for all of hyperparameter values that you obtained in Q3.4.3. The 
# GridSearchCV class holds a  ‘cv_results_’ dictionary that should help you report these metrics easily.
# XXX
# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(grid.cv_results_['mean_test_score'])

[0.97597358 0.9699073  0.97876735 0.97788907 0.97988495 0.97940588]


# Q3.5 PCA

In [27]:
# XXX
# TODO: Perform dimensionality reduction of the data using PCA.
#       Set parameters n_component to 8 and svd_solver to 'full'. Keep other parameters at their default value.
# XXX

# NOTE: Use the full x data set for this section 'x_data'

# -------------------------------
# ADD CODE HERE
# You should see an output here of PCA(copy=True....)
# -------------------------------
decomp = PCA(n_components=8, svd_solver='full')
decomp.fit(x_data)

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

In [28]:
# XXX
# TODO: Get percentage of variance explained by each of the selected components
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(decomp.explained_variance_)

[1.16350775e+04 1.04446764e+03 5.49743692e+02 8.22777890e+01
 3.25545877e+01 1.28041973e+01 5.21704598e-01 3.73967707e-02]


In [29]:
# XXX
# TODO: Get the singular values corresponding to each of the selected components.
# XXX

# -------------------------------
# ADD CODE HERE
# Requires a print() statement
# -------------------------------
print(decomp.singular_values_)

[14430.28004546  4323.5214088   3136.68022725  1213.47665361
   763.30168073   478.70316467    96.62788002    25.87063984]
