In [17]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import random
import sys
import pickle
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import accuracy_score
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from skorch import NeuralNetRegressor

from sklearn.model_selection import GridSearchCV

# Pick the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU,
# and echo the choice so the cell output records which device was selected.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)  

cuda:0


In [14]:
def split_X_y(data):
    """Separate a 2-D sample array into features and labels.

    Args:
        data: Array indexed as ``data[row, column]`` whose last column
            holds the label.

    Returns:
        A ``(features, labels)`` tuple: every column except the last, and
        the last column on its own.
    """
    features, labels = data[:, :-1], data[:, -1]
    return features, labels

def split_train_test(data, train_ratio):
    """Randomly split ``data`` into a training and a test partition.

    The rows are separated into features and labels with ``split_X_y`` and
    shuffled with one shared random permutation, so each feature row stays
    aligned with its label. The first ``len(X) * train_ratio`` shuffled rows
    form the training set and the remainder form the test set.

    Args:
        data: 2-D array whose last column is the label (see ``split_X_y``).
        train_ratio: Fraction of the rows to place in the training set.

    Returns:
        ``((X_train, y_train), (X_test, y_test))``.

    Warns:
        UserWarning: If ``len(X) * train_ratio`` is not a whole number. The
            training-set size is then truncated toward zero.
    """
    import warnings

    X, y = split_X_y(data)
    num_samples = len(X)

    # Binary floating point can put the product a hair away from the integer
    # it mathematically equals (e.g. 100 * 0.57), so testing `% 1 == 0` and
    # then truncating with int() may both warn spuriously and drop a sample.
    # Snap to the nearest integer when the product is within tolerance.
    exact_count = num_samples * train_ratio
    nearest_count = round(exact_count)
    if abs(exact_count - nearest_count) < 1e-9:
        num_training = int(nearest_count)
    else:
        num_training = int(exact_count)
        warnings.warn(
            'len(X) = %d with train ratio = %s gives %s training samples; '
            'truncating to %d'
            % (num_samples, train_ratio, exact_count, num_training)
        )

    perm = np.random.permutation(num_samples)
    X_shuffled = X[perm]
    y_shuffled = y[perm]

    X_train = X_shuffled[:num_training]
    y_train = y_shuffled[:num_training]
    X_test = X_shuffled[num_training:]
    y_test = y_shuffled[num_training:]

    return (X_train, y_train), (X_test, y_test)
    
def draw_heatmap_2d(training_errors, gamma_list, C_list, title, xlabel, ylabel, large=True):
    """Render an annotated seaborn heatmap of a 2-D error matrix and show it.

    Args:
        training_errors: 2-D matrix of values to plot; each cell is
            annotated with three decimals.
        gamma_list: Tick labels for the x axis (one per column).
        C_list: Tick labels for the y axis (one per row).
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        large: When true, a new 10x6 inch figure is created first.
    """
    if large:
        plt.figure(figsize=(10, 6))

    heatmap_axes = sns.heatmap(
        training_errors,
        annot=True,
        fmt='.3f',
        xticklabels=gamma_list,
        yticklabels=C_list,
    )
    heatmap_axes.collections[0].colorbar.set_label("error")
    heatmap_axes.set(xlabel=xlabel, ylabel=ylabel)

    # Push both y limits outward by half a cell.
    # NOTE(review): presumably a workaround for a matplotlib release that
    # clipped the first and last heatmap rows -- confirm it is still needed.
    lower, upper = heatmap_axes.get_ylim()
    heatmap_axes.set_ylim(lower + 0.5, upper - 0.5)

    plt.title(title)
    plt.show()

In [15]:
class Net(nn.Module):
    """Small convolutional network: two conv/pool stages and two linear layers.

    Both convolutions use 3x3 kernels with stride 1 and padding 1, so they
    keep the spatial size; each is followed by a 2x2 max-pool that halves it.
    The first linear layer expects 20*8*8 features per sample, which matches
    3-channel 32x32 inputs. The network emits 10 values per sample.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so parameter names
        # and seeded initialisation are unchanged.
        self.conv1 = nn.Conv2d(3, 10, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(20 * 8 * 8, 100)
        self.fc2 = nn.Linear(100, 10)

    def forward(self, x):
        """Run the forward pass and return a tensor with 10 columns."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Collapse channel and spatial dimensions into one feature vector
        # per row; -1 lets the row count be inferred.
        flattened = features.view(-1, 20 * 8 * 8)
        hidden = F.relu(self.fc1(flattened))
        return self.fc2(hidden)

In [16]:
def cnn(X_train, Y_train, X_test, Y_test, draw=False):
    """Grid-search a skorch-wrapped ``Net`` and score the best model.

    A ``NeuralNetRegressor`` around ``Net`` is tuned over learning rate and
    number of epochs with 10-fold cross-validation scored by R^2. The best
    configuration is refit on the full training set and used to predict on
    both the training and the test data.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_test: Test features.
        Y_test: Test targets.
        draw: When true, plot a heatmap of the cross-validation error
            (``1 - mean R^2``) for every (lr, max_epochs) combination.

    Returns:
        ``(best_train_accuracy, best_test_accuracy)``: the fraction of
        predictions that exactly equal the target on each split.
    """
    net = NeuralNetRegressor(Net, max_epochs=100, lr=0.001, verbose=1)

    lr_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3]
    max_epochs_list = list(range(500, 5500, 500))
    params = {
        'lr': lr_list,
        'max_epochs': max_epochs_list,
    }

    # refit must be enabled: best_estimator_ (used below) only exists when
    # the search refits the best configuration on the whole training set.
    grid_search = GridSearchCV(net, params, refit=True, scoring='r2', verbose=1, cv=10)

    grid_search.fit(X_train, Y_train)

    # Place every candidate's score by its own parameter values so the
    # matrix always matches the grid size and does not depend on the order
    # in which the search enumerated the candidates.
    cross_val_errors = np.empty((len(lr_list), len(max_epochs_list)))
    results = grid_search.cv_results_
    for candidate, mean_score in zip(results['params'], results['mean_test_score']):
        row = lr_list.index(candidate['lr'])
        col = max_epochs_list.index(candidate['max_epochs'])
        cross_val_errors[row, col] = 1 - mean_score

    if draw:
        draw_heatmap_2d(cross_val_errors, max_epochs_list, lr_list,
                        title='cross-validation error w.r.t max_epochs and lr',
                        ylabel='lr', xlabel='max_epochs', large=True)

    best_params = grid_search.best_params_
    print("Best params: {}".format(best_params))

    best_model = grid_search.best_estimator_
    # NOTE(review): exact equality between targets and a regressor's
    # floating-point predictions will rarely hold, and Net emits 10 values
    # per sample -- confirm this is the intended metric and that the
    # prediction shape lines up with the targets.
    best_test_accuracy = (Y_test == best_model.predict(X_test)).sum() / len(Y_test)
    best_train_accuracy = (Y_train == best_model.predict(X_train)).sum() / len(Y_train)

    return best_train_accuracy, best_test_accuracy

### Load preprocessed data

In [None]:
data_dir = 'processed_data/'

# Choose: encoded, normalized, pca (Note: the encoded and normalized dataset files were too big for gradescope)
option = 'normalized'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that were produced by this project's own preprocessing.
# The `with` blocks make sure each file handle is closed after loading.
with open(data_dir + 'adult_data_%s.p' % option, 'rb') as adult_file:
    adult_data = pickle.load(adult_file)

with open(data_dir + 'grade_data_%s.p' % option, 'rb') as grade_file:
    grade_data = pickle.load(grade_file)

In [None]:
# Experiment configuration shared with the training cell.
dataset_names = ['Adult', 'Grades']
# NOTE: this list rebinds the name `datasets`, hiding the `sklearn.datasets`
# module imported at the top of the notebook.
datasets = [adult_data, grade_data]
models = [cnn]
model_names = ['CNN']
partition = 0.8
num_trials = 3
best_loss = sys.maxsize

for i, dataset in enumerate(datasets):
    # Shuffle the rows in place. np.random.shuffle is used because
    # random.shuffle swaps elements through views on a 2-D NumPy array and
    # silently duplicates some rows while dropping others.
    # Assumes each dataset is a 2-D NumPy array -- TODO confirm.
    np.random.shuffle(dataset)
    # Cap every dataset at the size of the Adult dataset.
    if len(dataset) > len(adult_data):
        dataset = dataset[:len(adult_data)]
        datasets[i] = dataset

    print('DATASET: ' + dataset_names[i])
    print(dataset.shape)
    print('% TRAINING: ' + str(partition))
    # Dry-run split: the result is not used in this cell, but the call
    # reports whether `partition` divides this dataset into whole samples.
    data_train, data_test = split_train_test(dataset, partition)

### Train

In [None]:
# For every dataset: run `num_trials` random splits, evaluate each model on
# each split, then report the per-model accuracy averaged over the trials.
for i, dataset in enumerate(datasets):
    print('DATASET: ' + dataset_names[i])
    print('% TRAINING: ' + str(partition))
    print('-------------------------------------')

    # One row per model; column 0 sums train accuracy, column 1 test accuracy.
    model_acc_sums = np.zeros((len(models), 2))
    for k in range(num_trials):
        print('TRIAL: %d' % (k+1))
        print('---------')
        # Draw a fresh random split for this trial.
        data_train, data_test = split_train_test(dataset, partition)
        X_train, y_train = data_train
        X_test, y_test = data_test

        for j, model in enumerate(models):
            print('MODEL: %s' % model_names[j])

            train_acc, test_acc = model(X_train[:], y_train[:], X_test[:], y_test[:], draw=True)

            print('TRIAL TRAIN ACCURACY FOR DATASET FOR MODEL: ' + str(train_acc))
            print('TRIAL TEST ACCURACY FOR DATASET FOR MODEL: ' + str(test_acc))
            print()

            model_acc_sums[j, 0] += train_acc
            model_acc_sums[j, 1] += test_acc

    print('-----RESULTS-----')
    print('DATASET: ' + dataset_names[i])
    print('% TRAINING: ' + str(partition))
    print()
    for j, model in enumerate(model_names):
        print('MODEL: %s' % model_names[j])
        train_total = model_acc_sums[j][0]
        test_total = model_acc_sums[j][1]
        avg_train_acc = round(train_total / num_trials, 2)
        avg_test_acc = round(test_total / num_trials, 2)

        print('AVG TRAIN ACCURACY FOR %s WITH %s DATASET WITH %s TRAINING SPLIT: %s' % (model_names[j], dataset_names[i], partition, str(avg_train_acc)))
        print('AVG TEST ACCURACY FOR %s WITH %s DATASET WITH %s TRAINING SPLIT: %s' % (model_names[j], dataset_names[i], partition, str(avg_test_acc)))
        print()
    print()