# Tutorial 3 - Fundamental ML Algorithms Part I

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression as LR
from sklearn import tree
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# import graphviz

### 1. Linear Regression

We wish to predict the energy output for a given date, given information on the current temperature and pressure readings.

In [None]:
# Load dataset for linear regression, and inspect it

### FILL IN ###
df_energy.head(5)

In [None]:
# Remove the date, rv1, and rv2 columns, one at a time.
# axis=1 means we are dropping a column (not a row); inplace=True modifies
# df_energy directly instead of returning a modified copy.
for unused_col in ('date', 'rv1', 'rv2'):
    df_energy.drop(unused_col, axis=1, inplace=True)

# Inspect modified dataset
### FILL IN ###

In [None]:
# Sum appliances and lights for total energy

### FILL IN ###
df_energy.head(5)

In [None]:
# Appliances and lights are redundant features at this point, so drop both
# columns from df_energy in place.

for redundant_col in ('Appliances', 'lights'):
    df_energy.drop(redundant_col, axis=1, inplace=True)
df_energy.head(5)

In [None]:
# Split the dataset into features and labels for training, testing

### FILL IN ###
df_energy_X.head(5)

In [None]:
df_energy_y.head(5)

In [None]:
def PF(X, degree):
    """phi(X) polynomial expansion

    Build the element-wise polynomial expansion of X (no cross terms) and
    append a trailing bias column of ones.

    Parameters
    ----------
    X : array-like
        Numeric data of shape (n_samples, n_features), e.g. a DataFrame.
        A 1-D input is treated as a single feature column.
    degree : int
        Highest power to include. For degree below 2 only X itself and the
        bias column are returned.

    Returns
    -------
    numpy.ndarray
        Float array whose columns are [X, X**2, ..., X**degree, 1].
    """

    # Work in floating point: integer inputs would otherwise be raised to
    # high powers in integer arithmetic and silently overflow.
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Collect every power first and stack once, rather than re-copying the
    # growing array on each iteration.
    powers = [X] + [np.power(X, p) for p in range(2, degree + 1)]
    bias = np.ones((X.shape[0], 1))
    return np.hstack(powers + [bias])

In [None]:
# Do 60-20-20 train_test_split for training set, validation set, and test set

def split(X, y, random_state=42):
    """Split features and labels into train, validation, and test sets.

    Two successive calls to train_test_split produce a 60-20-20 split.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of X.
    random_state : int, optional
        Seed passed to both train_test_split calls; the default of 42 keeps
        the split identical to the previously hard-coded one.

    Returns
    -------
    tuple
        (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    # Hold out 40% of the samples, leaving 60% for training
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=random_state)

    # Halve the held-out 40% into validation and test sets (20% each)
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Create list to store the validation R^2 of each polynomial expansion
# (index 0 holds degree 1, index 1 holds degree 2, and so on)
validation_acc = []

# For different polynomial bases, fit phi(x) into linear regression model, then compute R^2 = 1 - Rss/Rtot

degree = 10
for i in range(1, degree + 1):
    X_poly = PF(df_energy_X, i)
    X_train, X_val, X_test, y_train, y_val, y_test = split(X_poly, df_energy_y)

    # Initialize linear reg model and fit data
    # (the code below expects the fitted model to be named LR_model)

    ### FILL IN ###

    # Score each set once and reuse the value for both printing and storage
    train_r2 = np.round(LR_model.score(X_train, y_train), 4)
    val_r2 = np.round(LR_model.score(X_val, y_val), 4)

    print('Training set R^2 for degree {}: {}'.format(i, train_r2))
    print('Validation set R^2 for degree {}: {} \n'.format(i, val_r2))

    validation_acc.append(val_r2)

In [None]:
# print out values of validation accuracy

### FILL IN ###

In [None]:
# Display validation R^2 for polynomials from degree 1 to 10

# validation_acc[k] belongs to degree k + 1, so the x values start at 1
degrees = list(range(1, len(validation_acc) + 1))
plt.plot(degrees, validation_acc)
plt.axis([1, degree, -0.1, 1])  # x from 1 to the max degree, y from -0.1 to 1
plt.title('Validation Accuracy of Linear Regression on Different Polynomial Expansions of X')
plt.show()

### 2. Logistic Regression on the MNIST Handwritten Digits Dataset

### MNIST Dataset
The MNIST dataset is too large to host on GitHub; you can download it here: https://pjreddie.com/media/files/mnist_train.csv

In [None]:
# Load the MNIST training CSV into a NumPy array. Passing the path lets
# np.loadtxt open and close the file itself instead of leaving a handle open.
# NOTE(review): skiprows=1 discards the first line of the file — confirm the
# downloaded CSV actually starts with a header row.
np_mnist = np.loadtxt("mnist_train.csv", delimiter=",", skiprows=1)

In [None]:
# Convert to pandas dataframe for easier manipulation

### FILL IN ###
df_mnist.shape

In [None]:
df_mnist.rename(columns={0: 'label'}, inplace=True)

# Display 5 sample values for 28x28 images
### FILL IN ###

In [None]:
# Separate features and labels
df_mnist_X = df_mnist.drop('label', axis=1)
df_mnist_y = df_mnist['label']

In [None]:
# Use previous split helper method to split data in train, val, test sets
X_train, X_val, X_test, y_train, y_val, y_test = ### FILL IN ###

#### We attempt to classify the different digits in MNIST using logistic regression

In [None]:
# When initializing, need to change optimization method for multiclass labels

# Initialize log reg model with optimization method that can handle multinomial loss

### FILL IN ###

# Fit on training data

print('Logistic Regression Training Accuracy: {}'.format(np.round(LRmodel.score(X_train, y_train), 5)))
print('Logistic Regression Validation Accuracy: {}'.format(np.round(LRmodel.score(X_val, y_val), 5)))

In [None]:
# Cross validation code that we won't run now, but you can try it in your own time
# This should take approximately 10 minutes to run (go grab dinner in between?)
# Recalling that C is inversely proportional to regularization strength

# The snippet is kept inside a string so the cell does not run it.
# It sweeps C over 0.3, 0.4, ..., 1.2 (range() cannot step by floats) and
# uses the LogReg alias, since LR refers to LinearRegression in this notebook.
'''
for i in np.round(np.linspace(0.3, 1.2, 10), 1):
    LRmodel = LogReg(C=i, solver='lbfgs')
    LRmodel.fit(X_train, y_train)
    print('Logistic Regression Training Accuracy with C = {}: {}'
         .format(i, LRmodel.score(X_train, y_train)))
    print('Logistic Regression Validation Accuracy with C = {}: {}'
         .format(i, LRmodel.score(X_val, y_val)))
'''

### 3. Decision Trees on the Iris dataset

In [None]:
df_iris = ### FILL IN ###
df_iris.head(10)

In [None]:
df_iris_X = ### FILL IN ###
df_iris_y = ### FILL IN ###

X_train, X_val, X_test, y_train, y_val, y_test = split(df_iris_X, df_iris_y)

In [None]:
clf = tree.DecisionTreeClassifier()
clf

In [None]:
clf = clf.fit(X_train, y_train)
print('Decision Tree train set accuracy: {}'.format(clf.score(X_train, y_train)))
print('Decision Tree validation set accuracy: {}'.format(clf.score(X_val, y_val)))

In [None]:
# Code Isaac used to generate the tree.png file using graphviz

# dot_data = tree.export_graphviz(clf, out_file=None, 
#                          feature_names=df_iris_X.columns,  
#                          class_names=df_iris_y,  
#                          filled=True, rounded=True,  
#                          special_characters=True)  
# graph = graphviz.Source(dot_data)  
# graph 

In [None]:
img_tree = mpimg.imread('tree.png')

plt.figure(figsize=(20,20))
plt.imshow(img_tree)

In [None]:
# Score your current classifier on your test set, after training on the validation set

### FILL IN ###
### FILL IN ###