In [0]:
import numpy as np
import pandas as pd

Mount your Google Drive. In this notebook, we assume that the 'report1' folder is placed directly under 'My Drive'.

In [0]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (the google.colab module is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Sanity check: list the top level of the mounted Drive.
!ls /content/drive/My\ Drive  #You should be able to see 'report1' folder by running this.

Sample code (Ridge regression) for the assignment.

In [0]:
def ridgeRegres(xArr,yArr, gamma):
    """Fit ridge regression weights in closed form.

    Solves the regularised normal equations ``(X X^T + gamma * I) w = X y``,
    where the samples are stored column-wise in ``X``.

    Parameters
    ----------
    xArr : array-like, shape (n_features, n_samples)
        Feature matrix with one sample per column.
    yArr : array-like, shape (n_samples, 1)
        Target values, one row per sample.
    gamma : float
        Regularisation strength added to the diagonal of ``X X^T``.

    Returns
    -------
    numpy.matrix of shape (n_features, 1), or None
        The weight vector. If the regularised system is singular, a message
        is printed and None is returned.
    """
    # np.asmatrix is the same function the removed np.mat alias pointed to;
    # np.mat itself no longer exists in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)

    xTx = xMat*xMat.T + gamma*np.eye(xMat.shape[0])
    # Let the solver decide whether the system is singular. Comparing the
    # determinant with 0.0 is unreliable: it underflows to exactly 0.0 for
    # perfectly well-conditioned matrices with small entries.
    try:
        ws = np.linalg.solve(xTx, xMat*yMat)
    except np.linalg.LinAlgError:
        print("This matrix is singular, cannot do inverse")
        return
    return ws

def RMSE(vec1, vec2):
    """Return the root-mean-square of the element-wise difference ``vec1 - vec2``.

    The squared differences are averaged over all elements before taking the
    square root. The subtraction follows NumPy broadcasting rules, so callers
    should pass inputs of matching shape.
    """
    residual = vec1 - vec2
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

In [0]:
# Folder on the mounted Drive that holds train.csv / test.csv (and later result.txt).
root_dir = "/content/drive/My Drive/report1/"
N = 20000   # number of leading training rows used for fitting; the rest are held out for validation
nfeat = 8   # the first `nfeat` columns are features; column index `nfeat` is the target

train_data = pd.read_csv(root_dir + "train.csv")
test_data = pd.read_csv(root_dir + "test.csv")

# Convert the training frame to a matrix once, then slice features and target from it.
train_matrix = np.matrix(train_data)
Xmat = train_matrix[:, :nfeat]
ymat = train_matrix[:, nfeat]   # This is the target

Prepare the features.

In [0]:
# One-hot (dummy) encode the categorical 'weather' column.
# Training and test rows are encoded together so that both end up with the
# same set of dummy columns in the same order.
# (scikit-learn's OneHotEncoder would be an alternative.)
all_data = pd.concat([train_data, test_data])
meta = pd.get_dummies(all_data['weather'])

# Design matrix: the four real-valued columns 1..4 followed by the weather
# dummies that belong to the training rows, cast to float64.
n_train_rows = len(train_data)
real_valued = Xmat[:, 1:5]
train_dummies = meta.iloc[:n_train_rows, :]
X = np.float64(np.hstack([real_valued, train_dummies]))
y = np.float64(ymat)

# Hold-out split: the first N rows are used for training, the remaining rows
# for validation. Features are transposed so that each sample is a column.
XTrain, XVal = X[:N, :].T, X[N:, :].T
yTrain, yVal = y[:N, :], y[N:, :]

Run Ridge regression.

In [0]:
# Fit the weights on the training split. gamma is the regularisation strength
# added to the diagonal of X X^T inside ridgeRegres.
w = ridgeRegres(XTrain,yTrain,gamma=10000) #ridge (L2-regularised linear) regression; w is None if the system is singular

Evaluate training and validation errors.

In [0]:
# Samples are stored column-wise, so w^T X yields one prediction per column.
yHatTrain = np.dot(w.T, XTrain)
yHatVal = np.dot(w.T, XVal)

# Predictions are transposed back to one-per-row before comparing with the targets.
for label, target, prediction in (
    ("Training error ", yTrain, yHatTrain),
    ("Validation error ", yVal, yHatVal),
):
    print(label, RMSE(target, prediction.T))

Now, compute predictions for the test data. They are saved to 'result.txt', which you can upload to the evaluation server.

In [0]:
# Assemble the test features the same way as the training ones: columns 1..4
# plus the weather dummies of the test rows (the rows of `meta` that follow
# the training rows).
test_real_valued = test_data.iloc[:, 1:5]
test_dummies = meta.iloc[len(train_data):, :]
Xtest = np.float64(np.hstack([test_real_valued, test_dummies]))

# Predict with samples as columns, then write one prediction per row.
yHatTest = np.dot(w.T, Xtest.T)
np.savetxt(root_dir+'result.txt', yHatTest.T) #save predictions in rows