In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
scripts_path = os.path.abspath(os.path.join('../scripts'))
if scripts_path not in sys.path:
    sys.path.insert(0,scripts_path)

In [4]:
from station_location import *

In [5]:
X_train = pd.read_csv('../processed-data/X_train.csv', index_col=0).values
X_test = pd.read_csv('../processed-data/X_test.csv', index_col=0).values
y_train = pd.read_csv('../processed-data/y_train.csv', index_col=0).values.reshape(-1,)
y_test = pd.read_csv('../processed-data/y_test.csv', index_col=0).values.reshape(-1,)

In [6]:
X_train.shape

(239259, 7)

In [7]:
y_train.shape

(239259,)

In [8]:
X_test.shape

(59815, 7)

In [9]:
y_test.shape

(59815,)

In [10]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

In [37]:
def show_accuracy(pred):
    print(pred[pred==1].shape[0], 'eq, total is ', pred[pred==0].shape[0])
    print(((y_test==1) & (1==pred)).sum())
    print(f"Train Accuracy: {accuracy_score(y_test, pred)}")
    print(f"Train Precision: {precision_score(y_test, pred)}")
    print(f"Train Recall: {recall_score(y_test, pred)}")
    print(f"Train F1 score: {f1_score(y_test, pred)}")

In [38]:
show_accuracy(pred)

59815 eq, total is  0
343
Train Accuracy: 0.005734347571679344
Train Precision: 0.005734347571679344
Train Recall: 1.0
Train F1 score: 0.011403304631138002


In [95]:
from sklearn.linear_model import RidgeClassifier

In [96]:
X_train_expanded = feature_expansion(X_train)
X_test_expanded = feature_expansion(X_test)

In [97]:
X_train_expanded.shape

(239259, 31)

In [98]:
clf = RidgeClassifier(class_weight={0:0.009, 1:0.991}, alpha=0.9)
clf.fit(X_train_expanded, y_train)
pred = clf.predict(X_test_expanded)
pred[pred > 0.5] = 1
pred[pred <= 0.5] = 0

  overwrite_a=True).T


In [99]:
pred.shape

(59815,)

In [100]:
show_accuracy(pred)

36939 eq, total is  22876
229
Train Accuracy: 0.384368469447463
Train Precision: 0.006199409837840764
Train Recall: 0.6676384839650146
Train F1 score: 0.012284748672281529


In [101]:
def feature_expansion(tx):
    """
    Expand the data by adding, sqrt, cos and sin. Assumes that the data is biased.

    Parameters:
    tx: The data

    Returns:
    Expanded data tx
    """
    x_copy = tx.copy()
    for feature in tx.T[1:]:
        x_copy = np.hstack((x_copy, (np.sqrt(np.abs(feature)) * np.sign(feature)).reshape(feature.shape[0], 1)))
        x_copy = np.hstack((x_copy, np.cos(feature).reshape((feature.shape[0], 1))))
        x_copy = np.hstack((x_copy, np.sin(feature).reshape((feature.shape[0], 1))))
        x_copy = np.hstack((x_copy, feature.reshape((x_copy.shape[0], 1)) ** 2))
    return x_copy

In [197]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(7, 100)  # 6*6 from image dimension
        #self.fc2 = nn.Linear(120, 256)
        #self.fc3 = nn.Linear(256, 500)
        #self.fc4 = nn.Linear(1000, 500)
        #self.fc5 = nn.Linear(500, 200)
        #self.fc6 = nn.Linear(200, 100)
        self.fc7 = nn.Linear(100, 10)
        self.fc8 = nn.Linear(10, 2)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.relu(self.fc1(x))
        #x = F.relu(self.fc2(x))
        #x = F.relu(self.fc3(x))
        #x = F.relu(self.fc4(x))
        #x = F.relu(self.fc5(x))
        #x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return x

In [211]:
def train(features, labels, model, lossfunc, optimizer, num_epoch):
    torch_inputs = torch.from_numpy(features)
    torch_outputs = labels
    
    for epoch in range(num_epoch):
        # TODO: Step 1 - create torch variables corresponding to features and labels
        # TODO: Step 2 - compute model predictions and loss
        torch_pred = model(torch_inputs)
        loss = lossfunc(torch_pred, torch_outputs)
        # TODO: Step 3 - do a backward pass and a gradient update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            print ('Epoch [%d/%d], Loss: %.4f' 
                   %(epoch+1, num_epoch, loss))


In [212]:
labels = y_train.reshape(-1,1).astype(np.float32)
labels = np.hstack((labels, labels))
labels[labels[:,0]==1,0]=2
labels[labels[:,0]==0,0]=1
labels[labels[:,0]==2,0]=0
labels

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]], dtype=float32)

In [215]:
import torch.optim as optim

# create your optimizer
model = Net()
optimizer = optim.Adam(model.parameters(), lr=0.01)
lossfunc = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(np.array([0.001, 1.0], dtype=np.float32)))
lossfunc = torch.nn.MSELoss()

train(features = X_train.astype(np.float32),
      labels = torch.tensor(labels),
      model = model,
      lossfunc = lossfunc, 
      optimizer = optimizer,
      num_epoch = 240)

Epoch [1/240], Loss: 42.4427
Epoch [11/240], Loss: 0.5244
Epoch [21/240], Loss: 0.4336
Epoch [31/240], Loss: 0.3528
Epoch [41/240], Loss: 0.2831
Epoch [51/240], Loss: 0.2242
Epoch [61/240], Loss: 0.1754
Epoch [71/240], Loss: 0.1355
Epoch [81/240], Loss: 0.1035
Epoch [91/240], Loss: 0.0782
Epoch [101/240], Loss: 0.0585
Epoch [111/240], Loss: 0.0435
Epoch [121/240], Loss: 0.0322
Epoch [131/240], Loss: 0.0239
Epoch [141/240], Loss: 0.0179
Epoch [151/240], Loss: 0.0136
Epoch [161/240], Loss: 0.0106
Epoch [171/240], Loss: 0.0086
Epoch [181/240], Loss: 0.0072
Epoch [191/240], Loss: 0.0063
Epoch [201/240], Loss: 0.0057
Epoch [211/240], Loss: 0.0053
Epoch [221/240], Loss: 0.0051
Epoch [231/240], Loss: 0.0050


In [216]:
pred = model.forward(torch.from_numpy(X_test.astype(np.float32))).detach().numpy()

In [217]:
pred

array([[ 0.9805251 ,  0.00481522],
       [ 0.9805251 ,  0.00481522],
       [ 0.9805251 ,  0.00481522],
       ..., 
       [ 0.9805251 ,  0.00481522],
       [ 0.9805251 ,  0.00481522],
       [ 0.9805251 ,  0.00481522]], dtype=float32)

In [234]:
np.sum(pred[:,1]>0.00481522)

59815

In [165]:
(pred >= 0.016).sum()

357

In [166]:
pred[pred >= 0.016] = 1
pred[pred < 0.016] = 0
y_test.sum()

343.0

In [167]:
show_accuracy(pred)

357 eq, total is  59458
122451
Train Accuracy: 0.9884309955696732
Train Precision: 0.011204481792717087
Train Recall: 0.011661807580174927
Train F1 score: 0.011428571428571429
