# Project 1: Kaggle

Jonathan Booker, Maxwell Montemayor, Lucas Ancieta, Abraham Belayneh

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Data Processing

- We first filter out any data points containing NaN values.
- Then, we normalize every feature to zero mean and unit variance.
- Finally, we randomly split the data into training (80%) and test (20%) sets and convert them to tensors.


In [None]:
# Load the labelled training data.
df = pd.read_csv('https://raw.githubusercontent.com/L-anc/Motility/main/code/out.csv')

# Treat empty strings as missing values, drop every incomplete row, and
# discard the "uid" column so it is not used as a feature.
df = df.replace('', np.nan)
df = df.dropna(axis="rows", how="any")
df = df.drop(columns="uid")

# Standardize every feature column to zero mean and unit variance.
# The whole frame is normalized in one vectorized step instead of recomputing
# the column mean/std for every single element; ddof=0 matches np.std
# (population standard deviation).
df_X = df.drop(columns="label")
df_X = (df_X - df_X.mean()) / df_X.std(ddof=0)
print("===dataframe=== \n {}".format(df))
X = df_X.to_numpy()
y = df["label"].to_numpy()

# Hold out 20% of the samples for evaluation.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

# Features become float32 tensors; labels become int64 class indices.
train_X = torch.tensor(train_X, dtype=torch.float32)
train_y = torch.tensor(train_y, dtype=torch.long)
test_X = torch.tensor(test_X, dtype=torch.float32)
test_y = torch.tensor(test_y, dtype=torch.long)
print("test_x torch tensor {}".format(test_X))
print("test_x torch tensor shape {}".format(test_X.shape))

train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

# Shuffle only the training minibatches; evaluation order does not matter, so
# the test loader is kept deterministic.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Show the normalized features and the labels (previously both lines printed
# the raw dataframe).
print("x \n{}".format(df_X))
print("y \n{}".format(df["label"]))

## Network Definition
Let's instantiate a model and take a look at the layers.

In [None]:
# Seed PyTorch's global RNG *before* building the model so that the random
# weight initialization is reproducible.
torch.manual_seed(15552494823729223621)

# Small fully connected classifier: 4 input features -> 2 class scores.
# The last layer emits raw logits. nn.CrossEntropyLoss applies log-softmax
# internally, so a Sigmoid in front of it would only squash the logits into
# (0, 1) and weaken the training signal.
model = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 5),
    nn.ReLU(),
    nn.Linear(5, 2),
)
optimizer = torch.optim.Adam(model.parameters(), lr=2.4e-4)
loss_fn = nn.CrossEntropyLoss()

## Training
The optimizer (Adam) and the loss function (cross-entropy) were chosen in the cell above.

We could write our training procedure manually and directly index the `Dataset` objects, but the `DataLoader` object conveniently provides an iterable that automatically creates random minibatches.

We now write our training loop, which runs backpropagation on each minibatch for 10 epochs.

In [None]:
# Train for 10 epochs over the minibatches produced by train_loader.
model.train()
train_correct = 0
train_total = 0
train_acc = []  # running (cumulative) training accuracy, one entry per batch
for epoch in range(10):
    epoch_loss_sum = 0.0  # sum of per-sample losses seen in this epoch
    epoch_samples = 0
    for data, target in train_loader:
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Calculate loss (mean over the minibatch)
        loss = loss_fn(output, target)

        # Backward pass
        loss.backward()

        # Weight update
        optimizer.step()

        # Scale the batch mean by the batch size so that a smaller final
        # batch is weighted correctly in the epoch average.
        batch_size = target.size(0)
        epoch_loss_sum += loss.item() * batch_size
        epoch_samples += batch_size

        # Accuracy bookkeeping; detach() keeps this out of the autograd graph.
        predicted = output.detach().argmax(dim=1)
        train_total += batch_size
        train_correct += (predicted == target).sum().item()
        train_acc.append(train_correct / train_total)
    # Report the mean loss over the whole epoch rather than only the last
    # (noisy) minibatch; max() guards against an empty loader.
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1, epoch_loss_sum / max(epoch_samples, 1)))

## Testing
We can perform forward passes through the network without saving gradients.

In [None]:
# Evaluate the trained model on the held-out split.
# Putting layers like Dropout into evaluation mode
model.eval()

test_loss = 0
correct = 0
# Turning off automatic differentiation
with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        # loss_fn returns the *mean* loss of the batch; multiply by the batch
        # size so that dividing by the dataset size below yields a true
        # per-sample average.
        test_loss += loss_fn(output, target).item() * target.size(0)
        pred = output.argmax(dim=1)  # Get the index of the max class score
        correct += (pred == target).sum().item()

test_loss /= len(test_loader.dataset)
print('Test set: Average loss: %.4f, Accuracy : %d/%d (%.4f)' %
      (test_loss, correct, len(test_loader.dataset),
       100. * correct / len(test_loader.dataset)))

In [None]:
#   REAL TESTING
# Load the test set used to build the submission file.
test = pd.read_csv('https://raw.githubusercontent.com/L-anc/Motility/main/code/test.csv')

# Drop the identifier and standardize each feature column in one vectorized
# step (ddof=0 matches np.std) instead of recomputing the column mean/std for
# every element.
# NOTE(review): this standardizes with the test set's own statistics; reusing
# the training-set mean/std would keep both sets on the same scale - confirm
# the column layout matches before switching.
# NOTE(review): rows with missing values are not filtered here, unlike the
# training data - verify test.csv has none.
df_vals = test.drop(columns="uid")
df_vals = (df_vals - df_vals.mean()) / df_vals.std(ddof=0)

test_X = torch.tensor(df_vals.to_numpy(), dtype=torch.float32)





In [None]:
# Predict a class for every row of the submission test set.
# Inference only: make sure the model is in eval mode and skip gradient
# tracking.
model.eval()
with torch.no_grad():
    output = model(test_X)

# argmax over the class dimension gives a flat vector of predicted labels,
# which maps directly onto a single dataframe column.
pred = output.argmax(dim=1).numpy()
print(pred)

# Pair each prediction with the uid of the row it was computed from.
submission = pd.DataFrame({'UID': test['uid'], 'label': pred})
print(submission)

submission.to_csv('submission.csv', index=False)

In [None]:
# Plot the running training accuracy recorded after every minibatch.
# (The curve is train_acc, so it is labelled as accuracy, not as test loss.)
plt.figure(figsize=(10, 5))
plt.plot(train_acc, label='Train accuracy')
plt.xlabel('Training batch')
plt.ylabel('Running accuracy')
plt.legend()
plt.show()