In [None]:
import os
# The jupyter notebook is launched from your $HOME directory.
# Change the working directory to the AAPP-Pytorch directory
# which was created in your username directory under /scratch/vp91.
# os.path.expandvars substitutes $USER from the environment into the path.
os.chdir(os.path.expandvars("/scratch/vp91/$USER/AAPP-Pytorch"))


# Intro to AI in Pytorch

In this notebook we will explore the basics of doing classification with PyTorch.

The first thing we want to demonstrate is that PyTorch makes GPU computation very easy: if a GPU is available, a matrix multiplication can be run on it simply by moving the matrix to the GPU device first.

In [None]:
from datetime import datetime
import numpy as np
import torch

# Pick the compute device once. The result is always a torch.device (never a
# bare string), so later code can rely on attributes such as `device.type`
# regardless of whether a GPU is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Square random matrix used by the (optional) CPU-vs-GPU matmul timings below.
# NOTE: N x N with N = 1e4 is 1e8 elements, so this allocation is large.
N = int(1e4)
A_cpu = torch.rand(N, N)

# Both benchmarks are left commented out; uncomment them to compare timings.
# CPU matrix multiplication:
# start_time = datetime.now()
# A_cpu @ A_cpu
# print(f"Total time: {datetime.now() - start_time}")

# The same multiplication after moving the matrix to `device`:
# NOTE(review): CUDA work is launched asynchronously, so a fair GPU timing
# presumably needs torch.cuda.synchronize() before reading the clock — confirm.
# A_gpu = A_cpu.to(device)
# start_time = datetime.now()
# A_gpu @ A_gpu
# print(f"Total time: {datetime.now() - start_time}")

In [None]:
# Display which device `device` currently refers to.
device

We now turn to deep learning. We first create a simple regression problem: a synthetic dataset in which the targets follow a noisy straight line through the inputs.

In [None]:
import random
import math
from IPython import display
from matplotlib import pyplot as plt
from torch import nn, optim
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

## Linear Data

In [None]:
# Synthetic 1-D regression data: y = k * x + b, where k is a single random
# slope shared by every sample and b is Gaussian noise scaled by 0.4.
N = 1000  # number of samples
D = 1  # dimensions

tensor = torch.ones(N, D, dtype=torch.float32)
# Same shape/dtype as `tensor`, every entry equal to one random slope in [0, 1).
k = tensor.new_full((N, D), random.random())
b = torch.randn(N, D, dtype=torch.float32) * 0.4
x = torch.randn(N, D, dtype=torch.float32)

y = x * k + b

print(tuple(y.shape), tuple(b.shape))

In [None]:
# Scatter plot of the samples: x on the horizontal axis, y on the vertical axis.
plt.scatter(x,y)

In [None]:
# Pair each x with its y and iterate over the pairs in fixed-size mini-batches,
# in their original order (shuffle=False is the DataLoader default).
batchsize = 100
dataset = TensorDataset(x, y)
train_loader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=False)

## Linear Regression

The cells below define the model and a function that encapsulates the training procedure for one epoch (a single pass over the entire training dataset).

In [None]:
class LinearRegression(torch.nn.Module):
    """Linear regression model consisting of a single ``torch.nn.Linear`` layer.

    Args:
        in_features: Size of each input sample. When omitted, the
            notebook-level constant ``D`` is used (the original behaviour).
        out_features: Size of each output sample. When omitted, the
            notebook-level constant ``D`` is used (the original behaviour).
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Resolve the fallback at construction time (not at class-definition
        # time) so that LinearRegression() keeps reading the current global D.
        if in_features is None:
            in_features = D
        if out_features is None:
            out_features = D
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)

In [None]:
def train(epoch, model, criteria, train_loader, optimizer, device,
          loss_history=None, log_interval=10):
    """Run one training epoch and periodically record the last batch loss.

    Args:
        epoch: Index of the current epoch; decides whether the loss is logged.
        model: Module to optimise; it is put into training mode.
        criteria: Callable ``criteria(output, target)`` returning the loss.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.
        loss_history: List that receives the logged loss values. When omitted,
            the notebook-level list ``loss_l`` is used (the original behaviour).
        log_interval: The loss is logged when ``epoch % log_interval == 0``.

    Returns:
        The list holding the logged loss values.
    """
    if loss_history is None:
        # Backward-compatible default: append to the notebook's global list.
        loss_history = loss_l

    model.train()
    loss = None  # stays None if the loader yields no batches
    for data, target in train_loader:
        # send to device
        data = data.to(device)
        target = target.to(device)

        # Optimize
        optimizer.zero_grad()
        output = model(data)
        loss = criteria(output, target)
        loss.backward()
        optimizer.step()

    # The value logged is the loss of the final batch only, not an epoch mean.
    # Skip logging entirely when there was no batch to compute a loss from.
    if loss is not None and epoch % log_interval == 0:
        last_loss = loss.item()
        loss_history.append(last_loss)
        print(f'Train Epoch: {epoch} \tLoss: {last_loss:.6f}')
    return loss_history

In [None]:
# Model, loss function and optimiser for the regression problem.
linear_model = LinearRegression()
linear_model = linear_model.to(device)  # Module.to hands back the module itself
define_criterion = torch.nn.MSELoss()
SGD_optimizer = torch.optim.SGD(params=linear_model.parameters(), lr=0.001)

In [None]:
# Train for 200 epochs. train() returns the list of logged losses, which is
# kept under the name loss_l for plotting afterwards.
loss_l = []
for epoch in range(200):
    loss_l = train(
        epoch=epoch,
        model=linear_model,
        criteria=define_criterion,
        train_loader=train_loader,
        optimizer=SGD_optimizer,
        device=device,
    )


In [None]:
# One point per logged loss value. Deriving the x-axis from the list itself
# keeps the plot valid if the number of epochs or the logging interval changes
# (a hard-coded length raises a shape-mismatch error in that case).
plt.plot(range(len(loss_l)), loss_l)

## Use the Bottle Dataset

### Reading Data  
Read and explore the data as much as you want. Here we want to select suitable inputs for a linear regression model.

In [None]:
# pandas is imported in this cell because no earlier cell brings `pd` into
# scope; without it a fresh "Restart & Run All" stops here with a NameError.
import pandas as pd

# Load the CalCOFI bottle table and preview its first rows.
dataframe_raw = pd.read_csv('/scratch/vp91/AAPP2023/Data/CalCOFI_bottle.csv')
dataframe_raw.head()

In [None]:
# Summary statistics (pandas `describe`) for the raw table.
dataframe_raw.describe()

**TODO**   
Change the columns below to select suitable columns for Simple Linear Regression or Multiple Linear Regression. Remember that the input variables need to be independent of each other for linear regression to work.

In [None]:
# Keep only the candidate variables, then count the missing values per column.
selected_columns = [
    'T_degC', 'Depthm', 'Salnty', 'O2ml_L', 'STheta', 'O2Sat', 'Oxy_µmol/Kg',
]
dataframe = dataframe_raw.filter(items=selected_columns)
dataframe.isna().sum()

**TODO**  
How do you want to handle the Null values here? Modify the function to your needs.

In [None]:
def customize_dataset(data):
    """Return a copy of ``data`` with missing values replaced by column means.

    Each numeric column has its NaN entries filled with the mean of that
    column's non-missing values. Non-numeric columns are left as they are, and
    a column that is entirely NaN stays NaN (its mean is undefined).

    Args:
        data: Input ``pandas.DataFrame``; it is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the same columns and index as ``data``.
    """
    # fill na values
    # A plain fillna with the per-column means replaces the previous
    # groupby(axis=1).transform construction, which is deprecated in recent
    # pandas releases and far slower on large tables.
    column_means = data.mean(numeric_only=True)
    # fillna (not in-place) returns a new frame, so `data` stays untouched.
    return data.fillna(column_means)

In [None]:
# Impute the missing values, then re-count the nulls left in each column.
dataframe = customize_dataset(data=dataframe)
dataframe.isna().sum()

**TODO**  
Convert the selected dataframe columns to a PyTorch tensor. Hint: pandas has a function DataFrame.to_numpy()

In [None]:
# Named `features` rather than `input` so Python's built-in input() is not
# shadowed for the rest of the notebook session.
features = dataframe[["T_degC", "Depthm"]].to_numpy()
# torch.nn.Linear parameters are float32 by default, so cast the tensor here
# to avoid a dtype mismatch with a float64 array in the forward pass.
x = torch.from_numpy(features).to(torch.float32)
features.shape

**TODO**  
Modify the cells below (copied from the example above) to build and train your own model.

In [None]:
class LinearRegression(torch.nn.Module):
    """Linear regression model consisting of a single ``torch.nn.Linear`` layer.

    Args:
        in_features: Size of each input sample. When omitted, the
            notebook-level constant ``D`` is used (the original behaviour).
        out_features: Size of each output sample. When omitted, the
            notebook-level constant ``D`` is used (the original behaviour).
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Resolve the fallback at construction time (not at class-definition
        # time) so that LinearRegression() keeps reading the current global D.
        if in_features is None:
            in_features = D
        if out_features is None:
            out_features = D
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)

In [None]:
def train(epoch, model, criteria, train_loader, optimizer, device,
          loss_history=None, log_interval=10):
    """Run one training epoch and periodically record the last batch loss.

    Args:
        epoch: Index of the current epoch; decides whether the loss is logged.
        model: Module to optimise; it is put into training mode.
        criteria: Callable ``criteria(output, target)`` returning the loss.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.
        loss_history: List that receives the logged loss values. When omitted,
            the notebook-level list ``loss_l`` is used (the original behaviour).
        log_interval: The loss is logged when ``epoch % log_interval == 0``.

    Returns:
        The list holding the logged loss values.
    """
    if loss_history is None:
        # Backward-compatible default: append to the notebook's global list.
        loss_history = loss_l

    model.train()
    loss = None  # stays None if the loader yields no batches
    for data, target in train_loader:
        # send to device
        data = data.to(device)
        target = target.to(device)

        # Optimize
        optimizer.zero_grad()
        output = model(data)
        loss = criteria(output, target)
        loss.backward()
        optimizer.step()

    # The value logged is the loss of the final batch only, not an epoch mean.
    # Skip logging entirely when there was no batch to compute a loss from.
    if loss is not None and epoch % log_interval == 0:
        last_loss = loss.item()
        loss_history.append(last_loss)
        print(f'Train Epoch: {epoch} \tLoss: {last_loss:.6f}')
    return loss_history

In [None]:
# Fresh model, loss function and optimiser for this exercise.
# NOTE: LinearRegression() called without arguments sizes its layer from the
# global D, so adjust the model if your inputs have a different width.
linear_model = LinearRegression()
linear_model = linear_model.to(device)  # Module.to hands back the module itself
define_criterion = torch.nn.MSELoss()
SGD_optimizer = torch.optim.SGD(params=linear_model.parameters(), lr=0.001)

In [None]:
# Train for 200 epochs and keep the logged losses in loss_l.
# NOTE: unless train_loader is rebuilt from the bottle tensors in this section,
# it still refers to the loader created for the synthetic linear data.
loss_l = []
for epoch in range(200):
    loss_l = train(
        epoch=epoch,
        model=linear_model,
        criteria=define_criterion,
        train_loader=train_loader,
        optimizer=SGD_optimizer,
        device=device,
    )


In [None]:
# One point per logged loss value. Deriving the x-axis from the list itself
# keeps the plot valid if the number of epochs or the logging interval changes
# (a hard-coded length raises a shape-mismatch error in that case).
plt.plot(range(len(loss_l)), loss_l)

- How does your model perform?  
- What could be improved?

# Unsupervised Learning - Clustering  


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time
from kmeans_pytorch import kmeans, kmeans_predict

## K-means Clustering  

1. Set K – number of clusters
2. Randomly assign k points as the centroids of the clusters
3. Measure distance between point a and the k clusters
4. Assign point a to the cluster with the minimum distance
5. Repeat 3-4 for all data points
6. Recalculate the cluster centroid
7. Repeat 5-6 until the clusters don’t change anymore
8. Calculate total clusters variance
9. Repeat 2-8 N times; the result is the clustering with the minimum variance


In [None]:
# set random seed
# Fix numpy's global seed so the toy data below is reproducible.
np.random.seed(123)

data_size, dims, num_clusters = 1000, 2, 3

# Points to cluster: standard-normal samples scaled down by 6.
x = torch.from_numpy(np.random.randn(data_size, dims) / 6)

# Five additional points, drawn afterwards from the same distribution.
y = torch.from_numpy(np.random.randn(5, dims) / 6)

In [None]:
# set device
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# k-means
# Cluster the points in x into num_clusters groups using the 'euclidean'
# distance option, running on `device`. The call's two results are unpacked
# here as the per-point cluster ids and the cluster centres.
cluster_ids_x, cluster_centers = kmeans(
    X=x, num_clusters=num_clusters, distance='euclidean', device=device
)

In [None]:
# predict cluster ids for y
# Assign the extra points in y to the centres found above, using the same
# 'euclidean' distance option and the same device.
cluster_ids_y = kmeans_predict(
    y, cluster_centers, 'euclidean', device=device
)

In [None]:
# plot
# Show the clustered points, the extra points (X markers) and the cluster
# centres (white discs with a black edge) on a fixed [-1, 1] x [-1, 1] window.
plt.figure(figsize=(4, 3), dpi=160)

# Points coloured by the cluster id they were assigned.
plt.scatter(x[:, 0], x[:, 1], c=cluster_ids_x, cmap='cool')
plt.scatter(y[:, 0], y[:, 1], c=cluster_ids_y, cmap='cool', marker='X')

# Cluster centres drawn on top.
centre_style = dict(c='white', alpha=0.6, edgecolors='black', linewidths=2)
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], **centre_style)

plt.axis([-1, 1, -1, 1])
plt.tight_layout()
plt.show()

## K-means CPU vs GPU

In [None]:
# dimensions, num clusters
# Problem shape shared by every benchmark run.
dims = 2
num_clusters = 3

# Number of points per run: 1e5, 1e6, 5e6 and 1e7.
data_sizes = [10**5, 10**6, 5 * 10**6, 10**7]

In [None]:
# Time k-means on the GPU and on the CPU for each data size.
# gpu_times / cpu_times end up with one entry per element of data_sizes.
gpu_times = []
cpu_times = []

# Requesting 'cuda:0' on a machine without a GPU would abort the whole loop,
# so the GPU run is skipped there and NaN is recorded instead. That keeps
# gpu_times the same length as data_sizes for the plot that follows.
gpu_available = torch.cuda.is_available()

for data_size in data_sizes:
    print(f'\ndata size: {data_size}')

    # data (note: this rebinds the notebook-level name x)
    x = np.random.randn(data_size, dims) / 6
    x = torch.from_numpy(x)

    # gpu
    if gpu_available:
        start_gpu = time()
        kmeans_gpu = kmeans(X=x, num_clusters=num_clusters, device=torch.device('cuda:0'))
        gpu_time = time() - start_gpu
    else:
        gpu_time = float('nan')
    gpu_times.append(gpu_time)
    print(f'gpu time: {gpu_time}')

    # cpu
    start_cpu = time()
    kmeans_cpu = kmeans(X=x, num_clusters=num_clusters, device=torch.device('cpu'))
    cpu_time = time() - start_cpu
    cpu_times.append(cpu_time)
    print(f'cpu time: {cpu_time}')

In [None]:
# plot
# Runtime against data size, one line per device.
plt.figure(figsize=(6, 3), dpi=160)

for timings, series_label, series_colour in (
    (gpu_times, 'gpu', 'xkcd:vermillion'),
    (cpu_times, 'cpu', 'xkcd:neon blue'),
):
    plt.plot(data_sizes, timings, marker='o', label=series_label, color=series_colour)

# Put a tick at every benchmarked size.
plt.xticks(data_sizes)
plt.legend(fontsize=12)
plt.grid(alpha=0.2)
plt.xlabel('data size', fontsize=14)
plt.ylabel('time (s)', fontsize=14)
plt.show()