In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from tqdm import tqdm

from sklearn.model_selection import train_test_split

# type hint
from typing import List, Dict, Sequence, Any, Union

# 1. Download the Forest Cover Type Dataset by UCI Machine Learning from Kaggle. Create a dataframe with only the forests with cover type 1 or 2.

In [2]:
data = pd.read_csv("Lab1_data/covtype.csv")
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [3]:
mask_cover_type = (data["Cover_Type"] == 1) |  (data["Cover_Type"] == 2) 
data_filtered = data[mask_cover_type].reset_index(drop = True)
np.unique(data_filtered.Cover_Type)

array([1, 2])

# 2. Do the following to get ready for training:
(a) Download the Forest Cover Type Dataset by UCI Machine Learning from Kaggle. Create a dataframe with only the forests with cover type 1 or 2.

In [4]:
from torch.utils.data import Dataset

class Forest_DataSet(Dataset):

	def __init__(self, df):
		self.df = df
	
	def __len__(self):
		return self.df.shape[0]
	
	def __getitem__(self, idx):
		row = self.df.iloc[idx]
		X = torch.tensor(row[:10]).float()
		y = torch.tensor(row[-1]).float() - 1 # 2 -> 1; 2 -> 0
		return X, y

In [5]:
Forest_ds = Forest_DataSet(data_filtered)
next(iter(Forest_ds))

(tensor([2804.,  139.,    9.,  268.,   65., 3180.,  234.,  238.,  135., 6121.]),
 tensor(1.))


(b.) Randomly split your data into a training and validation Dataset object.


In [6]:
data_train, data_val = train_test_split(data_filtered, test_size = 0.2)
data_train, data_val = data_train.reset_index(drop = True), data_val.reset_index(drop = True)
train_ds, val_ds = Forest_DataSet(data_train), Forest_DataSet(data_val)

(c.) Create a DataLoader with whatever batch size you desire.

In [7]:
from torch.utils.data import DataLoader

train_dl = DataLoader(train_ds, batch_size = 100, shuffle = True)
val_dl = DataLoader(val_ds, batch_size = 100, shuffle = False)

x, y = next(iter(train_dl))

# notice that since we set batch_size = 10
# each time the gradient descent will see this 10 rows of data shape=(10,2)
# and then update the gradient and corresponding parameters
print(x.shape, y.shape, "\n")

print(x, y)

torch.Size([100, 10]) torch.Size([100]) 

tensor([[ 2.8920e+03,  0.0000e+00,  5.0000e+00,  3.0000e+01,  1.0000e+00,
          1.8380e+03,  2.1300e+02,  2.3100e+02,  1.5600e+02,  9.1200e+02],
        [ 3.1470e+03,  2.6400e+02,  1.7000e+01,  2.3400e+02,  5.6000e+01,
          3.2210e+03,  1.7600e+02,  2.4600e+02,  2.0800e+02,  1.6080e+03],
        [ 3.0180e+03,  6.4000e+01,  8.0000e+00,  3.2400e+02,  4.6000e+01,
          5.0940e+03,  2.2800e+02,  2.2400e+02,  1.2900e+02,  4.3350e+03],
        [ 3.1900e+03,  1.1500e+02,  9.0000e+00,  4.9200e+02,  5.8000e+01,
          3.8200e+02,  2.3600e+02,  2.3200e+02,  1.2700e+02,  8.8200e+02],
        [ 3.1690e+03,  2.5200e+02,  3.0000e+00,  5.4000e+02, -3.7000e+01,
          4.6600e+02,  2.1300e+02,  2.4100e+02,  1.6600e+02,  1.9430e+03],
        [ 3.1700e+03,  3.4900e+02,  8.0000e+00,  3.0000e+02,  3.3000e+01,
          2.1060e+03,  2.0600e+02,  2.2700e+02,  1.6000e+02,  7.4100e+02],
        [ 2.9200e+03,  1.6200e+02,  8.0000e+00,  3.0000e+02,  4.

# 3. Set up the following:

(a.) A 3-layer Feed-Forward Neural Network for this data. Think about the size of the input/output layer. Only linear layers and activation functions are allowed right now!

In [9]:
import torch.nn as nn
# from torchsummary import summary

class ThreeLayersModel(nn.Module):

	def __init__(self, input_dim, hidden_dim, output_dim):

		super().__init__()

		self.Linear1 = nn.Linear(input_dim, hidden_dim)
		self.Linear2 = nn.Linear(hidden_dim, output_dim)
		self.relu = nn.ReLU()

	
	def forward(self, x):
		x = self.Linear1(x)
		x = self.relu(x)
		x = self.Linear2(x)

		return torch.squeeze(x)

In [11]:
model = ThreeLayersModel(10, 10, 1) 
# summary(model, input_size = (10, ), device = "cpu")


# 4. Write a function that iterates over a dataloader, doing the following: 

(a.) prints the average loss (average over each datapoint!),

In [12]:
import torch.optim as optim

model = ThreeLayersModel(10, 10, 1) 
loss_fun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.1)

def one_epoch_test(model, data_loader, optimizer, loss_fun):

	X, y = next(iter(data_loader))
	
	model.train()
	y_pred = model(X)
	loss = loss_fun(y_pred, y)
	print("loss: ", loss.item())

	optimizer.zero_grad()
	loss.backward()
	optimizer.step()


In [13]:
one_epoch_test(model = model, data_loader = train_dl, optimizer = optimizer, loss_fun = loss_fun)

loss:  71.24311065673828



# 5. Write a loop that trains your model for ten epochs, and at the end of each epoch it prints the average loss on the training set and on the validation set.

In [22]:
def one_pass(model: model, data_loader: DataLoader, backward: bool, loss_fun,  optimizer) -> torch.float:

	if backward is True:
		model.train()
	else:
		model.eval()

	total_loss = 0.0
	for x, y in data_loader:
		y_pred = model(x)
		loss = loss_fun(y_pred, y)
		total_loss += loss.item()

		if backward is True:
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
		
	avg_loss = total_loss / len(data_loader)
	return avg_loss


def one_pass_acc(model, dataloader, num_points):
    model.eval()
    total_incorrect = 0
    
    for x, y in dataloader:
        y_pred = torch.sigmoid(model(x))
        
        # summing up all the incorrect examples by taking difference and adding up all non-zero entries
        total_incorrect += torch.sum(torch.abs(y - y_pred)).item()
        
    percent_wrong = total_incorrect / num_points
    return 1 - percent_wrong * 0.01


def training(model: model, data_loader: DataLoader, loss_fun: List[float], optimizer, num_epoch: int) -> torch.float:
	
	for i in tqdm(range(num_epoch)):

		train_loss = one_pass(model=model, data_loader=train_dl, backward=True, loss_fun=loss_fun, optimizer=optimizer)
		val_loss = one_pass(model=model, data_loader=val_dl, backward=False,loss_fun=loss_fun, optimizer=optimizer)
		val_acc = one_pass_acc(model, val_dl, len(val_dl))
		print(f"epoch: {i+1 :<3} train_loss: {train_loss:.9f} val_loss: {val_loss:.9f} val_acc: {val_acc:.3f}")

In [23]:
loss_fun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

training(model = model, data_loader = train_dl, loss_fun = loss_fun, optimizer = optimizer, num_epoch = 5)

 20%|██        | 1/5 [01:44<06:58, 104.66s/it]

33.92795658111572 33622.60497188568 991
epoch: 1   train_loss: 0.577789194 val_loss: 0.654867937 val_acc: 0.661


 40%|████      | 2/5 [03:29<05:14, 104.81s/it]

33.43515807324052 33134.24165058136 991
epoch: 2   train_loss: 0.569949154 val_loss: 0.543030148 val_acc: 0.666


 60%|██████    | 3/5 [05:13<03:29, 104.54s/it]

32.39662631465015 32105.0566778183 991
epoch: 3   train_loss: 0.563154364 val_loss: 0.522036364 val_acc: 0.676


 80%|████████  | 4/5 [06:58<01:44, 104.45s/it]

35.41755041128211 35098.79245758057 991
epoch: 4   train_loss: 0.561065788 val_loss: 0.606141038 val_acc: 0.646


100%|██████████| 5/5 [08:45<00:00, 105.05s/it]

33.0127964977299 32715.681329250336 991
epoch: 5   train_loss: 0.556925718 val_loss: 0.536652711 val_acc: 0.670



