In [22]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
import pickle

In [23]:
base_dir = os.path.dirname(os.getcwd())
processed_dir = os.path.join(base_dir, 'data', 'processed')

In [24]:
data = pd.read_csv(os.path.join(processed_dir, 'train.csv'))

In [25]:
features = data.drop(columns=["country_destination"])
labels = data["country_destination"]

In [26]:
X_train_val, X_test, y_train_val, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)
#Train-Val-Test splits that follow the same distribution

In [27]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [28]:
path = os.path.join(base_dir, 'src', 'model', 'scaler.pkl')
with open(path, 'wb') as f:
    pickle.dump(scaler, f)

# Oversampling

In [29]:
oversample_strategy = {0: 3000, 1: 3000, 2: 3000, 3: 3000, 4: 5000, 5: 3000, 6: 3000, 7: 3000, 8: 3000, 9: 20000, 10: 10000}

oversampler = RandomOverSampler(sampling_strategy= oversample_strategy)
X_train, y_train = oversampler.fit_resample(X_train, y_train)
#Oversampling minority classes to make-up for significant class imbalance

In [30]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Focal Loss

In [31]:
class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma, num_classes):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.num_classes = num_classes
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, inputs, targets):
        ce_loss = self.ce_loss(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

# Model Architecture

In [33]:
input_size = X_train_tensor.shape[1]
hidden_size = 256
output_size = 11
num_epochs = 40

In [34]:
class AirbnbNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AirbnbNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Training Loop

In [35]:
model = AirbnbNN(input_size, hidden_size, output_size)
criterion = FocalLoss(alpha=.25, gamma=2, num_classes=output_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()

    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

Epoch [1/40], Train Loss: 0.5009, Val Loss: 0.4978
Epoch [2/40], Train Loss: 0.4669, Val Loss: 0.4919
Epoch [3/40], Train Loss: 0.4419, Val Loss: 0.4867
Epoch [4/40], Train Loss: 0.4225, Val Loss: 0.4809
Epoch [5/40], Train Loss: 0.4071, Val Loss: 0.4731
Epoch [6/40], Train Loss: 0.3949, Val Loss: 0.4629
Epoch [7/40], Train Loss: 0.3854, Val Loss: 0.4501
Epoch [8/40], Train Loss: 0.3786, Val Loss: 0.4350
Epoch [9/40], Train Loss: 0.3740, Val Loss: 0.4183
Epoch [10/40], Train Loss: 0.3707, Val Loss: 0.4004
Epoch [11/40], Train Loss: 0.3682, Val Loss: 0.3820
Epoch [12/40], Train Loss: 0.3660, Val Loss: 0.3635
Epoch [13/40], Train Loss: 0.3639, Val Loss: 0.3454
Epoch [14/40], Train Loss: 0.3616, Val Loss: 0.3281
Epoch [15/40], Train Loss: 0.3591, Val Loss: 0.3120
Epoch [16/40], Train Loss: 0.3563, Val Loss: 0.2977
Epoch [17/40], Train Loss: 0.3532, Val Loss: 0.2853
Epoch [18/40], Train Loss: 0.3499, Val Loss: 0.2748
Epoch [19/40], Train Loss: 0.3466, Val Loss: 0.2662
Epoch [20/40], Train 

# Inference

In [36]:
with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)

In [37]:
predicted_classes = torch.argmax(y_pred, dim=1)

precision = precision_score(y_test_tensor, predicted_classes, average='weighted')
recall = recall_score(y_test_tensor, predicted_classes, average='weighted')
f1 = f1_score(y_test_tensor, predicted_classes, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Precision: 0.5574
Recall: 0.6785
F1-Score: 0.5752


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
dir = os.path.join(base_dir, 'src', 'model')
torch.save(model.state_dict(), os.path.join(dir, "model-0.1.0.pth"))