# Density estimation
In this case, we model $p(t|x,y)$ by a neural network. In the pet adoption case, $x$ is the feature of the pet and $y$ is the adoption speed. The $t$ is the type of the pet, i.e., cat or dog.

## 0.Data preparation

In [1]:
import pandas as pd
path = '/app/Final/code'
# path='.'

In [2]:
# This is the dataset processed from the midterm 
train_size = 14993
data_df = pd.read_csv(path + '/data/data_df_proc.csv')[:train_size]
data_df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,RESCUER_avg_photo_STD,RESCUER_Sterilized_MEAN,RESCUER_Dewormed_MEAN,RESCUER_Vaccinated_MEAN,INTERACTION_Fee_MEAN,INTERACTION_Fee_MIN,INTERACTION_Fee_MAX,INTERACTION_avg_fee_MEAN,INTERACTION_avg_fee_STD,INTERACTION_avg_fee_MAX
0,2,Nibble,3,299,0,1,1,7,0,1,...,1.06066,0.0,0.0,0.0,10.438579,0,500,9.860575,37.517683,500.0
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,,,,,7.621302,0,500,7.029586,49.694115,500.0
2,1,Brisco,1,307,0,1,2,7,0,2,...,2.179079,0.144796,0.959184,0.952941,38.65033,0,700,37.996073,101.708699,700.0
3,1,Miko,4,307,0,2,1,2,0,2,...,4.707039,0.3,0.877551,0.489796,37.690031,0,1500,36.557373,114.952243,1000.0
4,1,Hunter,1,307,0,1,1,0,0,2,...,1.233783,0.046875,0.795455,0.75,7.871345,0,500,6.997772,35.615495,500.0


In [3]:
# Load data from csv
# data_df = pd.read_csv(path+'/data/train/train.csv')
# data_df.columns
cols_to_drop = ['Name','RescuerID','VideoAmt','Description','PetID','PhotoAmt']
to_drop_columns = ['PetID', 'Name', 'RescuerID', 'Description',
                    'BreedName_full','Breed1Name','Breed2Name']
data_df.drop(cols_to_drop+to_drop_columns, axis=1, inplace=True)

In [4]:
# Deal with the NaN values
data_df.fillna(-1, inplace=True)

## 1. Neural network
Now we have the data to estimate $p(t|x,y)$ where $t$ is the type of the pet, $y$ is the adoption speed and $x$ is the remaining columns in data_df. We use a neural network to model $p(t|x,y)$.

In [5]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F

In [6]:
# Embedding the categorical variables using nn.Embedding
cat_cols = ['Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'State', 'Breed_full','Color_full', 'hard_interaction']
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for cat_col in cat_cols:
    label_encoders[cat_col] = LabelEncoder()
    data_df[cat_col] = label_encoders[cat_col].fit_transform(data_df[cat_col])
    
emb_c = {n: len(col.unique()) for n,col in data_df.items() if n in cat_cols}
emb_cols = emb_c.keys() # names of columns chosen for embedding
emb_szs = [(c, min(30, (c+1)//2)) for _,c in emb_c.items()] #embedding sizes for the chosen columns


In [7]:
# Split data into train and validation
train_df = data_df.iloc[:len(data_df)*4//5, :]
valid_df = data_df.iloc[len(data_df)*4//5:, :]
train_df.shape, valid_df.shape


X_train = train_df.drop(columns='AdoptionSpeed')
y_train = train_df['AdoptionSpeed']
X_valid = valid_df.drop(columns='AdoptionSpeed')
y_valid = valid_df['AdoptionSpeed']

n_cont = len(X_train.columns)-len(emb_cols) # number of continuous columns

In [8]:
class PetFinderData(Dataset):
    def __init__(self, X, Y, emb_cols):
        X = X.copy()
        self.X1 = torch.tensor(X.loc[:,emb_cols].copy().values).long() #categorical columns
        self.X2 = torch.tensor(X.drop(columns=emb_cols).copy().values).float() #numerical columns
        self.y = torch.tensor(Y.values).to(torch.float32)
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]

In [9]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [10]:
class PetFinderModel(nn.Module):
    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 30)
        self.lin4 = nn.Linear(30, 1)
        # self.lin4 = nn.Sequential(nn.Linear(30, 1), nn.Softmax(dim=1))
        self.bn1 = nn.SELU()
        self.bn2 = nn.SELU()
        self.bn3 = nn.SELU()
        self.bn4 = nn.SELU()
        self.emb_drop = nn.Dropout(0.2)
        self.drops = nn.Dropout(0.3)
        

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        
        x = torch.cat([x, x2], 1)
        x = self.lin1(x)
        x = self.bn2(x)
        x = self.drops(x)
        x = self.lin2(x)
        x = self.bn3(x)      
        x = self.drops(x)  
        # x = self.lin3(x)
        # x = self.bn4(x)
        # x = self.drops(x)
        x = self.lin4(x)
        # map x to [0,4]
        x = torch.sigmoid(x)*4
        return x

In [11]:
def get_optimizer(model, lr = 0.0001, wd = 0.0):
    optim = torch_optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    return optim

In [12]:
def train_model(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x1, x2, y in train_dl:
        batch = y.shape[0]
        # print(batch)
        output = model(x1, x2)
        loss = F.mse_loss(output, y.view(-1,1))
        # print(output, y.view(-1,1))
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [13]:
def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x1, x2, y in valid_dl:
        current_batch_size = y.shape[0]
        output = model(x1, x2)

        loss = F.mse_loss(output, y.view(-1,1))
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.round(output)[0]
        # pred = torch.round(torch.max(output, 1)[0])
        correct += (pred == y).float().sum().item()

    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [14]:
def train_loop(model, epochs, lr=0.01, wd=0.01, train_dl=None, valid_dl=None):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("training loss: %.3f"  % loss)
        val_loss(model, valid_dl)

In [15]:
model = PetFinderModel(emb_szs, n_cont)
device = get_default_device()
to_device(model, device)

PetFinderModel(
  (embeddings): ModuleList(
    (0): Embedding(176, 30)
    (1): Embedding(135, 30)
    (2): Embedding(3, 2)
    (3-4): 2 x Embedding(7, 4)
    (5): Embedding(6, 3)
    (6): Embedding(14, 7)
    (7): Embedding(812, 30)
    (8): Embedding(63, 30)
    (9): Embedding(142, 30)
  )
  (lin1): Linear(in_features=328, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=30, bias=True)
  (lin4): Linear(in_features=30, out_features=1, bias=True)
  (bn1): SELU()
  (bn2): SELU()
  (bn3): SELU()
  (bn4): SELU()
  (emb_drop): Dropout(p=0.2, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [16]:
train_ds = PetFinderData(X_train, y_train, emb_cols)
valid_ds = PetFinderData(X_valid, y_valid, emb_cols)

# Get data into device
batch_size = 512
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)



In [17]:
train_loop(model, epochs=5, lr=0.005, wd=0.0001, train_dl=train_dl, valid_dl=valid_dl)

training loss: 3.647
valid loss 3.659 and accuracy 0.279
training loss: 3.569
valid loss 3.659 and accuracy 0.279
training loss: 3.570
valid loss 3.659 and accuracy 0.279
training loss: 3.570
valid loss 3.659 and accuracy 0.279
training loss: 3.569
valid loss 3.659 and accuracy 0.279


In [18]:
# Save model
torch.save(model.state_dict(),'./model.pt')

In [19]:
# for x1, x2, y in valid_dl:
#     output = model(x1, x2)
#     pred = torch.max(output, 1)
#     # correct += (pred == y).float().sum().item()
#     print(pred)
#     print(y)
#     break