# Density estimation
In this notebook, we model $p(t|x,y)$ with a neural network. In the pet-adoption setting, $x$ denotes the features of the pet, $y$ is the adoption speed, and $t$ is the type of the pet, i.e., cat or dog.

## 0. Data preparation

In [1]:
import pandas as pd

# Root directory that holds the `data/` and `figure/` sub-folders used below.
path = "/app/Final/code"
# path = "."
# This is the dataset processed from the midterm; only the first `train_size`
# rows are kept.
train_size = 14993
# `.copy()` yields an independent frame, so the column assignments made in
# later cells never act on a slice of the frame returned by `read_csv`.
data_df = pd.read_csv(path + "/data/data_df_proc.csv").iloc[:train_size].copy()

cols_to_drop = ["Name", "RescuerID", "VideoAmt", "Description", "PetID", "PhotoAmt"]
to_drop_columns = [
    "PetID",
    "Name",
    "RescuerID",
    "Description",
    "BreedName_full",
    "Breed1Name",
    "Breed2Name",
]
# The two lists overlap, so de-duplicate (order preserved) before dropping.
data_df = data_df.drop(columns=list(dict.fromkeys(cols_to_drop + to_drop_columns)))

# Fill missing values with the column mean. `numeric_only=True` restricts the
# mean to numeric columns; without it pandas >= 2.0 raises TypeError if any
# remaining column holds non-numeric values.
data_df = data_df.fillna(data_df.mean(numeric_only=True))

# Categorical columns that are embedded using nn.Embedding
cat_cols = [
    "Breed1",
    "Breed2",
    "Gender",
    "Color1",
    "Color2",
    "Color3",
    "State",
    "Breed_full",
    "Color_full",
    "hard_interaction",
]

# Preview the prepared frame; a bare expression is only displayed when it is
# the last statement of the cell.
data_df.head()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column; each column of `data_df` is
# overwritten with the integer codes produced by its encoder.
label_encoders = {name: LabelEncoder() for name in cat_cols}
for cat_col, encoder in label_encoders.items():
    data_df[cat_col] = encoder.fit_transform(data_df[cat_col])

# Normalize the continuous variables (currently disabled)
# cont_cols = data_df.columns.difference(cat_cols + ["AdoptionSpeed"])
# data_df[cont_cols] = data_df[cont_cols].apply(
#     lambda x: (x - x.mean()) / x.std(), axis=0
# )

# Number of distinct values per categorical column, in `data_df` column order.
emb_c = {
    name: len(data_df[name].unique()) for name in data_df.columns if name in cat_cols
}
emb_cols = emb_c.keys()  # names of the columns chosen for embedding
# (cardinality, embedding width) pairs; the width is half the cardinality
# rounded up, capped at 20.
emb_szs = [(n_levels, min(20, (n_levels + 1) // 2)) for n_levels in emb_c.values()]

# 80/20 train/validation split, stratified on AdoptionSpeed.
train_df, valid_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
    stratify=data_df["AdoptionSpeed"],
)

X_train, y_train = train_df.drop(columns="AdoptionSpeed"), train_df["AdoptionSpeed"]
X_valid, y_valid = valid_df.drop(columns="AdoptionSpeed"), valid_df["AdoptionSpeed"]

# Every feature column that is not embedded counts as continuous.
n_cont = X_train.shape[1] - len(emb_cols)


In [None]:
# The notebook's only `from network_setting import *` sits in a later cell, so
# on a fresh kernel (Restart & Run All) the names used here would be undefined.
# Importing here makes the cell self-sufficient.
# NOTE(review): assumes PetFinderModel, get_default_device and to_device are
# provided by network_setting — confirm.
from network_setting import *

# Build the network from the embedding sizes and the number of continuous
# features, then move it to the device reported by get_default_device().
model = PetFinderModel(emb_szs, n_cont)
device = get_default_device()
to_device(model, device)

In [None]:


# This cell runs before the notebook's later `from network_setting import *`
# cell, so bring everything it uses into scope here; otherwise a fresh kernel
# fails with NameError. The star import comes last so that any names it
# provides keep precedence, as they would after running the later import cell.
# NOTE(review): assumes PetFinderData, DeviceDataLoader and train_loop are
# provided by network_setting — confirm.
import numpy as np
import torch
from torch.utils.data import DataLoader

from network_setting import *

train_ds = PetFinderData(X_train, y_train, emb_cols)
valid_ds = PetFinderData(X_valid, y_valid, emb_cols)

# Wrap the loaders so batches are delivered through DeviceDataLoader for `device`.
batch_size = 512
train_dl = DeviceDataLoader(
    DataLoader(train_ds, batch_size=batch_size, shuffle=True), device
)
# Validation data is only evaluated, so it is iterated in a fixed order;
# shuffling it adds randomness without any benefit.
valid_dl = DeviceDataLoader(
    DataLoader(valid_ds, batch_size=batch_size, shuffle=False), device
)

# Train model
epochs = 5000
history = train_loop(
    model, epochs=epochs, lr=0.00005, wd=0.0001, train_dl=train_dl, valid_dl=valid_dl
)

# Save model weights
torch.save(model.state_dict(), "./model-stratify.pt")

# Save history as an array so it can be reloaded and column-sliced for plots
history = np.array(history)
np.save("./history.npy", history)

import matplotlib.pyplot as plt


def _plot_history(history, columns, ylabel, title, out_file):
    """Plot selected columns of `history` against the epoch index and save the figure.

    Args:
        history: 2-D array indexed as ``history[:, col]``; each row is plotted
            as one epoch.
        columns: mapping of legend label -> column index in `history`.
        ylabel: label for the y-axis.
        title: figure title.
        out_file: path the figure is written to.
    """
    fig, ax = plt.subplots()
    # Derive the x-axis from the recorded history rather than the requested
    # epoch count, so the plot still works if the two ever differ.
    epoch_axis = range(len(history))
    for label, col in columns.items():
        ax.plot(epoch_axis, history[:, col], label=label)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    fig.savefig(out_file)


# NOTE(review): "loss-statify.png" looks like a typo for "loss-stratify.png"
# (compare the kappa figure below); the name is left unchanged in case other
# material already references it.
_plot_history(
    history,
    {"train_loss": 0, "val_loss": 2},
    ylabel="Loss",
    title="Loss vs Epochs",
    out_file=path + "/figure/loss-statify.png",
)

_plot_history(
    history,
    {"train_kappa": 1, "val_kappa": 3},
    ylabel="Kappa",
    title="Quadratic Weighted Kappa vs Epochs",
    out_file=path + "/figure/kappa-stratify.png",
)

## 1. Neural network
Now we have the data needed to estimate $p(t|x,y)$, where $t$ is the type of the pet, $y$ is the adoption speed, and $x$ comprises the remaining columns of `data_df`. We use a neural network to model $p(t|x,y)$.

In [5]:
from network_setting import *

In [6]:
from sklearn.preprocessing import LabelEncoder

# Columns treated as categorical and embedded using nn.Embedding
cat_cols = [
    "Breed1",
    "Breed2",
    "Gender",
    "Color1",
    "Color2",
    "Color3",
    "State",
    "Breed_full",
    "Color_full",
    "hard_interaction",
]

# Fit a fresh LabelEncoder per categorical column and overwrite the column
# with its integer codes.
# NOTE(review): if an earlier cell has already label-encoded `data_df`, the
# encoders fitted here see those integer codes rather than the raw values.
label_encoders = {name: LabelEncoder() for name in cat_cols}
for cat_col, encoder in label_encoders.items():
    data_df[cat_col] = encoder.fit_transform(data_df[cat_col])

# Number of distinct values per categorical column, in `data_df` column order.
emb_c = {
    name: len(data_df[name].unique()) for name in data_df.columns if name in cat_cols
}
emb_cols = emb_c.keys()  # names of the columns chosen for embedding
# (cardinality, embedding width) pairs; the width is half the cardinality
# rounded up, capped at 30.
emb_szs = [(n_levels, min(30, (n_levels + 1) // 2)) for n_levels in emb_c.values()]


In [7]:
# Split data into train and validation: the first 4/5 of the rows of
# `data_df` (in their current order, no shuffling or stratification) are used
# for training and the remaining rows for validation.
split_idx = len(data_df) * 4 // 5
train_df = data_df.iloc[:split_idx, :]
valid_df = data_df.iloc[split_idx:, :]

X_train = train_df.drop(columns='AdoptionSpeed')
y_train = train_df['AdoptionSpeed']
X_valid = valid_df.drop(columns='AdoptionSpeed')
y_valid = valid_df['AdoptionSpeed']

n_cont = len(X_train.columns) - len(emb_cols)  # number of continuous columns

# Display the split sizes; a bare expression is only shown when it is the last
# statement of the cell.
train_df.shape, valid_df.shape

In [15]:
# Pick the compute device, build the network from the embedding sizes and the
# number of continuous features, then move the network onto that device.
# The final expression is left bare so its result is shown as the cell output.
device = get_default_device()
model = PetFinderModel(emb_szs, n_cont)
to_device(model, device)

PetFinderModel(
  (embeddings): ModuleList(
    (0): Embedding(176, 30)
    (1): Embedding(135, 30)
    (2): Embedding(3, 2)
    (3-4): 2 x Embedding(7, 4)
    (5): Embedding(6, 3)
    (6): Embedding(14, 7)
    (7): Embedding(812, 30)
    (8): Embedding(63, 30)
    (9): Embedding(142, 30)
  )
  (lin1): Linear(in_features=328, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=30, bias=True)
  (lin4): Linear(in_features=30, out_features=1, bias=True)
  (bn1): SELU()
  (bn2): SELU()
  (bn3): SELU()
  (bn4): SELU()
  (emb_drop): Dropout(p=0.2, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [16]:
# Wrap the train/validation frames in PetFinderData datasets; `emb_cols` names
# the columns chosen for embedding.
train_ds = PetFinderData(X_train, y_train, emb_cols)
valid_ds = PetFinderData(X_valid, y_valid, emb_cols)

# Batch the datasets and wrap the loaders in DeviceDataLoader for `device`.
batch_size = 512
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Validation data is only evaluated, so it is iterated in a fixed order;
# shuffling it adds randomness without any benefit.
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)



In [17]:
# Short 5-epoch training run with learning rate 0.005 and weight decay 1e-4.
train_loop(
    model,
    epochs=5,
    lr=0.005,
    wd=0.0001,
    train_dl=train_dl,
    valid_dl=valid_dl,
)

training loss: 3.647
valid loss 3.659 and accuracy 0.279
training loss: 3.569
valid loss 3.659 and accuracy 0.279
training loss: 3.570
valid loss 3.659 and accuracy 0.279
training loss: 3.570
valid loss 3.659 and accuracy 0.279
training loss: 3.569
valid loss 3.659 and accuracy 0.279


In [18]:
# Persist the trained weights (state dict only) to the working directory.
torch.save(model.state_dict(), "./model.pt")