In [72]:
import torch, numpy as np, pandas as pd
from torch import tensor
import torch.nn.functional as F
np.set_printoptions(linewidth=140)
torch.set_printoptions(linewidth=140, sci_mode=False, edgeitems=7)
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', 1000)

In [12]:
# Loading data sets
training_data = pd.read_csv('Cats Training Data.csv')
validation_data = pd.read_csv('Cats Validation Data.csv')
test_data = pd.read_csv('Cats Testing Data.csv')

In [13]:
# concatenating the training & validation set so the one hot encoding will not be different if the validation set happens to not have some of the same categories

fused_data = pd.concat([training_data, validation_data], axis = 0, ignore_index = True)

In [14]:
validation_start_index = training_data.shape[0]

In [15]:
# Making sure that I have the index of the start of the validation data recorded correctly
validation_data.iloc[0]

Outcome Type                                    1
Intake Age                              -0.542123
Outcome Age                             -0.490138
Duration of Stay                         1.276812
Birth Year                               0.499208
Birth Month                             -1.067045
Intake Year                              0.247798
Intake Month                             1.592974
Outcome Year                             0.232558
Outcome Month                            0.964325
Sex                                        Female
Breed                          Domestic Shorthair
Intake Type                                 Stray
Intake Condition                 Nursing Juvenile
Intake Reproductive Status                 Intact
Outcome Reproductive Status               Altered
Breed2                               Not Provided
Purebred?                                   False
Coat Length                                 short
color1                                     Calico


In [16]:
fused_data.iloc[validation_start_index]

Outcome Type                                    1
Intake Age                              -0.542123
Outcome Age                             -0.490138
Duration of Stay                         1.276812
Birth Year                               0.499208
Birth Month                             -1.067045
Intake Year                              0.247798
Intake Month                             1.592974
Outcome Year                             0.232558
Outcome Month                            0.964325
Sex                                        Female
Breed                          Domestic Shorthair
Intake Type                                 Stray
Intake Condition                 Nursing Juvenile
Intake Reproductive Status                 Intact
Outcome Reproductive Status               Altered
Breed2                               Not Provided
Purebred?                                   False
Coat Length                                 short
color1                                     Calico


In [17]:
# One Hot Encoding for the catgeorical variables
# Because there are so many, using embedding might be a good upgrade (fastai does by default w/ tabular pandas)

fused_data = pd.get_dummies(fused_data, columns = 
        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', 'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', 'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

#training_data = pd.get_dummies(training_data, columns = 
#        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake #Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', #'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', #'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

#validation_data = pd.get_dummies(validation_data, columns = 
#        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake #Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', #'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', #'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

In [20]:
# Converting the dataframes to tensors (with extra dimension to allow for matrix multiplication and separating the targets from the features

training_target = tensor(fused_data.iloc[0:validation_start_index-1, 0].values, dtype = torch.float)
training_target = training_target[:,None]
training_features = tensor(fused_data.iloc[0:validation_start_index-1, 1:].values.astype(np.float64), dtype = torch.float)

validation_target = tensor(fused_data.iloc[validation_start_index:, 0].values, dtype = torch.float)
validation_target = validation_target[:,None]
validation_features = tensor(fused_data.iloc[validation_start_index:, 1:].values.astype(np.float64), dtype = torch.float)

In [21]:
# Making sure the size is as expected; it is. 
training_target.shape

torch.Size([53197, 1])

In [89]:
n_coeff = training_features.shape[1]

def init_coeffs():
    hiddens = [10, 10] 
    sizes = [n_coeff] + hiddens + [1]
    n = len(sizes)
    layers = [(torch.rand(sizes[i], sizes[i+1])-0.3)/sizes[i+1]*4 for i in range(n-1)]
    consts = [(torch.rand(1)[0]-0.5)*0.1 for i in range(n-1)]
    for l in layers+consts: l.requires_grad_()
    return layers,consts

def calc_preds(coeffs, features): 
    layers,consts = coeffs
    n = len(layers)
    res = features
    for i,l in enumerate(layers):
        res = res@l + consts[i]
        if i!=n-1: res = F.relu(res)
    return torch.sigmoid(res)

def calc_loss(coeffs, features, targets): return torch.abs(calc_preds(coeffs, features)-targets).mean()

def update_coeffs(coeffs, lr):
    layers,consts = coeffs
    for layer in layers+consts:
        layer.sub_(layer.grad * lr)
        layer.grad.zero_()

def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, training_features, training_target)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

def train_model(epochs=30, lr=0.01):
    torch.manual_seed(1004)
    coeffs = init_coeffs()
    for i in range(epochs): one_epoch(coeffs, lr=lr)
    return coeffs

def show_coeffs(): return dict(zip(fused_data.iloc[:, 1:].columns, coeffs))

def acc(coeffs): return (validation_target.bool()==(calc_preds(coeffs, validation_features)>0.5)).float().mean()

In [96]:
coeffs = train_model(1000, lr=0.1)

0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.500; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.499; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.498; 0.497; 0.497; 0.497; 0.497; 0.497; 0.497; 0.496; 0.496; 0.496; 0.496; 0.495; 0.495; 0.495; 0.494;

In [97]:
acc(coeffs)

tensor(0.9073)

In [85]:
show_coeffs()

{'Intake Age': tensor([[    -1.2477,     -1.7936,      0.3295,      0.7766,     -0.4428,     -0.6488,     -0.4361,  ...,     -0.5864,     -0.0173,
               0.2356,     -0.2330,     -0.1476,     -0.3462,      0.1011],
         [    -1.0016,     -1.7442,      0.3005,      0.6924,     -0.2996,     -0.6117,     -0.4456,  ...,     -0.6097,      0.0048,
              -0.1030,     -0.1999,      0.0563,     -0.3633,      0.0369],
         [     2.0246,      0.3735,     -4.2920,      2.1045,      1.1816,      0.5732,      0.1306,  ...,      0.9959,      0.0243,
               3.8689,      0.0793,     -4.7518,      0.2797,      0.0774],
         [     0.3038,      0.9051,     -0.0211,     -0.0358,      0.1385,      0.7228,      0.3447,  ...,      0.5660,      0.0114,
               0.3281,      0.0919,     -0.6960,      0.3460,      0.0580],
         [    -0.3212,     -0.0733,      0.4014,      0.6981,      0.5987,     -0.0623,      0.0064,  ...,     -0.3908,      0.0064,
              -0.