In [11]:
import torch, numpy as np, pandas as pd
from torch import tensor
np.set_printoptions(linewidth=140)
torch.set_printoptions(linewidth=140, sci_mode=False, edgeitems=7)
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', 1000)

In [12]:
# Loading data sets
training_data = pd.read_csv('Cats Training Data.csv')
validation_data = pd.read_csv('Cats Validation Data.csv')
test_data = pd.read_csv('Cats Testing Data.csv')

In [13]:
# concatenating the training & validation set so the one hot encoding will not be different if the validation set happens to not have some of the same categories

fused_data = pd.concat([training_data, validation_data], axis = 0, ignore_index = True)

In [14]:
validation_start_index = training_data.shape[0]

In [15]:
# Making sure that I have the index of the start of the validation data recorded correctly
validation_data.iloc[0]

Outcome Type                                    1
Intake Age                              -0.542123
Outcome Age                             -0.490138
Duration of Stay                         1.276812
Birth Year                               0.499208
Birth Month                             -1.067045
Intake Year                              0.247798
Intake Month                             1.592974
Outcome Year                             0.232558
Outcome Month                            0.964325
Sex                                        Female
Breed                          Domestic Shorthair
Intake Type                                 Stray
Intake Condition                 Nursing Juvenile
Intake Reproductive Status                 Intact
Outcome Reproductive Status               Altered
Breed2                               Not Provided
Purebred?                                   False
Coat Length                                 short
color1                                     Calico


In [16]:
fused_data.iloc[validation_start_index]

Outcome Type                                    1
Intake Age                              -0.542123
Outcome Age                             -0.490138
Duration of Stay                         1.276812
Birth Year                               0.499208
Birth Month                             -1.067045
Intake Year                              0.247798
Intake Month                             1.592974
Outcome Year                             0.232558
Outcome Month                            0.964325
Sex                                        Female
Breed                          Domestic Shorthair
Intake Type                                 Stray
Intake Condition                 Nursing Juvenile
Intake Reproductive Status                 Intact
Outcome Reproductive Status               Altered
Breed2                               Not Provided
Purebred?                                   False
Coat Length                                 short
color1                                     Calico


In [17]:
# One Hot Encoding for the catgeorical variables
# Because there are so many, using embedding might be a good upgrade (fastai does by default w/ tabular pandas)

fused_data = pd.get_dummies(fused_data, columns = 
        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', 'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', 'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

#training_data = pd.get_dummies(training_data, columns = 
#        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake #Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', #'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', #'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

#validation_data = pd.get_dummies(validation_data, columns = 
#        ['Sex', 'Breed', 'Intake Type', 'Intake Condition', 'Intake #Reproductive Status', 'Outcome Reproductive Status', 'Breed2', 'Purebred?', #'Coat Length', 'color1', 'color2', 'color3', 'pattern1', 'pattern2', #'pattern3', 'Weekend Intake?', 'Weekend Outcome?'])

In [18]:
print(fused_data.columns)

Index(['Outcome Type', 'Intake Age', 'Outcome Age', 'Duration of Stay', 'Birth Year', 'Birth Month', 'Intake Year', 'Intake Month', 'Outcome Year', 'Outcome Month',
       ...
       'pattern2_Tabby', 'pattern2_Tortie', 'pattern2_with White Markings', 'pattern3_Empty Pattern', 'pattern3_Tabby', 'pattern3_with White Markings', 'Weekend Intake?_False', 'Weekend Intake?_True', 'Weekend Outcome?_False', 'Weekend Outcome?_True'], dtype='object', length=146)


In [20]:
# Converting the dataframes to tensors (with extra dimension to allow for matrix multiplication and separating the targets from the features

training_target = tensor(fused_data.iloc[0:validation_start_index-1, 0].values, dtype = torch.float)
training_target = training_target[:,None]
training_features = tensor(fused_data.iloc[0:validation_start_index-1, 1:].values.astype(np.float64), dtype = torch.float)

validation_target = tensor(fused_data.iloc[validation_start_index:, 0].values, dtype = torch.float)
validation_target = validation_target[:,None]
validation_features = tensor(fused_data.iloc[validation_start_index:, 1:].values.astype(np.float64), dtype = torch.float)

In [21]:
# Making sure the size is as expected; it is. 
training_target.shape

torch.Size([53197, 1])

In [34]:
torch.manual_seed(1004)

n_coeff = training_features.shape[1]

def calc_preds(coeffs, features): return torch.sigmoid(features@coeffs)

def calc_loss(coeffs, features, targets): return torch.abs(calc_preds(coeffs, features)-targets).mean()

def update_coeffs(coeffs, lr):
    coeffs.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

def one_epoch(coeffs, lr):
    loss = calc_loss(coeffs, training_features, training_target)
    loss.backward()
    with torch.no_grad(): update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end="; ")

def init_coeffs(): return (torch.rand(n_coeff, 1)*0.1).requires_grad_()

def train_model(epochs=30, lr=0.01):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for i in range(epochs): one_epoch(coeffs, lr=lr)
    return coeffs

def show_coeffs(): return dict(zip(training_data.iloc[:, 1:].columns, coeffs.requires_grad_(False)))

def acc(coeffs): return (validation_target.bool()==(calc_preds(coeffs, validation_features)>0.5)).float().mean()

In [35]:
coeffs = train_model(500, lr=0.1)

0.505; 0.501; 0.497; 0.493; 0.489; 0.485; 0.480; 0.476; 0.472; 0.468; 0.463; 0.459; 0.455; 0.451; 0.447; 0.443; 0.439; 0.435; 0.431; 0.428; 0.424; 0.421; 0.418; 0.415; 0.412; 0.409; 0.406; 0.403; 0.401; 0.398; 0.396; 0.393; 0.391; 0.389; 0.387; 0.385; 0.383; 0.381; 0.379; 0.377; 0.375; 0.373; 0.371; 0.370; 0.368; 0.366; 0.365; 0.363; 0.361; 0.360; 0.358; 0.357; 0.356; 0.354; 0.353; 0.351; 0.350; 0.349; 0.347; 0.346; 0.345; 0.344; 0.342; 0.341; 0.340; 0.339; 0.338; 0.336; 0.335; 0.334; 0.333; 0.332; 0.331; 0.330; 0.329; 0.328; 0.327; 0.326; 0.325; 0.324; 0.323; 0.322; 0.321; 0.320; 0.319; 0.318; 0.317; 0.316; 0.315; 0.314; 0.314; 0.313; 0.312; 0.311; 0.310; 0.309; 0.309; 0.308; 0.307; 0.306; 0.305; 0.304; 0.304; 0.303; 0.302; 0.301; 0.301; 0.300; 0.299; 0.298; 0.298; 0.297; 0.296; 0.296; 0.295; 0.294; 0.293; 0.293; 0.292; 0.291; 0.291; 0.290; 0.289; 0.289; 0.288; 0.287; 0.287; 0.286; 0.285; 0.285; 0.284; 0.284; 0.283; 0.282; 0.282; 0.281; 0.281; 0.280; 0.279; 0.279; 0.278; 0.278; 0.277;

In [37]:
acc(coeffs)

tensor(0.8640)

In [36]:
show_coeffs()

{'Intake Age': tensor([-0.6706]),
 'Outcome Age': tensor([-0.5609]),
 'Duration of Stay': tensor([1.6288]),
 'Birth Year': tensor([0.4490]),
 'Birth Month': tensor([-0.0660]),
 'Intake Year': tensor([0.0392]),
 'Intake Month': tensor([-0.0015]),
 'Outcome Year': tensor([0.1405]),
 'Outcome Month': tensor([-0.1923]),
 'Sex': tensor([0.2682]),
 'Breed': tensor([0.0081]),
 'Intake Type': tensor([-0.2127]),
 'Intake Condition': tensor([0.0677]),
 'Intake Reproductive Status': tensor([0.0747]),
 'Outcome Reproductive Status': tensor([0.0129]),
 'Breed2': tensor([0.0501]),
 'Purebred?': tensor([0.0318]),
 'Coat Length': tensor([0.0268]),
 'color1': tensor([0.0560]),
 'color2': tensor([0.0586]),
 'color3': tensor([0.0452]),
 'pattern1': tensor([0.0821]),
 'pattern2': tensor([0.0349]),
 'pattern3': tensor([0.0250]),
 'Weekend Intake?': tensor([0.0836]),
 'Weekend Outcome?': tensor([0.1291])}