# ML Final Project

**Imports**

In [61]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import scipy as scp

### Data Cleaning and Preprocessing

In [62]:
# Load the training set from the working directory and preview the first rows.
TRAIN_CSV = "train.csv"
raw_data = pd.read_csv(TRAIN_CSV)
raw_data.head()

Unnamed: 0,Id,Name,Intake Time,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Outcome Time,Date of Birth,Outcome Type
0,A706918,Belle,07/05/2015 12:59:00 PM,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,07/05/2015 03:13:00 PM,07/05/2007,Return to Owner
1,A724273,Runster,04/14/2016 06:43:00 PM,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,04/17/2015,Return to Owner
2,A857105,Johnny Ringo,05/12/2022 12:23:00 AM,4404 Sarasota Drive in Austin (TX),Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby,05/12/2022 02:35:00 PM,05/12/2020,Transfer
3,A743852,Odin,02/18/2017 12:46:00 PM,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,2 years,Labrador Retriever Mix,Chocolate,02/21/2017 05:44:00 PM,02/18/2015,Return to Owner
4,A635072,Beowulf,04/16/2019 09:53:00 AM,415 East Mary Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Great Dane Mix,Black,04/18/2019 01:45:00 PM,06/03/2012,Return to Owner


In [63]:
# String parser

def parse_age(age_str):
    """Convert an age string such as "8 years" or "11 months" to a ``pd.Timedelta``.

    The string must be exactly ``"<integer> <unit>"`` where the unit is
    day(s), week(s), month(s) or year(s); unit matching is case-insensitive.
    Months and years are approximated as 30 and 365 days respectively.
    Negative counts (e.g. "-4 years") are passed through as negative
    timedeltas, not rejected.

    Parameters
    ----------
    age_str : str or missing value
        Raw age text.

    Returns
    -------
    pd.Timedelta or pd.NaT
        ``pd.NaT`` for missing values, non-string input, strings that are not
        exactly two whitespace-separated tokens, non-integer counts, unknown
        units, or counts too large to represent as a Timedelta.
    """
    # Approximate number of days represented by one of each unit.
    days_per_unit = {"day": 1, "week": 7, "month": 30, "year": 365}

    # Covers None / NaN as well as any other non-text value.
    if not isinstance(age_str, str):
        return pd.NaT

    parts = age_str.split()
    if len(parts) != 2:
        # Malformed text yields NaT instead of an unpacking error, so a
        # single bad row cannot abort a whole-column .apply().
        return pd.NaT
    count_text, unit = parts

    # Accept both singular and plural spellings, in any letter case.
    unit = unit.lower()
    if unit.endswith("s"):
        unit = unit[:-1]
    if unit not in days_per_unit:
        return pd.NaT

    try:
        return pd.Timedelta(days=int(count_text) * days_per_unit[unit])
    except (ValueError, OverflowError):
        # Non-integer count, or a value outside the Timedelta range.
        return pd.NaT

# Convert the raw text columns to typed values.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
raw_data["Intake_DateTime"] = pd.to_datetime(raw_data["Intake Time"], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')
raw_data["Age_DateTime"] = raw_data["Age upon Intake"].apply(parse_age)
raw_data["DOB_DateTime"] = pd.to_datetime(raw_data["Date of Birth"], format="%m/%d/%Y", errors='coerce')

# Keep the rows whose age could not be parsed for inspection, then drop them.
missing_row = raw_data[raw_data["Age_DateTime"].isna()]
# .copy() makes the filtered frame an independent object, so the column
# assignments below (and in later cells) do not trigger SettingWithCopyWarning.
raw_data = raw_data[raw_data["Age_DateTime"].notna()].copy()
data = raw_data  # NOTE: same object as raw_data; columns added to one appear on both
missing_intakes = data["Intake_DateTime"].isnull().sum()
missing_DOBS = data["DOB_DateTime"].isnull().sum()
data["Age_in_Days"] = data["Age_DateTime"].dt.days ## numeric value for training

## Author's note: no missing values were found (see missing_intakes / missing_DOBS);
## re-check these counts if the input file changes.



In [64]:
## First set of cleaning: raw text columns are replaced with calculable values
## or removed (name probably does not affect outcome).

# Parse the outcome timestamp; unparseable values become NaT.
outcome_format = "%m/%d/%Y %I:%M:%S %p"
data["Outcome_DateTime"] = pd.to_datetime(data["Outcome Time"], format=outcome_format, errors='coerce')

# Time between intake and outcome, in seconds.
stay = data["Outcome_DateTime"] - data["Intake_DateTime"]
data["Time_spent"] = stay.dt.total_seconds()

# Identifier / free-text columns and the raw strings already converted above.
replaced_or_unused = [
    "Id",
    "Name",
    "Intake Time",
    "Found Location",
    "Age upon Intake",
    "Outcome Time",
    "Date of Birth",
]
data = data.drop(columns=replaced_or_unused)
data.head()


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Outcome Type,Intake_DateTime,Age_DateTime,DOB_DateTime,Age_in_Days,Outcome_DateTime,Time_spent
0,Stray,Normal,Dog,Spayed Female,English Springer Spaniel,White/Liver,Return to Owner,2015-07-05 12:59:00,2920 days,2007-07-05,2920,2015-07-05 15:13:00,8040.0
1,Stray,Normal,Dog,Intact Male,Basenji Mix,Sable/White,Return to Owner,2016-04-14 18:43:00,330 days,2015-04-17,330,2016-04-21 17:17:00,599640.0
2,Public Assist,Normal,Cat,Neutered Male,Domestic Shorthair,Orange Tabby,Transfer,2022-05-12 00:23:00,730 days,2020-05-12,730,2022-05-12 14:35:00,51120.0
3,Owner Surrender,Normal,Dog,Neutered Male,Labrador Retriever Mix,Chocolate,Return to Owner,2017-02-18 12:46:00,730 days,2015-02-18,730,2017-02-21 17:44:00,277080.0
4,Public Assist,Normal,Dog,Neutered Male,Great Dane Mix,Black,Return to Owner,2019-04-16 09:53:00,2190 days,2012-06-03,2190,2019-04-18 13:45:00,186720.0


In [65]:
# Check how many distinct values each categorical column has, to decide what
# can be dropped and how the remaining columns should be encoded.
# Printed in order: Breed, Color, Intake Type, Intake Condition.
for column in ["Breed", "Color", "Intake Type", "Intake Condition"]:
    print(data[column].nunique())



2440
568
6
19


In [66]:
# One-hot encode every categorical column (candidate for PCA afterwards).
categorical_columns = [
    'Color',
    'Breed',
    "Animal Type",
    "Intake Type",
    "Intake Condition",
    "Sex upon Intake",
]
OHE_data = pd.get_dummies(data, columns=categorical_columns)
print(OHE_data.head())
print(OHE_data.shape)

      Outcome Type     Intake_DateTime Age_DateTime DOB_DateTime  Age_in_Days  \
0  Return to Owner 2015-07-05 12:59:00    2920 days   2007-07-05         2920   
1  Return to Owner 2016-04-14 18:43:00     330 days   2015-04-17          330   
2         Transfer 2022-05-12 00:23:00     730 days   2020-05-12          730   
3  Return to Owner 2017-02-18 12:46:00     730 days   2015-02-18          730   
4  Return to Owner 2019-04-16 09:53:00    2190 days   2012-06-03         2190   

     Outcome_DateTime  Time_spent  Color_Agouti  Color_Agouti/Brown Tabby  \
0 2015-07-05 15:13:00      8040.0         False                     False   
1 2016-04-21 17:17:00    599640.0         False                     False   
2 2022-05-12 14:35:00     51120.0         False                     False   
3 2017-02-21 17:44:00    277080.0         False                     False   
4 2019-04-18 13:45:00    186720.0         False                     False   

   Color_Agouti/Cream  ...  Intake Condition_Parvo

In [67]:
## Break the intake and outcome timestamps into numeric calendar parts.
## The same set of features is derived for each timestamp, intake first.
for stage in ("Intake", "Outcome"):
    stamp = OHE_data[f"{stage}_DateTime"].dt
    OHE_data[f"{stage}_Minute"] = stamp.minute
    OHE_data[f"{stage}_Hour"] = stamp.hour
    OHE_data[f"{stage}_Weekday"] = stamp.weekday  # Monday=0
    OHE_data[f"{stage}_Month"] = stamp.month
    OHE_data[f"{stage}_Year"] = stamp.year
    # Saturday (5) and Sunday (6)
    OHE_data[f"Is_Weekend_{stage}"] = OHE_data[f"{stage}_Weekday"].isin([5, 6])

## The source datetime / timedelta columns are now redundant.
redundant_columns = ["Outcome_DateTime", "Intake_DateTime", "Age_DateTime", "DOB_DateTime"]
unscaled_data = OHE_data.drop(columns=redundant_columns)

unscaled_data.head()

Unnamed: 0,Outcome Type,Age_in_Days,Time_spent,Color_Agouti,Color_Agouti/Brown Tabby,Color_Agouti/Cream,Color_Agouti/White,Color_Apricot,Color_Apricot/Brown,Color_Apricot/Tricolor,...,Intake_Weekday,Intake_Month,Intake_Year,Is_Weekend_Intake,Outcome_Minute,Outcome_Hour,Outcome_Weekday,Outcome_Month,Outcome_Year,Is_Weekend_Outcome
0,Return to Owner,2920,8040.0,False,False,False,False,False,False,False,...,6,7,2015,True,13,15,6,7,2015,True
1,Return to Owner,330,599640.0,False,False,False,False,False,False,False,...,3,4,2016,False,17,17,3,4,2016,False
2,Transfer,730,51120.0,False,False,False,False,False,False,False,...,3,5,2022,False,35,14,3,5,2022,False
3,Return to Owner,730,277080.0,False,False,False,False,False,False,False,...,5,2,2017,True,44,17,1,2,2017,False
4,Return to Owner,2190,186720.0,False,False,False,False,False,False,False,...,1,4,2019,False,45,13,3,4,2019,False


In [None]:
## Numeric Scaling

from sklearn.preprocessing import StandardScaler

## Inspect the distributions before standardizing.
print(unscaled_data.describe())

## Standardize every numeric column; boolean columns are not selected by 'number'.
## NOTE(review): the result is written back into `unscaled_data`, so after this
## cell that name refers to scaled values.
numeric_cols = unscaled_data.select_dtypes(include=['number']).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(unscaled_data[numeric_cols])
unscaled_data[numeric_cols] = scaled_values

         Age_in_Days    Time_spent  Intake_Minute    Intake_Hour  \
count  111156.000000  1.111560e+05  111156.000000  111156.000000   
mean      695.233807  1.874549e+06      28.953768      13.417422   
std      1046.295461  1.214591e+07      17.471838       3.005285   
min     -1460.000000 -2.929763e+08       0.000000       0.000000   
25%        30.000000  1.801800e+05      14.000000      11.000000   
50%       365.000000  5.289000e+05      29.000000      13.000000   
75%       730.000000  1.983240e+06      44.000000      16.000000   
max      8760.000000  3.090687e+08      59.000000      23.000000   

       Intake_Weekday   Intake_Month    Intake_Year  Outcome_Minute  \
count   111156.000000  111156.000000  111156.000000   111156.000000   
mean         2.798454       6.588938    2018.138562       28.018910   
std          1.947112       3.241228       3.102382       17.972116   
min          0.000000       1.000000    2013.000000        0.000000   
25%          1.000000       4.00