In [2]:
#Not sure exactly which packages I will likely need, but probably most of these
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans

## Import and initial description of intake data

In [3]:
intake_data_raw = pd.read_csv('Austin_Animal_Center_Intakes_20240701.csv')

In [4]:
#dropping columns of variables not likely to be useful
data_after_drops = intake_data_raw.drop(['Name', 'DateTime', 'Found Location'], axis = 1)
data_after_drops.head()

Unnamed: 0,Animal ID,MonthYear,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,January 2019,Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,July 2015,Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver
2,A724273,April 2016,Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White
3,A857105,May 2022,Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby
4,A682524,June 2014,Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray


In [5]:
#assessing null data within each variable
data_after_drops.isnull().sum()

Animal ID           0
MonthYear           0
Intake Type         0
Intake Condition    0
Animal Type         0
Sex upon Intake     2
Age upon Intake     1
Breed               0
Color               0
dtype: int64

In [6]:
def drop_null_data(df):
    #identifies null data in provided data frame, drops it, and reports columns in which the data were null

    #generates a Series with index = df's columns, values = number of nulls per column
    drops = df.isnull().sum()

    #if there are no nulls in the df, print & return original df
    if drops.sum() == 0:
        print("No null values identified")
            
    #if there are nulls in the df, drop them all and print which columns contained nulls and how many were dropped; returns df without nulls    
    else:
        df = df.dropna(axis = 0)
        for index, value in drops.items():
            if value != 0:
                print('"{}" contained {} null value(s)'.format(index, value))
    return df

In [7]:
intake_data_after_drops = drop_null_data(data_after_drops)

"Sex upon Intake" contained 2 null value(s)
"Age upon Intake" contained 1 null value(s)


In [8]:
intake_data_after_drops['Age upon Intake'].unique()

array(['2 years', '8 years', '11 months', '4 years', '4 months',
       '6 years', '6 months', '4 weeks', '5 months', '14 years',
       '1 month', '2 months', '18 years', '1 year', '3 years', '4 days',
       '9 years', '2 weeks', '15 years', '1 day', '5 years', '3 weeks',
       '9 months', '8 months', '6 days', '7 years', '12 years', '1 week',
       '10 years', '7 months', '3 months', '10 months', '1 weeks',
       '5 days', '2 days', '0 years', '11 years', '17 years', '3 days',
       '13 years', '5 weeks', '19 years', '16 years', '20 years',
       '-1 years', '22 years', '28 years', '23 years', '30 years',
       '-2 years', '21 years', '-3 years', '25 years', '24 years',
       '-4 years'], dtype=object)

In [9]:
import re

year_pattern = re.compile(r"(-?\d+)\s*(?:year|years)")
month_pattern = re.compile(r"(-?\d+)\s*(?:month|months)")
week_pattern = re.compile(r"(-?\d+)\s*(?:week|weeks)")
day_pattern = re.compile(r"(-?\d+)\s*(?:day|days)")

# Function to convert strings to days
def convert_to_days(age_str):
    if "year" in age_str:
        value = re.search(year_pattern, age_str).group(1)
        return int(value) * 365
    elif "month" in age_str:
        value = re.search(month_pattern, age_str).group(1)
        return int(value) * 30 
    elif "week" in age_str:
        value = re.search(week_pattern, age_str).group(1)
        return int(value) * 7
    elif "day" in age_str:
        value = re.search(day_pattern, age_str).group(1)
        return int(value)
    else:
        print(f"Unknown pattern: {age_str}; -1 days reported")
        return int(-1)

In [10]:
intake_data_with_negative_ages = intake_data_after_drops

In [11]:
intake_data_with_negative_ages['Age upon Intake'] = intake_data_with_negative_ages['Age upon Intake'].apply(convert_to_days)
intake_data_with_negative_ages.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intake_data_with_negative_ages['Age upon Intake'] = intake_data_with_negative_ages['Age upon Intake'].apply(convert_to_days)


Unnamed: 0,Animal ID,MonthYear,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,January 2019,Stray,Normal,Dog,Neutered Male,730,Beagle Mix,Tricolor
1,A706918,July 2015,Stray,Normal,Dog,Spayed Female,2920,English Springer Spaniel,White/Liver
2,A724273,April 2016,Stray,Normal,Dog,Intact Male,330,Basenji Mix,Sable/White
3,A857105,May 2022,Public Assist,Normal,Cat,Neutered Male,730,Domestic Shorthair,Orange Tabby
4,A682524,June 2014,Stray,Normal,Dog,Neutered Male,1460,Doberman Pinsch/Australian Cattle Dog,Tan/Gray


In [15]:
# Dropping rows with negative ages
#negative_aged_pets = intake_data_with_negative_ages.loc[intake_data_with_negative_ages['Age upon Intake'] < 0]
pre = len(intake_data_with_negative_ages)
intake_data_without_negative_ages = intake_data_with_negative_ages.loc[intake_data_with_negative_ages['Age upon Intake'] >= 0]
post= len(intake_data_with_negative_ages)
print(f"Dropped rows due to negative ages: {pre - len(intake_data_with_negative_ages)}")

Dropped rows due to negative ages: 0


In [17]:
intake_data_without_negative_ages.head()

Unnamed: 0,Animal ID,MonthYear,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,January 2019,Stray,Normal,Dog,Neutered Male,730,Beagle Mix,Tricolor
1,A706918,July 2015,Stray,Normal,Dog,Spayed Female,2920,English Springer Spaniel,White/Liver
2,A724273,April 2016,Stray,Normal,Dog,Intact Male,330,Basenji Mix,Sable/White
3,A857105,May 2022,Public Assist,Normal,Cat,Neutered Male,730,Domestic Shorthair,Orange Tabby
4,A682524,June 2014,Stray,Normal,Dog,Neutered Male,1460,Doberman Pinsch/Australian Cattle Dog,Tan/Gray


In [18]:
intake_data_without_negative_ages

Unnamed: 0,Animal ID,MonthYear,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,January 2019,Stray,Normal,Dog,Neutered Male,730,Beagle Mix,Tricolor
1,A706918,July 2015,Stray,Normal,Dog,Spayed Female,2920,English Springer Spaniel,White/Liver
2,A724273,April 2016,Stray,Normal,Dog,Intact Male,330,Basenji Mix,Sable/White
3,A857105,May 2022,Public Assist,Normal,Cat,Neutered Male,730,Domestic Shorthair,Orange Tabby
4,A682524,June 2014,Stray,Normal,Dog,Neutered Male,1460,Doberman Pinsch/Australian Cattle Dog,Tan/Gray
...,...,...,...,...,...,...,...,...,...
164609,A775309,June 2018,Owner Surrender,Normal,Cat,Spayed Female,730,Turkish Angora,White
164610,A760677,October 2017,Stray,Normal,Cat,Intact Male,730,Domestic Shorthair Mix,Blue/White
164611,A811255,December 2019,Stray,Injured,Cat,Intact Female,150,Domestic Shorthair,Black
164612,A763156,December 2017,Stray,Normal,Dog,Intact Male,730,Miniature Poodle/Cocker Spaniel,White


In [19]:
intake_data_without_negative_ages['Intake Condition'].value_counts()

Intake Condition
Normal        140107
Injured         9991
Sick            7426
Nursing         3878
Neonatal        1476
Aged             511
Medical          394
Other            346
Pregnant         148
Feral            140
Behavior          70
Med Attn          55
Unknown           19
Med Urgent        13
Neurologic        11
Parvo              5
Space              4
Agonal             3
Congenital         1
Panleuk            1
Name: count, dtype: int64

In [35]:
#intake_data = intake_data_without_negative_ages[(intake_data_without_negative_ages['Intake Condition'] != 'Aged') |
#                          (intake_data_without_negative_ages['Age upon Intake'] > 365)]
intake_data_without_negative_ages[(intake_data_without_negative_ages['Intake Condition'] == 'Neonatal') &
                                 (intake_data_without_negative_ages['Age upon Intake'] > 30)].sort_values(by = 'Age upon Intake', ascending = False)

Unnamed: 0,Animal ID,MonthYear,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
1509,A901235,March 2024,Stray,Neonatal,Cat,Intact Female,730,Domestic Shorthair,Brown Tabby/White
91544,A843381,September 2021,Public Assist,Neonatal,Dog,Intact Male,730,Chihuahua Shorthair,Tan/White
163216,A908186,June 2024,Stray,Neonatal,Cat,Intact Female,730,Domestic Shorthair,Calico
158650,A885070,July 2023,Stray,Neonatal,Cat,Intact Female,730,Domestic Shorthair,Brown Tabby
157132,A887671,August 2023,Stray,Neonatal,Cat,Intact Female,730,Domestic Shorthair,Black/White
154251,A902471,April 2024,Stray,Neonatal,Cat,Intact Female,730,Domestic Shorthair,Brown Tabby
152378,A853757,March 2022,Wildlife,Neonatal,Other,Intact Male,730,Bat,Brown/Brown
146541,A827914,June 2021,Owner Surrender,Neonatal,Dog,Intact Female,730,Boxer,Brown/White
145075,A878564,April 2023,Wildlife,Neonatal,Other,Unknown,730,Raccoon,Black/Gray
141242,A843380,September 2021,Public Assist,Neonatal,Dog,Intact Female,730,Chihuahua Shorthair,Tan/White


## Import and initial description of outcome data

In [7]:
raw_outcome_data = pd.read_csv('Austin_Animal_Center_Outcomes_20240701.csv')

In [616]:
raw_outcome_data.head(100)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby
...,...,...,...,...,...,...,...,...,...,...,...,...
95,A801948,Boie,07/09/2021 01:24:00 PM,Jul 2021,02/12/2019,Adoption,,Dog,Neutered Male,2 years,Australian Kelpie Mix,Brown Brindle/White
96,A809230,*Moo,11/24/2019 04:36:00 PM,Nov 2019,11/21/2015,Adoption,,Dog,Neutered Male,4 years,Siberian Husky,Gray/White
97,A783412,*Yams,12/16/2018 12:45:00 PM,Dec 2018,10/30/2008,Adoption,,Cat,Neutered Male,10 years,Domestic Shorthair Mix,Black
98,A686459,*Lovey,09/21/2014 05:39:00 PM,Sep 2014,07/15/2014,Adoption,Foster,Cat,Spayed Female,2 months,Domestic Shorthair Mix,Tortie


In [617]:
outcome_data_after_drops = raw_outcome_data.drop(['Name', 'MonthYear'], axis = 1)
outcome_data_after_drops.head()

Unnamed: 0,Animal ID,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,05/08/2019 06:20:00 PM,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,07/18/2018 04:02:00 PM,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,08/16/2020 11:38:00 AM,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,02/13/2016 05:59:00 PM,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,03/18/2014 11:47:00 AM,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


In [618]:
outcome_data_after_drops.isnull().sum()

Animal ID               0
DateTime                0
Date of Birth           0
Outcome Type           39
Outcome Subtype     88751
Animal Type             0
Sex upon Outcome        2
Age upon Outcome       10
Breed                   0
Color                   0
dtype: int64

In [619]:
outcome_data_after_drops.loc[outcome_data_after_drops['Outcome Type'].isnull() == True]

Unnamed: 0,Animal ID,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
3729,A828272,02/02/2021 06:01:00 PM,01/12/2020,,,Dog,Neutered Male,1 year,Blue Lacy Mix,Blue/White
4890,A874518,02/14/2023 09:10:00 AM,02/13/2022,,,Bird,Unknown,1 year,Waxwing,Tan/Black
9848,A700839,01/12/2021 06:07:00 PM,04/20/2014,,,Dog,Neutered Male,6 years,Pit Bull Mix,Blue
15929,A841539,08/30/2021 06:10:00 PM,08/29/2019,,,Dog,Intact Male,2 years,German Shepherd,Black/Tan
21845,A827712,01/05/2021 05:47:00 PM,02/29/2020,,,Dog,Spayed Female,10 months,German Shepherd,Tan/Black
26293,A828543,01/27/2021 02:24:00 PM,07/20/2020,,,Dog,Spayed Female,6 months,Labrador Retriever/Border Collie,Black/White
28066,A679066,05/16/2014 12:00:00 AM,04/16/2014,,,Other,Unknown,4 weeks,Rabbit Sh,Brown
32164,A828570,01/27/2021 04:59:00 PM,09/20/2020,,,Dog,Neutered Male,4 months,American Bulldog Mix,Black/White
38121,A828974,02/03/2021 03:55:00 PM,01/30/2019,,Snr,Cat,Spayed Female,2 years,Domestic Shorthair,Black
48981,A671017,01/20/2014 08:27:00 AM,01/17/2013,,,Other,Unknown,1 year,Fox Mix,Brown/Black


In [620]:
outcome_data_after_drops['Outcome Type'].unique()

array(['Rto-Adopt', 'Adoption', 'Euthanasia', 'Transfer',
       'Return to Owner', 'Died', 'Disposal', 'Missing', 'Relocate', nan,
       'Stolen', 'Lost'], dtype=object)

In [621]:
outcome_data_after_drops.loc[outcome_data_after_drops['Outcome Subtype'] == 'Foster']

Unnamed: 0,Animal ID,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
6,A814515,05/06/2020 07:59:00 AM,03/01/2018,Adoption,Foster,Dog,Neutered Male,2 years,American Foxhound/Labrador Retriever,White/Brown
21,A822928,01/24/2022 11:49:00 AM,09/15/2010,Adoption,Foster,Cat,Spayed Female,11 years,Domestic Shorthair,Torbie
27,A765349,06/08/2018 01:04:00 PM,01/18/2009,Adoption,Foster,Dog,Neutered Male,9 years,Chihuahua Shorthair Mix,Tricolor
35,A812473,03/05/2020 04:15:00 PM,01/26/2010,Adoption,Foster,Dog,Neutered Male,10 years,Chihuahua Shorthair,Brown
38,A789298,04/23/2019 11:59:00 AM,02/15/2019,Adoption,Foster,Dog,Neutered Male,2 months,Labrador Retriever Mix,Brown/White
...,...,...,...,...,...,...,...,...,...,...
164076,A901535,06/01/2024 01:14:00 PM,04/03/2023,Adoption,Foster,Cat,Spayed Female,1 year,Bengal Mix,Brown Tabby
164077,A903054,06/01/2024 02:38:00 PM,06/01/2023,Adoption,Foster,Dog,Spayed Female,1 year,German Shepherd,Sable
164082,A761967,06/01/2024 03:12:00 PM,11/12/2013,Adoption,Foster,Dog,Neutered Male,10 years,Labrador Retriever Mix,Black/White
164095,A887625,06/02/2024 12:42:00 PM,08/24/2018,Adoption,Foster,Dog,Neutered Male,5 years,Pit Bull,Blue/White


In [53]:
outcome_data_after_drops.loc[(outcome_data_after_drops['Outcome Type'].isnull()) & (outcome_data_after_drops['Outcome Subtype'].isnull()==False)]

Unnamed: 0,Animal ID,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
38121,A828974,Feb 2021,01/30/2019,,Snr,Cat,Spayed Female,2 years,Domestic Shorthair,Black
156108,A890645,Dec 2023,09/11/2023,,Foster,Cat,Spayed Female,3 months,Siamese Mix,Lynx Point


In [463]:
lost_to_follow_up = outcome_data_after_drops.loc[outcome_data_after_drops['Outcome Type'].isnull()]

(0, 10)

In [92]:
drops = lost_to_follow_up.index
breaking_my_data = outcome_data_after_drops.drop(drops, axis = 0)
breaking_my_data.isnull().sum()

Animal ID               0
MonthYear               0
Date of Birth           0
Outcome Type            0
Outcome Subtype     88714
Animal Type             0
Sex upon Outcome        2
Age upon Outcome       10
Breed                   0
Color                   0
dtype: int64

In [93]:
outcome_data_after_drops = breaking_my_data

In [96]:
outcome_data_after_drops.loc[outcome_data_after_drops['Sex upon Outcome'].isnull()]

Unnamed: 0,Animal ID,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
51935,A830333,Oct 2022,10/12/2022,Return to Owner,,Dog,,,Kuvasz,Unknown
78121,A667395,Nov 2013,11/17/2006,Return to Owner,,Dog,,7 years,Dachshund,Brown Merle


In [102]:
drops = outcome_data_after_drops.loc[outcome_data_after_drops['Sex upon Outcome'].isnull()].index
drops
breaking_my_data = outcome_data_after_drops.drop(drops, axis = 0)
breaking_my_data.isnull().sum()

Animal ID               0
MonthYear               0
Date of Birth           0
Outcome Type            0
Outcome Subtype     88712
Animal Type             0
Sex upon Outcome        0
Age upon Outcome        9
Breed                   0
Color                   0
dtype: int64

In [197]:
outcome_data_after_drops = breaking_my_data

In [202]:
outcome_data_after_drops.isnull().sum()

Animal ID           0
MonthYear           0
Date of Birth       0
Outcome Type        0
Outcome Subtype     0
Animal Type         0
Sex upon Outcome    0
Age upon Outcome    0
Breed               0
Color               0
dtype: int64

In [203]:
outcome_data_no_nulls = outcome_data_after_drops
#&(outcome_data_after_drops['Outcome Type'] != "Euthanasia")]

In [268]:
outcome_data_no_nulls['Age upon Outcome'].unique()

array(['2 years', '1 year', '4 months', '6 days', '7 years', '2 months',
       '2 days', '3 weeks', '11 years', '9 months', '4 weeks', '2 weeks',
       '6 months', '3 months', '9 years', '10 years', '8 years',
       '3 years', '7 months', '6 years', '4 years', '1 month', '12 years',
       '5 years', '1 weeks', '5 months', '5 days', '15 years',
       '10 months', '4 days', '16 years', '1 day', '8 months', '13 years',
       '1 week', '14 years', '11 months', '3 days', '0 years', '5 weeks',
       '17 years', '18 years', '20 years', '22 years', '19 years',
       '28 years', '23 years', '30 years', '24 years', '25 years',
       '21 years', 'Unknown'], dtype=object)

In [250]:
breaking_my_data = outcome_data_no_nulls

In [266]:
breaking_my_data.loc[breaking_my_data['Age upon Outcome'] == "-4 years"]

Unnamed: 0,Animal ID,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


In [265]:
drops = breaking_my_data.loc[breaking_my_data['Age upon Outcome']=="-4 years"].index
#drops
breaking_my_data = breaking_my_data.drop(drops, axis = 0)

In [267]:
outcome_data_no_nulls = breaking_my_data