In [13]:
# In this notebook, I have a data set from the Austin Animal Center that I would like to use to answer some questions about the shelter. 
# Before I can start really analyzing the data, first the data sets need to be cleaned, and I would like to consolidate the two files 
# (intake and outcome) into one file that I can use for all my analysis. Because I don't yet have a specific question in mind, I will 
# focus mostly on getting rid of null values and sensitive information, and doing some logic checking to make sure the data is as trustworthy as possible. 

In [14]:
import pandas as pd

### Import and initial description of intake data

In [15]:
#read both the intake and outcome data files
intake_data_raw = pd.read_csv('Austin_Animal_Center_Intakes_20240701.csv')
outcome_data_raw = pd.read_csv('Austin_Animal_Center_Outcomes_20240701.csv')


In [16]:
intake_data_raw.head()
intake_length = len(intake_data_raw)
#intake_length #164614
#Name and Found Location are potentially sensitive information, so I will drop these. 
#Animal ID can be used to join the intake and outcome data sets, so I will keep it for now.
#MonthYear and DateTime are redundant, so I will drop MonthYear since DateTime should be easy to convert from str to timestamp or other numerical. 

In [17]:
outcome_length = len(outcome_data_raw)
outcome_data_raw.head()
#outcome_length #164228
#Name is again potentially sensitive. The Animal ID will be used for merging.
#MonthYear and DateTime are again redundant.
#The outcome data is shorter than the intake data, likely because some pets are actively in the shelter (so they have intake but no outcome yet)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


### Duplicate Detection & Removal

In [18]:
intake_data_raw['Animal ID'].value_counts()
#Animal ID is not unique because a single animal may go through the shelter system multiple times

Animal ID
A721033    33
A718223    14
A718877    12
A705625    11
A706536    11
           ..
A714773     1
A765646     1
A732323     1
A698657     1
A855904     1
Name: count, Length: 147862, dtype: int64

In [19]:
#Looking more closely at the most frequently intaken pet, we see these are not duplicate data, but actually 33 unique visits to the shelter. 
#The other pets with multiple entries of the same Animal ID tell similar stories. 
intake_data_raw.loc[(intake_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
118002,A721033,Lil Bit,01/09/2017 02:26:00 PM,January 2017,6210 E Ben White Blvd in Austin (TX),Stray,Injured,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
125351,A721033,Lil Bit,01/26/2017 06:55:00 AM,January 2017,901 W Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
48856,A721033,Lil Bit,01/30/2017 11:05:00 PM,January 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
110181,A721033,Lil Bit,02/06/2017 10:13:00 AM,February 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
30692,A721033,Lil Bit,02/12/2019 10:21:00 AM,February 2019,1936 East Oltorf Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
69901,A721033,Lil Bit,02/16/2019 10:30:00 AM,February 2019,1135 Airport Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
102945,A721033,Lil Bit,02/20/2016 10:44:00 AM,February 2016,2508 E Riverside Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
150333,A721033,Lil Bit,02/22/2018 10:28:00 AM,February 2018,6400 Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
5518,A721033,Lil Bit,02/24/2019 09:53:00 PM,February 2019,700 Allen St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
13100,A721033,Lil Bit,03/07/2018 08:27:00 AM,March 2018,4111 South 1St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [20]:
outcome_data_raw.loc[(outcome_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')
#Because these frequently returning animals do not represent the "typical" intake-outcome cycle, I considered dropping pets with more than 3-5 visits. 
#But ultimately, a pet being returned to its owner is considered a success. And I'm curious how many of these repeat offenders make up the "return to owner" outcome type. 
#So I will leave them in for now

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
10976,A721033,Lil Bit,01/10/2017 04:20:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
42029,A721033,Lil Bit,01/28/2017 03:22:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
83153,A721033,Lil Bit,02/02/2017 11:19:00 AM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
67815,A721033,Lil Bit,02/07/2017 05:26:00 PM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
136111,A721033,Lil Bit,02/12/2019 03:20:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
37003,A721033,Lil Bit,02/18/2019 04:46:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
74377,A721033,Lil Bit,02/20/2016 04:18:00 PM,Feb 2016,05/20/2015,Return to Owner,,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
90560,A721033,Lil Bit,02/23/2018 01:06:00 PM,Feb 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
51640,A721033,Lil Bit,02/26/2019 07:00:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
108849,A721033,Lil Bit,03/08/2018 03:04:00 PM,Mar 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [23]:
#To merge the intake and outcome datasets while keeping the multiple-visit pets in the set, I will group by Animal ID to assign each visit a count, combine the Animal ID with the count, and merge along this key
#First I want to sort the datasets by DateTime, which means my current DateTime str needs to be convert to a numerical. 
intake_data_raw['DateTime'] = pd.to_datetime(intake_data_raw['DateTime'], format = '%m/%d/%Y %I:%M:%S %p')
outcome_data_raw['DateTime'] = pd.to_datetime(outcome_data_raw['DateTime'], format = '%m/%d/%Y %I:%M:%S %p')

In [24]:
#Sorting the datasets by DateTime and dropping rows that have the same Animal ID & DateTime (ie. true duplicate entries)
intake_data_raw.sort_values(by = 'DateTime')
intake_data_raw['Duplicated?'] = intake_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

outcome_data_raw.sort_values(by = 'DateTime')
outcome_data_raw['Duplicated?'] = outcome_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

In [25]:
#Spotchecking that duplicates that were flagged are true duplicates and not repeat visits and checking the length of each to see how many were dropped. 
#It worked =)

#outcome_data_raw.loc[outcome_data_raw['Duplicated?'] == True] #25 count
#intake_data_raw.loc[intake_data_raw['Duplicated?'] == True] #37 count
#intake_data_raw.loc[intake_data_raw['Animal ID'] == 'A727043']

In [26]:
outcome_data_raw = outcome_data_raw.loc[outcome_data_raw['Duplicated?'] != True]
#len(outcome_data_raw) #164203 length

intake_data_raw = intake_data_raw.loc[intake_data_raw['Duplicated?'] != True]
#len(intake_data_raw) #164577 length

### Merging intake & outcome data sets

In [27]:
#Group both datasets by Animal ID & use the cumulative count to organize repeat visits 
intake_data_raw['Visit Count'] = intake_data_raw.groupby(by = 'Animal ID').cumcount() +1
outcome_data_raw['Visit Count'] = outcome_data_raw.groupby(by = 'Animal ID').cumcount() +1

In [28]:
# Merge the intake and outcome data sets based on Animal ID and Visit Count
merged_data_raw = pd.merge(intake_data_raw, outcome_data_raw, on = ['Animal ID', 'Visit Count'])
len(outcome_data_raw) - len(merged_data_raw)
#len(merged_data_raw) #163263
#I expected the outcome data length to be longer than the merged data length because some pets will not have both intake and outcome data, and so would not be included in the merged set

940

In [29]:
#When I examine the animals that did not make it into the merged set from the outcome data, most of them are from the time the shelter first started recording data (Oct 1, 2013). 
#Pets admitted prior to this date, would not have intake data to match with their outcome data, so they cannot be included. 
#There are a few pets from later on that are also missing intake data for some reason. None of these pets have multiple visits, so they will not mess up the merged data set. 

outcome_data_not_merged = outcome_data_raw[~outcome_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
outcome_data_not_merged.loc[outcome_data_not_merged['Visit Count'] > 1]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Duplicated?,Visit Count


In [30]:
#When I examine the animals that did not make it into the merged set from the intake data, almost all of them (1116/1189) are from 2024, so there is a good chance, these pets do not yet have outcomes.
#Of the smaller number of animals that predate 2024, none of them have outcome data. They can be considered "lost" and excluded from the data set

intake_data_not_merged = intake_data_raw[~intake_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] >= '2024'].sort_values(by = 'DateTime')
#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] < '2023'].sort_values(by = 'DateTime')
#intake_data_not_merged.loc[intake_data_not_merged['Visit Count'] > 1]

### Cleaning the merged data set - Nulls

In [31]:
merged_data_raw.columns
# Looking at the columns of the merged data set, there are several columns that should be identical (ex. Breed_x & Breed_y),
# some columns are redundant (monthyear), and some are not helpful (Duplicated?)

Index(['Animal ID', 'Name_x', 'DateTime_x', 'MonthYear_x', 'Found Location',
       'Intake Type', 'Intake Condition', 'Animal Type_x', 'Sex upon Intake',
       'Age upon Intake', 'Breed_x', 'Color_x', 'Duplicated?_x', 'Visit Count',
       'Name_y', 'DateTime_y', 'MonthYear_y', 'Date of Birth', 'Outcome Type',
       'Outcome Subtype', 'Animal Type_y', 'Sex upon Outcome',
       'Age upon Outcome', 'Breed_y', 'Color_y', 'Duplicated?_y'],
      dtype='object')

In [32]:
#First I will make sure that the columns that should be identical actually are.

# merged_data_raw.loc[(merged_data_raw['Breed_x'] == merged_data_raw['Breed_y']) == False] 
# merged_data_raw.loc[(merged_data_raw['Animal Type_x'] == merged_data_raw['Animal Type_y']) == False]
merged_data_raw.loc[(merged_data_raw['Color_x'] == merged_data_raw['Color_y']) == False]

Unnamed: 0,Animal ID,Name_x,DateTime_x,MonthYear_x,Found Location,Intake Type,Intake Condition,Animal Type_x,Sex upon Intake,Age upon Intake,...,MonthYear_y,Date of Birth,Outcome Type,Outcome Subtype,Animal Type_y,Sex upon Outcome,Age upon Outcome,Breed_y,Color_y,Duplicated?_y


In [33]:
# Since all of those duplicate columns matched like they should, I can drop the duplicates without issue. 
# I will also drop the sensitive, redundant, and not helpful data. 

merged_data = merged_data_raw.drop(['Name_x', 'MonthYear_x', 'Found Location', 'Duplicated?_x', 'Name_y', 'MonthYear_y', 'Animal Type_y', 'Breed_y', 'Color_y', 'Duplicated?_y'], axis = 1)
merged_data.head()

Unnamed: 0,Animal ID,DateTime_x,Intake Type,Intake Condition,Animal Type_x,Sex upon Intake,Age upon Intake,Breed_x,Color_x,Visit Count,DateTime_y,Date of Birth,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,2019-01-03 16:19:00,Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,1,2019-01-08 15:11:00,01/03/2017,Transfer,Partner,Neutered Male,2 years
1,A706918,2015-07-05 12:59:00,Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,1,2015-07-05 15:13:00,07/05/2007,Return to Owner,,Spayed Female,8 years
2,A724273,2016-04-14 18:43:00,Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,1,2016-04-21 17:17:00,04/17/2015,Return to Owner,,Neutered Male,1 year
3,A857105,2022-05-12 00:23:00,Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby,1,2022-05-12 14:35:00,05/12/2020,Transfer,Partner,Neutered Male,2 years
4,A682524,2014-06-29 10:38:00,Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,1,2014-07-02 14:16:00,06/29/2010,Return to Owner,,Neutered Male,4 years


In [34]:
# And renaming some columns to improve organization. 
merged_data = merged_data.rename({'DateTime_x': 'Intake DateTime', 'Animal Type_x': 'Animal Type', 'Breed_x': 'Breed', 'Color_x':'Color', 'DateTime_y': 'Outcome DateTime'}, axis='columns')
merged_data = merged_data.reindex(columns = ['Animal ID', 'Animal Type', 'Date of Birth', 'Breed', 'Color', 'Intake Type', 'Intake Condition', 'Sex upon Intake', 'Age upon Intake', 'Intake DateTime', 'Outcome DateTime', 'Outcome Type', 'Outcome Subtype', 'Sex upon Outcome', 'Age upon Outcome'])
merged_data.head()


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,2 years,2019-01-03 16:19:00,2019-01-08 15:11:00,Transfer,Partner,Neutered Male,2 years
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,8 years,2015-07-05 12:59:00,2015-07-05 15:13:00,Return to Owner,,Spayed Female,8 years
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,11 months,2016-04-14 18:43:00,2016-04-21 17:17:00,Return to Owner,,Neutered Male,1 year
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,2 years,2022-05-12 00:23:00,2022-05-12 14:35:00,Transfer,Partner,Neutered Male,2 years
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,4 years,2014-06-29 10:38:00,2014-07-02 14:16:00,Return to Owner,,Neutered Male,4 years


In [35]:
merged_data.describe(include = 'all')
# The data set contains over 160k animals. All of this information could potentially effect outcome (adoption vs other).
# There is still some redundant information between "age upon intake," "Date of Birth," and "age upon outcome," but we can address this later

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
count,163263,163263,163263,163263,163263,163263,163263,163261,163262,163263,163263,163226,74974,163261,163253
unique,146673,5,8378,2936,648,6,20,5,55,,,11,26,5,55
top,A721033,Dog,05/01/2016,Domestic Shorthair Mix,Black/White,Stray,Normal,Intact Male,1 year,,,Adoption,Partner,Neutered Male,1 year
freq,33,89841,121,33443,16987,111908,139064,55043,26855,,,77874,38548,57065,27309
mean,,,,,,,,,,2018-07-14 00:41:36.702743296,2018-08-03 02:34:43.415593472,,,,
min,,,,,,,,,,2013-10-01 07:51:00,2013-10-01 10:39:00,,,,
25%,,,,,,,,,,2015-12-11 13:25:00,2016-01-02 12:09:00,,,,
50%,,,,,,,,,,2018-04-30 18:20:00,2018-05-17 19:16:00,,,,
75%,,,,,,,,,,2020-12-02 11:06:00,2021-01-04 14:10:00,,,,
max,,,,,,,,,,2024-06-30 13:24:00,2024-07-01 11:15:00,,,,


In [36]:
merged_data.isnull().sum()
# There are a number of null values throughout the data set, 
# The "outcome subtype" is the largest cluster because not every Outcome Type needs to be further described by a Subtype
# Options for handling the subtype nulls include: 
#      marking the nulls as "Unspecific" or similar placeholder, 
#      dropping the subtype variable altogether, 
#      combining the type-subtype data into one variable. 
# For now, I will mark the Nan Subtypes as Not Specified, but I might circle back to this later. 

Animal ID               0
Animal Type             0
Date of Birth           0
Breed                   0
Color                   0
Intake Type             0
Intake Condition        0
Sex upon Intake         2
Age upon Intake         1
Intake DateTime         0
Outcome DateTime        0
Outcome Type           37
Outcome Subtype     88289
Sex upon Outcome        2
Age upon Outcome       10
dtype: int64

In [37]:
merged_data['Outcome Subtype'] = merged_data['Outcome Subtype'].fillna(value = 'Not Specified')

In [38]:
merged_data.loc[merged_data['Outcome Type'].isnull() == True]

# There are a few pets that have no recorded Outcome type, and these can't help answer my questions about what gets pets adopted, so I will drop them. 

merged_data = merged_data.loc[merged_data['Outcome Type'].isnull() != True]

In [39]:
merged_data = merged_data.loc[merged_data['Sex upon Outcome'].isnull() != True]

In [40]:
merged_data.loc[merged_data['Age upon Outcome'].isnull() == True]

# There are a few pets that have no recorded age or sex upon outcome. 
# For a couple of these, they have an "age upon intake," so "age upon outcome" 
# can easily be calculated and filled in, but because my data set is large, I'm going to drop these. 

merged_data = merged_data.dropna(subset = ['Age upon Outcome'], axis = 0)

#merged_data = merged_data.loc[merged_data['Age upon Outcome'].isnull() != True &
#                             merged_data['Sex upon Outcome'].isnull() !=]

In [41]:
merged_data.isnull().sum()

Animal ID           0
Animal Type         0
Date of Birth       0
Breed               0
Color               0
Intake Type         0
Intake Condition    0
Sex upon Intake     0
Age upon Intake     0
Intake DateTime     0
Outcome DateTime    0
Outcome Type        0
Outcome Subtype     0
Sex upon Outcome    0
Age upon Outcome    0
dtype: int64

In [42]:
# Since I hope this data analysis can be repeated on updated data in the future, I put the above
# null management sequence into a method that can go in a module

def drop_null_data(df):
    #identifies null data in provided data frame, drops it, and reports columns in which the data were null

    #generates a Series with index = df's columns, values = number of nulls per column
    drops = df.isnull().sum()

    #if there are no nulls in the df, print & return original df
    if drops.sum() == 0:
        print("No null values identified")
            
    #if there are nulls in the df, drop them all and print which columns contained nulls and how many were dropped; returns df without nulls    
    else:
        df = df.dropna(axis = 0)
        for index, value in drops.items():
            if value != 0:
                print('"{}" contained {} null value(s)'.format(index, value))
    return df

### Cleaning the merged data set - Sanity Checks

##### Ages - Neonates & Aged

In [43]:
# The data contains a small number of animals that have ages listed as negative numbers. 
# They are few enough that I could drop them, but I prefer to fix them. 

merged_data['Age upon Intake'].unique()

array(['2 years', '8 years', '11 months', '4 years', '4 months',
       '6 years', '6 months', '4 weeks', '5 months', '14 years',
       '1 month', '2 months', '18 years', '1 year', '3 years', '4 days',
       '9 years', '2 weeks', '15 years', '1 day', '5 years', '3 weeks',
       '9 months', '8 months', '6 days', '7 years', '12 years', '1 week',
       '10 years', '7 months', '3 months', '10 months', '1 weeks',
       '5 days', '2 days', '0 years', '11 years', '17 years', '3 days',
       '13 years', '5 weeks', '19 years', '16 years', '20 years',
       '-1 years', '22 years', '28 years', '23 years', '30 years',
       '-2 years', '21 years', '-3 years', '25 years', '24 years',
       '-4 years'], dtype=object)

In [44]:
# This method will use regex to convert the ages from strings to ints
# I leave the ages in days because the computer doesn't need years/months/weeks/days to draw conclusions, 
# but for data visualization later, they will need to be converted to more meaningful numbers.

import re

year_pattern = re.compile(r"(-?\d+)\s*(?:year|years)")
month_pattern = re.compile(r"(-?\d+)\s*(?:month|months)")
week_pattern = re.compile(r"(-?\d+)\s*(?:week|weeks)")
day_pattern = re.compile(r"(-?\d+)\s*(?:day|days)")

# Function to convert strings to days
def convert_to_days(age_str):
    if "year" in age_str:
        value = re.search(year_pattern, age_str).group(1)
        return int(value) * 365
    elif "month" in age_str:
        value = re.search(month_pattern, age_str).group(1)
        return int(value) * 30 
    elif "week" in age_str:
        value = re.search(week_pattern, age_str).group(1)
        return int(value) * 7
    elif "day" in age_str:
        value = re.search(day_pattern, age_str).group(1)
        return int(value)
    else:
        print(f"Unknown pattern: {age_str}; -1 days reported")
        return int(-1)

In [45]:
merged_data['Age upon Intake'] = merged_data['Age upon Intake'].apply(convert_to_days)
merged_data['Age upon Outcome'] = merged_data['Age upon Outcome'].apply(convert_to_days)
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,730,2019-01-03 16:19:00,2019-01-08 15:11:00,Transfer,Partner,Neutered Male,730
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,2920,2015-07-05 12:59:00,2015-07-05 15:13:00,Return to Owner,Not Specified,Spayed Female,2920
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,330,2016-04-14 18:43:00,2016-04-21 17:17:00,Return to Owner,Not Specified,Neutered Male,365
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,730,2022-05-12 00:23:00,2022-05-12 14:35:00,Transfer,Partner,Neutered Male,730
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,1460,2014-06-29 10:38:00,2014-07-02 14:16:00,Return to Owner,Not Specified,Neutered Male,1460


In [46]:
# Dropping rows with negative ages for both Age upon Intake & Age upon Outcome

pre = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Intake'] >= 0]
post= len(merged_data)
print(f"Dropped rows due to negative Intake ages: {pre - post}")

pre2 = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Outcome'] >= 0]
post2 = len(merged_data)
print(f"Dropped rows due to negative Outcome ages: {pre - post}")

Dropped rows due to negative Intake ages: 13
Dropped rows due to negative Outcome ages: 13


In [47]:
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,730,2019-01-03 16:19:00,2019-01-08 15:11:00,Transfer,Partner,Neutered Male,730
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,2920,2015-07-05 12:59:00,2015-07-05 15:13:00,Return to Owner,Not Specified,Spayed Female,2920
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,330,2016-04-14 18:43:00,2016-04-21 17:17:00,Return to Owner,Not Specified,Neutered Male,365
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,730,2022-05-12 00:23:00,2022-05-12 14:35:00,Transfer,Partner,Neutered Male,730
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,1460,2014-06-29 10:38:00,2014-07-02 14:16:00,Return to Owner,Not Specified,Neutered Male,1460


In [48]:
merged_data['Intake Condition'].value_counts()

# These are the identifier & counts for the intake condition column. Some are very large & some are only single animals.
# Depending on the question I want to answer, I might ultimately like to consolidate these, but first, I would like to do some checking to make sure they are accurate.
# For example, a neonate should not be an old animal, and a pet described as "aged" should not be young. 
# Pregnant and nursing animals should be female. 

Intake Condition
Normal        139013
Injured         9869
Sick            7329
Nursing         3878
Neonatal        1423
Aged             510
Medical          372
Other            344
Pregnant         148
Feral            140
Behavior          69
Med Attn          52
Unknown           17
Med Urgent        11
Neurologic        10
Parvo              5
Space              4
Agonal             3
Congenital         1
Panleuk            1
Name: count, dtype: int64

In [49]:
merged_data.loc[merged_data['Intake Condition'] == 'Neonatal'].sort_values(by = 'Age upon Intake', ascending = False)

# There are 2 year old animals being called "neonates," so that's not correct. Medically speaking, puppies/kittens are considered neonates when they're younger than 14 days (28 if you're lenient).
# I will drop any animals that are considered "neonatal" with an age upon intake <30 days. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
122679,A859987,Other,06/21/2020,Bat,Brown/Brown,Wildlife,Neonatal,Unknown,730,2022-06-21 15:37:00,2022-06-23 09:54:00,Euthanasia,Not Specified,Unknown,730
55242,A845299,Cat,10/27/2019,Domestic Shorthair,Torbie,Abandoned,Neonatal,Intact Female,730,2021-10-27 17:24:00,2021-10-27 17:42:00,Transfer,Partner,Intact Female,730
156827,A887671,Cat,08/24/2021,Domestic Shorthair,Black/White,Stray,Neonatal,Intact Female,730,2023-08-24 21:08:00,2024-01-21 13:42:00,Adoption,Foster,Spayed Female,730
141172,A843380,Dog,09/30/2019,Chihuahua Shorthair,Tan/White,Public Assist,Neonatal,Intact Female,730,2021-09-30 11:05:00,2022-01-18 15:33:00,Return to Owner,Not Specified,Intact Female,730
26107,A845298,Cat,10/27/2019,Domestic Shorthair,Torbie,Abandoned,Neonatal,Intact Female,730,2021-10-27 17:24:00,2021-10-27 17:41:00,Transfer,Partner,Intact Female,730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153392,A889427,Cat,09/22/2023,Domestic Shorthair,Orange Tabby,Stray,Neonatal,Intact Male,0,2023-09-22 09:44:00,2023-09-22 11:53:00,Transfer,Partner,Intact Male,0
153393,A889428,Cat,09/22/2023,Domestic Shorthair,Torbie,Stray,Neonatal,Intact Female,0,2023-09-22 09:44:00,2023-09-22 11:53:00,Transfer,Partner,Intact Female,0
153394,A889426,Cat,09/22/2023,Domestic Shorthair,Orange Tabby,Stray,Neonatal,Intact Female,0,2023-09-22 09:44:00,2023-09-22 11:53:00,Transfer,Partner,Intact Female,0
153472,A889434,Cat,09/22/2023,Domestic Shorthair,Black/White,Stray,Neonatal,Unknown,0,2023-09-22 10:50:00,2023-09-22 14:08:00,Died,In Kennel,Unknown,0


In [50]:
merged_data.loc[(merged_data['Intake Condition'] == 'Aged') & (merged_data['Age upon Intake'] < 1460)].sort_values(by = 'Age upon Intake', ascending = True)

# There is no concrete ages at which an animal becomes "aged," and an animal who has had a hard life can appear "aged" before they are really old. 
# BUT there is no situation in which a 2 day old animal should be called "aged."
# I will be lenient and say any animal under 2yr shouldn't be in the "aged" category. This also allows for exotics, which have shorter lifespans.

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
156375,A887830,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
156374,A887829,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
156376,A887831,Cat,08/25/2023,Domestic Shorthair,Black/White,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
145208,A888924,Cat,09/12/2023,Domestic Shorthair,Black,Stray,Aged,Intact Male,2,2023-09-14 16:36:00,2023-09-14 18:14:00,Transfer,Partner,Intact Male,2
145210,A888923,Cat,09/12/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,2,2023-09-14 16:36:00,2023-09-14 18:14:00,Transfer,Partner,Intact Female,2
143828,A888856,Cat,09/03/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,7,2023-09-13 15:19:00,2023-09-13 16:17:00,Transfer,Partner,Intact Female,7
147771,A889057,Cat,08/25/2023,Domestic Shorthair,Orange/White,Stray,Aged,Intact Male,21,2023-09-16 15:40:00,2023-09-16 17:26:00,Transfer,Partner,Intact Male,21
145212,A888919,Cat,08/24/2023,Domestic Shorthair,Cream Tabby/White,Stray,Aged,Intact Female,21,2023-09-14 17:21:00,2023-09-14 18:14:00,Transfer,Partner,Intact Female,21
17775,A831549,Cat,03/01/2021,Domestic Shorthair,Cream Tabby,Stray,Aged,Intact Male,28,2021-03-29 12:40:00,2021-09-07 11:00:00,Adoption,Foster,Neutered Male,180
54712,A831547,Cat,03/01/2021,Domestic Shorthair,Black/White,Stray,Aged,Intact Male,28,2021-03-29 12:40:00,2021-07-17 09:48:00,Adoption,Foster,Neutered Male,120


In [51]:
#Sanity checked - no old "neonatal" or young "aged" animals
merged_data = merged_data[(merged_data['Intake Condition'] != 'Neonatal') |
                            (merged_data['Age upon Intake'] < 30)]

merged_data = merged_data[(merged_data['Intake Condition'] != 'Aged') |
                          (merged_data['Age upon Intake'] > 730)]

##### Reproductive status - Pregnant & Nursing

In [52]:
merged_data.loc[(merged_data['Intake Condition'] == 'Pregnant') &
                (merged_data['Sex upon Intake'] != 'Intact Female')]

# There are a fair number of animals listed as pregnant that are male or spayed. 
# Since I can't trust this information, I will drop these ones too. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
1571,A885138,Cat,07/14/2021,Domestic Shorthair,Brown Tabby/White,Owner Surrender,Pregnant,Spayed Female,730,2023-07-14 07:54:00,2023-08-19 13:27:00,Adoption,Not Specified,Spayed Female,730
6181,A842769,Dog,10/06/2021,Pit Bull/Siberian Husky,Black/White,Stray,Pregnant,Intact Male,0,2021-09-21 12:40:00,2021-11-29 17:50:00,Adoption,Not Specified,Neutered Male,30
9187,A701482,Cat,04/30/2014,Domestic Longhair Mix,Brown Tabby,Stray,Pregnant,Spayed Female,365,2015-04-30 11:50:00,2015-05-01 09:00:00,Transfer,SCRP,Spayed Female,365
14285,A832146,Cat,04/08/2016,Domestic Shorthair,Blue Tabby/White,Owner Surrender,Pregnant,Spayed Female,1825,2021-04-08 07:57:00,2021-04-12 15:32:00,Rto-Adopt,Not Specified,Spayed Female,1825
26326,A731134,Dog,07/17/2016,Jack Russell Terrier Mix,Black/Tan,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 14:09:00,Transfer,Partner,Intact Male,2
28138,A861113,Dog,06/07/2022,Pit Bull,Brown/White,Public Assist,Pregnant,Intact Male,28,2022-07-07 23:17:00,2022-07-12 10:05:00,Disposal,Not Specified,Intact Male,35
31384,A731133,Dog,07/17/2016,Jack Russell Terrier Mix,Sable,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 14:09:00,Transfer,Partner,Intact Male,2
32694,A842777,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:55:00,Adoption,Foster,Unknown,60
35022,A842775,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:54:00,Adoption,Foster,Unknown,60
36791,A842773,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:54:00,Adoption,Foster,Unknown,60


In [53]:
#Sanity checked - male/spayed female pregnant

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Pregnant') |
                (merged_data['Sex upon Intake'] == 'Intact Female')]

In [54]:
# Lastly, I want to look more closely at the "nursing" intake condition. These animals might refer to the nursing bitches (and thus should also be female) or to the nursing puppies. 

merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                (merged_data['Age upon Intake'] > 60)].sort_values(by = 'Age upon Intake')

# It looks like the shelter mostly uses "Nursing" to indicate nursing puppies/kittens. 
# HOWEVER, about 10% of the Nursing animals are over 8wks old (when puppies, kittens, and rabbits should be done nursing)
# And some are old enough that these are bitches/queens. There are also a few older males included; I will drop these. 
# I would like to split the young-nursing animals into another group (as being a cute puppy likely increases your odds of adoption)
# and keep the older-nursing females in a separate category (as being an older animal who has nursed recently might also change your odds of adoption).

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
740,A835442,Cat,01/27/2021,Siamese,Seal Point,Stray,Nursing,Intact Female,90,2021-05-27 16:08:00,2021-06-09 16:01:00,Adoption,Not Specified,Spayed Female,120
32164,A673721,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:20:00,Transfer,Partner,Intact Female,120
92409,A704491,Dog,02/10/2015,Labrador Retriever/Plott Hound,Black,Stray,Nursing,Intact Female,90,2015-06-05 14:05:00,2015-06-05 16:03:00,Transfer,Partner,Intact Female,90
34991,A796707,Cat,02/04/2019,Domestic Shorthair,Brown Tabby,Stray,Nursing,Unknown,90,2019-06-04 15:32:00,2019-06-04 18:35:00,Transfer,Partner,Unknown,90
37301,A673720,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:19:00,Transfer,Partner,Intact Female,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110627,A803389,Dog,09/05/2012,Doberman Pinsch,Brown,Stray,Nursing,Intact Female,2190,2019-08-31 13:41:00,2019-09-06 16:49:00,Adoption,Not Specified,Spayed Female,2555
84862,A737659,Dog,11/02/2008,Chihuahua Shorthair Mix,Black,Stray,Nursing,Neutered Male,2920,2016-11-02 11:38:00,2017-01-08 14:59:00,Adoption,Not Specified,Neutered Male,2920
81335,A775493,Dog,07/03/2012,Chihuahua Shorthair Mix,Tan/White,Stray,Nursing,Spayed Female,2920,2021-05-11 08:12:00,2018-07-03 16:14:00,Return to Owner,Not Specified,Spayed Female,2190
59422,A701255,Dog,04/26/2006,Shih Tzu Mix,White,Stray,Nursing,Neutered Male,3285,2015-04-26 13:47:00,2015-05-07 00:00:00,Transfer,Partner,Neutered Male,3285


In [55]:
# Animals younger than 60 days (aka 8.5 weeks) will now have the Intake Condition "Nursing Juvenile"

young_nursing = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                (merged_data['Age upon Intake'] <= 60)].index
merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'
merged_data.loc[(merged_data['Intake Condition'] == 'Nursing Juvenile') &
                                (merged_data['Age upon Intake'] <= 60)]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
50,A701811,Cat,04/20/2015,Domestic Shorthair Mix,Gray Tabby,Stray,Nursing Juvenile,Unknown,14,2015-05-05 07:29:00,2015-05-05 11:45:00,Transfer,Partner,Unknown,14
137,A724640,Cat,04/05/2016,Domestic Shorthair Mix,Brown Tabby/Black,Stray,Nursing Juvenile,Intact Male,14,2016-04-20 08:19:00,2016-04-20 17:48:00,Transfer,Partner,Intact Male,14
149,A800717,Cat,07/18/2019,Domestic Shorthair,Blue/White,Stray,Nursing Juvenile,Intact Female,7,2019-07-25 13:49:00,2019-07-25 18:39:00,Transfer,Partner,Intact Female,7
183,A728810,Cat,05/24/2016,Domestic Shorthair Mix,Blue,Stray,Nursing Juvenile,Intact Male,14,2016-06-08 17:30:00,2016-06-10 15:11:00,Transfer,Partner,Intact Male,14
194,A793946,Cat,04/17/2019,Domestic Shorthair Mix,Blue Tabby,Stray,Nursing Juvenile,Intact Female,14,2019-05-02 08:21:00,2019-05-02 14:57:00,Transfer,Partner,Intact Female,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162991,A804224,Cat,08/21/2019,Domestic Shorthair Mix,Black,Stray,Nursing Juvenile,Intact Male,21,2019-09-11 21:41:00,2019-09-12 11:08:00,Transfer,Partner,Intact Male,21
162992,A812494,Dog,01/15/2020,Labrador Retriever Mix,Black/White,Owner Surrender,Nursing Juvenile,Intact Female,7,2020-01-23 14:56:00,2020-04-07 08:50:00,Adoption,Foster,Spayed Female,60
163130,A782910,Cat,09/24/2018,Domestic Shorthair Mix,Blue Tabby/White,Stray,Nursing Juvenile,Intact Male,28,2018-10-22 18:31:00,2018-12-18 07:54:00,Adoption,Foster,Neutered Male,60
163141,A808754,Cat,10/22/2019,Domestic Shorthair,Black,Stray,Nursing Juvenile,Intact Male,21,2019-11-13 13:18:00,2019-11-13 13:50:00,Transfer,Partner,Intact Male,21


In [56]:
# Female animals older than 4 months (the youngest age at which a female could be nursing for dogs/cats/rabbits/guinea pigs, which are the only species represented as nursing in this data)

nursing_adults = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                 (merged_data['Age upon Intake'] > 120) &
                                 ((merged_data['Sex upon Intake'] == 'Intact Female') |
                                 (merged_data['Sex upon Intake'] == 'Spayed Female'))].index
merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'
merged_data['Intake Condition'].unique()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

array(['Normal', 'Injured', 'Pregnant', 'Sick', 'Nursing Juvenile',
       'Aged', 'Nursing Adult', 'Unknown', 'Nursing', 'Congenital',
       'Medical', 'Other', 'Neonatal', 'Med Attn', 'Feral', 'Behavior',
       'Med Urgent', 'Space', 'Agonal', 'Neurologic', 'Panleuk', 'Parvo'],
      dtype=object)

In [57]:
# Any animals lefts in the 'Nursing' Intake condition are now either male adults or age-ambiguous enough that I can't tell whether they are the mothers or the juveniles. These will be dropped.

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Nursing')]
merged_data['Intake Condition'].unique()

array(['Normal', 'Injured', 'Pregnant', 'Sick', 'Nursing Juvenile',
       'Aged', 'Nursing Adult', 'Unknown', 'Congenital', 'Medical',
       'Other', 'Neonatal', 'Med Attn', 'Feral', 'Behavior', 'Med Urgent',
       'Space', 'Agonal', 'Neurologic', 'Panleuk', 'Parvo'], dtype=object)

In [58]:
# At this point, I think my data is cleaned enough and complete enough to be used to answer a wide range of questions about this shelter animal population. 
# I'm going to end off here, so I can start my data exploration and analysis in a fresh notebook!