In [49]:
# In this notebook, I have a data set from the Austin Animal Center that I would like to use to answer some questions about the shelter. 
# Before I can start really analyzing the data, first the data sets need to be cleaned, and I would like to consolidate the two files 
# (intake and outcome) into one file that I can use for all my analysis. Because I don't yet have a specific question in mind, I will 
# focus mostly on getting rid of null values and sensitive information, and doing some logic checking to make sure the data is as trustworthy as possible. 

In [50]:
import pandas as pd

### Import and initial description of intake data

In [51]:
#read both the intake and outcome data files
intake_data_raw = pd.read_csv('Austin_Animal_Center_Intakes_20240701.csv')
outcome_data_raw = pd.read_csv('Austin_Animal_Center_Outcomes_20240701.csv')


In [52]:
intake_data_raw.head()
intake_length = len(intake_data_raw)
#intake_length #164614
#Name and Found Location are potentially sensitive information, so I will drop these. 
#Animal ID can be used to join the intake and outcome data sets, so I will keep it for now.
#MonthYear and DateTime are redundant, so I will drop MonthYear since DateTime should be easy to convert from str to timestamp or other numerical. 

In [53]:
outcome_length = len(outcome_data_raw)
outcome_data_raw.head()
#outcome_length #164228
#Name is again potentially sensitive. The Animal ID will be used for merging.
#MonthYear and DateTime are again redundant.
#The outcome data is shorter than the intake data, likely because some pets are actively in the shelter (so they have intake but no outcome yet)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


### Duplicate Detection & Removal

In [54]:
intake_data_raw['Animal ID'].value_counts()
#Animal ID is not unique because a single animal may go through the shelter system multiple times

Animal ID
A721033    33
A718223    14
A718877    12
A705625    11
A706536    11
           ..
A714773     1
A765646     1
A732323     1
A698657     1
A855904     1
Name: count, Length: 147862, dtype: int64

In [55]:
#Looking more closely at the most frequently intaken pet, we see these are not duplicate data, but actually 33 unique visits to the shelter. 
#The other pets with multiple entries of the same Animal ID tell similar stories. 
intake_data_raw.loc[(intake_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
118002,A721033,Lil Bit,01/09/2017 02:26:00 PM,January 2017,6210 E Ben White Blvd in Austin (TX),Stray,Injured,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
125351,A721033,Lil Bit,01/26/2017 06:55:00 AM,January 2017,901 W Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
48856,A721033,Lil Bit,01/30/2017 11:05:00 PM,January 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
110181,A721033,Lil Bit,02/06/2017 10:13:00 AM,February 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
30692,A721033,Lil Bit,02/12/2019 10:21:00 AM,February 2019,1936 East Oltorf Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
69901,A721033,Lil Bit,02/16/2019 10:30:00 AM,February 2019,1135 Airport Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
102945,A721033,Lil Bit,02/20/2016 10:44:00 AM,February 2016,2508 E Riverside Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
150333,A721033,Lil Bit,02/22/2018 10:28:00 AM,February 2018,6400 Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
5518,A721033,Lil Bit,02/24/2019 09:53:00 PM,February 2019,700 Allen St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
13100,A721033,Lil Bit,03/07/2018 08:27:00 AM,March 2018,4111 South 1St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [56]:
outcome_data_raw.loc[(outcome_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')
#Because these frequently returning animals do not represent the "typical" intake-outcome cycle, I considered dropping pets with more than 3-5 visits. 
#But ultimately, a pet being returned to its owner is considered a success. And I'm curious how many of these repeat offenders make up the "return to owner" outcome type. 
#So I will leave them in for now

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
10976,A721033,Lil Bit,01/10/2017 04:20:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
42029,A721033,Lil Bit,01/28/2017 03:22:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
83153,A721033,Lil Bit,02/02/2017 11:19:00 AM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
67815,A721033,Lil Bit,02/07/2017 05:26:00 PM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
136111,A721033,Lil Bit,02/12/2019 03:20:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
37003,A721033,Lil Bit,02/18/2019 04:46:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
74377,A721033,Lil Bit,02/20/2016 04:18:00 PM,Feb 2016,05/20/2015,Return to Owner,,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
90560,A721033,Lil Bit,02/23/2018 01:06:00 PM,Feb 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
51640,A721033,Lil Bit,02/26/2019 07:00:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
108849,A721033,Lil Bit,03/08/2018 03:04:00 PM,Mar 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [57]:
#To merge the intake and outcome datasets while keeping the multiple-visit pets in the set, I will group by Animal ID to assign each visit a count, combine the Animal ID with the count, and merge along this key
#First I want to sort the datasets by DateTime, which means my current DateTime str needs to be convert to a numerical. 
intake_data_raw['DateTime'] = pd.to_datetime(intake_data_raw['DateTime'], format = '%m/%d/%Y %I:%M:%S %p')
outcome_data_raw['DateTime'] = pd.to_datetime(outcome_data_raw['DateTime'], format = '%m/%d/%Y %I:%M:%S %p')

In [58]:
#Sorting the datasets by DateTime and dropping rows that have the same Animal ID & DateTime (ie. true duplicate entries)
intake_data_raw = intake_data_raw.sort_values(by = 'DateTime')
intake_data_raw['Duplicated?'] = intake_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

outcome_data_raw = outcome_data_raw.sort_values(by = 'DateTime')
outcome_data_raw['Duplicated?'] = outcome_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

In [59]:
#Spotchecking that duplicates that were flagged are true duplicates and not repeat visits and checking the length of each to see how many were dropped. 
#It worked =)

#outcome_data_raw.loc[outcome_data_raw['Duplicated?'] == True] #25 count
#intake_data_raw.loc[intake_data_raw['Duplicated?'] == True] #37 count
#intake_data_raw.loc[intake_data_raw['Animal ID'] == 'A727043']

In [60]:
outcome_data_raw = outcome_data_raw.loc[outcome_data_raw['Duplicated?'] != True]
#len(outcome_data_raw) #164203 length

intake_data_raw = intake_data_raw.loc[intake_data_raw['Duplicated?'] != True]
#len(intake_data_raw) #164577 length

### Merging intake & outcome data sets

In [61]:
intake_data_raw.loc[(intake_data_raw['Animal ID'] == 'A659667')]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Duplicated?
142924,A659667,,2013-10-26 15:03:00,October 2013,12507 Esplanade in Austin (TX),Stray,Normal,Cat,Neutered Male,8 years,Domestic Longhair Mix,Blue/White,False


In [62]:
outcome_data_raw.loc[(outcome_data_raw['Animal ID'] == 'A659667')]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Duplicated?
109613,A659667,,2013-10-26 13:34:00,Oct 2013,07/28/2005,Transfer,SCRP,Cat,Neutered Male,8 years,Domestic Longhair Mix,Blue/White,False


In [63]:
merged_data_unfiltered = pd.merge(intake_data_raw, outcome_data_raw, on='Animal ID', suffixes=('_intake', '_outcome'))

# There are a number of pets who were in the shelter at the time that data collection started, so they have outcome data 
# before they have intake data. This will discard the first outcome data, so their visits are matched correctly.
merged_data_raw = merged_data_unfiltered[merged_data_unfiltered['DateTime_outcome'] >= merged_data_unfiltered['DateTime_intake']]

In [64]:
#When I examine the animals that did not make it into the merged set from the outcome data, most of them are from the time the shelter first started recording data (Oct 1, 2013). 
#Pets admitted prior to this date, would not have intake data to match with their outcome data, so they cannot be included. 
#There are a few pets from later on that are also missing intake data for some reason. None of these pets have multiple visits, so they will not mess up the merged data set. 

outcome_data_not_merged = outcome_data_raw[~outcome_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
outcome_data_not_merged.loc[outcome_data_not_merged['DateTime'] < '2023'].sort_values(by = 'DateTime')

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Duplicated?
65866,A659834,*Dudley,2013-10-01 09:31:00,Oct 2013,07/23/2013,Adoption,Foster,Dog,Neutered Male,2 months,Labrador Retriever Mix,Black,False
13012,A664223,Moby,2013-10-01 11:03:00,Oct 2013,09/30/2009,Return to Owner,,Dog,Neutered Male,4 years,Bulldog Mix,White,False
34753,A663646,,2013-10-01 11:12:00,Oct 2013,09/22/2010,Transfer,Partner,Dog,Neutered Male,3 years,Toy Poodle Mix,White,False
126159,A663888,,2013-10-01 11:13:00,Oct 2013,09/25/2011,Transfer,Partner,Dog,Spayed Female,2 years,Boxer Mix,Red/White,False
32550,A663833,Baby Girl,2013-10-01 11:50:00,Oct 2013,09/24/2004,Return to Owner,,Dog,Spayed Female,9 years,Labrador Retriever Mix,Black,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112967,A868562,,2022-11-05 15:08:00,Nov 2022,11/05/2020,Euthanasia,Rabies Risk,Other,Unknown,2 years,Fox,Gray/Brown,False
32011,A868557,,2022-11-05 15:10:00,Nov 2022,11/05/2020,Euthanasia,Rabies Risk,Other,Unknown,2 years,Raccoon,Black/Gray,False
150127,A869983,,2022-11-29 14:47:00,Nov 2022,05/29/2022,Euthanasia,Suffering,Cat,Unknown,6 months,Domestic Shorthair,Brown Tabby,False
78723,A870677,,2022-12-10 15:12:00,Dec 2022,12/10/2020,Died,In Kennel,Other,Unknown,2 years,Raccoon,Black/Gray,False


In [65]:
# When I examine the animals that did not make it into the merged set from the intake data, almost all of them (1126/1771) are from 2024, so there is a good chance these pets do not yet have outcomes.
# There are some animals that have intake data recorded AFTER their outcome data, and for some of these, I suspect clerical errors, but that data is not trustworthy for answering time-associated questions

intake_data_not_merged = intake_data_raw[~intake_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
intake_data_not_merged

#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] >= '2024'].sort_values(by = 'DateTime')
#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] < '2023'].sort_values(by = 'DateTime')

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Duplicated?
142993,A665649,,2013-10-21 11:12:00,October 2013,12507 Esplanade in Austin (TX),Stray,Normal,Cat,Unknown,1 year,Domestic Shorthair Mix,Black,False
142924,A659667,,2013-10-26 15:03:00,October 2013,12507 Esplanade in Austin (TX),Stray,Normal,Cat,Neutered Male,8 years,Domestic Longhair Mix,Blue/White,False
14142,A672696,Oreo,2014-02-15 12:30:00,February 2014,2 Brett Cove in Rollingwood (TX),Stray,Normal,Dog,Neutered Male,12 years,Rat Terrier Mix,Tricolor,False
50398,A673335,,2014-02-22 13:03:00,February 2014,4415 Secluded Hollow in Austin (TX),Wildlife,Sick,Other,Unknown,2 years,Raccoon,Black/Gray,False
55548,A674638,Sam,2014-03-16 11:46:00,March 2014,Rollingwood & Bee Cave Rd in Austin (TX),Stray,Normal,Dog,Neutered Male,7 years,Labrador Retriever Mix,Chocolate,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
163526,A908486,,2024-07-01 09:09:00,July 2024,4216 Gochman in Austin (TX),Stray,Neonatal,Cat,Unknown,4 weeks,American Shorthair,Black/Black,False
163521,A908489,,2024-07-01 09:09:00,July 2024,4216 Gochman in Austin (TX),Stray,Neonatal,Cat,Unknown,4 weeks,American Shorthair,Black/Black,False
163528,A908485,,2024-07-01 09:09:00,July 2024,4216 Gochman in Austin (TX),Stray,Neonatal,Cat,Unknown,4 weeks,American Shorthair,Black/Black,False
163523,A908487,,2024-07-01 09:09:00,July 2024,4216 Gochman in Austin (TX),Stray,Neonatal,Cat,Unknown,4 weeks,American Shorthair,Black/Black,False


### Cleaning the merged data set - Nulls

In [66]:
merged_data_raw.columns
# Looking at the columns of the merged data set, there are several columns that should be identical (ex. Breed_x & Breed_y),
# some columns are redundant (monthyear), and some are not helpful (Duplicated?)

Index(['Animal ID', 'Name_intake', 'DateTime_intake', 'MonthYear_intake',
       'Found Location', 'Intake Type', 'Intake Condition',
       'Animal Type_intake', 'Sex upon Intake', 'Age upon Intake',
       'Breed_intake', 'Color_intake', 'Duplicated?_intake', 'Name_outcome',
       'DateTime_outcome', 'MonthYear_outcome', 'Date of Birth',
       'Outcome Type', 'Outcome Subtype', 'Animal Type_outcome',
       'Sex upon Outcome', 'Age upon Outcome', 'Breed_outcome',
       'Color_outcome', 'Duplicated?_outcome'],
      dtype='object')

In [67]:
#First I will make sure that the columns that should be identical actually are.

#merged_data_raw.loc[(merged_data_raw['Breed_intake'] == merged_data_raw['Breed_outcome']) == False] 
#merged_data_raw.loc[(merged_data_raw['Animal Type_intake'] == merged_data_raw['Animal Type_outcome']) == False]
#merged_data_raw.loc[(merged_data_raw['Color_intake'] == merged_data_raw['Color_outcome']) == False]

In [68]:
# Since all of those duplicate columns matched like they should, I can drop the duplicates without issue. 
# I will also drop the sensitive, redundant, and not helpful data. 

merged_data = merged_data_raw.drop(['Name_intake', 'MonthYear_intake', 'Found Location', 'Duplicated?_intake', 'Name_outcome', 'MonthYear_outcome', 'Animal Type_outcome', 'Breed_outcome', 'Color_outcome', 'Duplicated?_outcome'], axis = 1)
merged_data.head()

Unnamed: 0,Animal ID,DateTime_intake,Intake Type,Intake Condition,Animal Type_intake,Sex upon Intake,Age upon Intake,Breed_intake,Color_intake,DateTime_outcome,Date of Birth,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A521520,2013-10-01 07:51:00,Stray,Normal,Dog,Spayed Female,7 years,Border Terrier/Border Collie,White/Tan,2013-10-01 15:39:00,09/07/2006,Return to Owner,,Spayed Female,7 years
1,A664235,2013-10-01 08:33:00,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,2013-10-01 10:39:00,09/24/2013,Transfer,Partner,Unknown,1 week
2,A664237,2013-10-01 08:33:00,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,2013-10-01 10:44:00,09/24/2013,Transfer,Partner,Unknown,1 week
3,A664236,2013-10-01 08:33:00,Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,2013-10-01 10:44:00,09/24/2013,Transfer,Partner,Unknown,1 week
4,A664233,2013-10-01 08:53:00,Stray,Injured,Dog,Intact Female,3 years,Pit Bull Mix,Blue/White,2013-10-01 15:33:00,09/30/2010,Euthanasia,Suffering,Intact Female,3 years


In [69]:
# And renaming some columns to improve organization. 
merged_data = merged_data.rename({'DateTime_intake': 'Intake DateTime', 'Animal Type_intake': 'Animal Type', 'Breed_intake': 'Breed', 'Color_intake':'Color', 'DateTime_outcome': 'Outcome DateTime'}, axis='columns')
merged_data = merged_data.reindex(columns = ['Animal ID', 'Animal Type', 'Date of Birth', 'Breed', 'Color', 'Intake Type', 'Intake Condition', 'Sex upon Intake', 'Age upon Intake', 'Intake DateTime', 'Outcome DateTime', 'Outcome Type', 'Outcome Subtype', 'Sex upon Outcome', 'Age upon Outcome'])
merged_data.head()


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A521520,Dog,09/07/2006,Border Terrier/Border Collie,White/Tan,Stray,Normal,Spayed Female,7 years,2013-10-01 07:51:00,2013-10-01 15:39:00,Return to Owner,,Spayed Female,7 years
1,A664235,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,1 week,2013-10-01 08:33:00,2013-10-01 10:39:00,Transfer,Partner,Unknown,1 week
2,A664237,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,1 week,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,1 week
3,A664236,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,1 week,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,1 week
4,A664233,Dog,09/30/2010,Pit Bull Mix,Blue/White,Stray,Injured,Intact Female,3 years,2013-10-01 08:53:00,2013-10-01 15:33:00,Euthanasia,Suffering,Intact Female,3 years


In [70]:
merged_data.loc[(merged_data['Animal ID'] == 'A661830')]

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
830,A661830,Cat,08/25/2012,Domestic Medium Hair Mix,Torbie,Stray,Normal,Intact Female,1 year,2013-10-15 08:25:00,2013-10-17 17:58:00,Transfer,SCRP,Intact Female,1 year


In [71]:
merged_data.describe(include = 'all')
# The data set contains over 180k animals. All of this information could potentially effect outcome (adoption vs other).
# There is still some redundant information between "age upon intake," "Date of Birth," and "age upon outcome," 
# but different questions might be more easily answered with one vs another, so I will leave them

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
count,185717,185717,185717,185717,185717,185717,185717,185715,185716,185717,185717,185668,79884,185715,185707
unique,146091,5,8375,2933,647,6,20,5,55,,,11,26,5,55
top,A721033,Dog,05/20/2015,Domestic Shorthair Mix,Black/White,Stray,Normal,Intact Male,1 year,,,Adoption,Partner,Neutered Male,1 year
freq,561,109468,603,35153,19818,126863,160601,62181,31806,,,88977,41057,69603,31756
mean,,,,,,,,,,2018-06-09 08:22:44.187554048,2018-08-13 15:19:55.906998272,,,,
min,,,,,,,,,,2013-10-01 07:51:00,2013-10-01 10:39:00,,,,
25%,,,,,,,,,,2015-11-25 08:33:00,2016-02-16 19:08:00,,,,
50%,,,,,,,,,,2018-03-09 11:34:00,2018-06-02 18:03:00,,,,
75%,,,,,,,,,,2020-09-11 12:58:00,2020-12-09 12:45:00,,,,
max,,,,,,,,,,2024-06-30 13:24:00,2024-07-01 11:15:00,,,,


In [72]:
merged_data.isnull().sum()
# There are a number of null values throughout the data set, 
# The "outcome subtype" is the largest cluster because not every Outcome Type needs to be further described by a Subtype
# Options for handling the subtype nulls include: 
#      marking the nulls as "Unspecific" or similar placeholder, 
#      dropping the subtype variable altogether, 
#      combining the type-subtype data into one variable. 
# For now, I will mark the Nan Subtypes as Not Specified, but I might circle back to this later. 

Animal ID                0
Animal Type              0
Date of Birth            0
Breed                    0
Color                    0
Intake Type              0
Intake Condition         0
Sex upon Intake          2
Age upon Intake          1
Intake DateTime          0
Outcome DateTime         0
Outcome Type            49
Outcome Subtype     105833
Sex upon Outcome         2
Age upon Outcome        10
dtype: int64

In [73]:
merged_data['Outcome Subtype'] = merged_data['Outcome Subtype'].fillna(value = 'Not Specified')

In [74]:
merged_data.loc[merged_data['Outcome Type'].isnull() == True]

# There are a few pets that have no recorded Outcome type, and these can't help answer my questions about what gets pets adopted, so I will drop them. 

merged_data = merged_data.loc[merged_data['Outcome Type'].isnull() != True]

In [75]:
merged_data = merged_data.loc[merged_data['Sex upon Outcome'].isnull() != True]

In [76]:
merged_data.loc[merged_data['Age upon Outcome'].isnull() == True]

# There are a few pets that have no recorded age or sex upon outcome. 
# For a couple of these, they have an "age upon intake," so "age upon outcome" 
# can easily be calculated and filled in, but because my data set is large, I'm going to drop these. 

merged_data = merged_data.dropna(subset = ['Age upon Outcome'], axis = 0)

#merged_data = merged_data.loc[merged_data['Age upon Outcome'].isnull() != True &
#                             merged_data['Sex upon Outcome'].isnull() !=]

In [77]:
merged_data.isnull().sum()

Animal ID           0
Animal Type         0
Date of Birth       0
Breed               0
Color               0
Intake Type         0
Intake Condition    0
Sex upon Intake     0
Age upon Intake     0
Intake DateTime     0
Outcome DateTime    0
Outcome Type        0
Outcome Subtype     0
Sex upon Outcome    0
Age upon Outcome    0
dtype: int64

In [78]:
# Since I hope this data analysis can be repeated on updated data in the future, I put the above
# null management sequence into a method that can go in a module

def drop_null_data(df):
    #identifies null data in provided data frame, drops it, and reports columns in which the data were null

    #generates a Series with index = df's columns, values = number of nulls per column
    drops = df.isnull().sum()

    #if there are no nulls in the df, print & return original df
    if drops.sum() == 0:
        print("No null values identified")
            
    #if there are nulls in the df, drop them all and print which columns contained nulls and how many were dropped; returns df without nulls    
    else:
        df = df.dropna(axis = 0)
        for index, value in drops.items():
            if value != 0:
                print('"{}" contained {} null value(s)'.format(index, value))
    return df

### Cleaning the merged data set - Sanity Checks

##### Ages - Neonates & Aged

In [79]:
# The data contains a small number of animals that have ages listed as negative numbers. 
# They are few enough that I could drop them, but I prefer to fix them. 

merged_data['Age upon Intake'].unique()

array(['7 years', '1 week', '3 years', '4 months', '8 years', '17 years',
       '4 years', '1 year', '3 weeks', '6 years', '5 years', '2 years',
       '1 weeks', '2 months', '8 months', '0 years', '1 month',
       '3 months', '9 months', '4 weeks', '10 months', '6 months',
       '2 weeks', '7 months', '5 months', '12 years', '16 years',
       '15 years', '10 years', '9 years', '3 days', '11 months', '2 days',
       '4 days', '11 years', '1 day', '14 years', '13 years', '18 years',
       '5 weeks', '19 years', '20 years', '5 days', '6 days', '22 years',
       '-1 years', '-3 years', '25 years', '-2 years', '24 years',
       '-4 years', '21 years', '23 years', '30 years', '28 years'],
      dtype=object)

In [80]:
# This method will use regex to convert the ages from strings to ints
# I leave the ages in days because the computer doesn't need years/months/weeks/days to draw conclusions, 
# but for data visualization later, they will need to be converted to more meaningful numbers.

import re

year_pattern = re.compile(r"(-?\d+)\s*(?:year|years)")
month_pattern = re.compile(r"(-?\d+)\s*(?:month|months)")
week_pattern = re.compile(r"(-?\d+)\s*(?:week|weeks)")
day_pattern = re.compile(r"(-?\d+)\s*(?:day|days)")

# Function to convert strings to days
def convert_to_days(age_str):
    if "year" in age_str:
        value = re.search(year_pattern, age_str).group(1)
        return int(value) * 365
    elif "month" in age_str:
        value = re.search(month_pattern, age_str).group(1)
        return int(value) * 30 
    elif "week" in age_str:
        value = re.search(week_pattern, age_str).group(1)
        return int(value) * 7
    elif "day" in age_str:
        value = re.search(day_pattern, age_str).group(1)
        return int(value)
    else:
        print(f"Unknown pattern: {age_str}; -1 days reported")
        return int(-1)

In [81]:
merged_data['Age upon Intake'] = merged_data['Age upon Intake'].apply(convert_to_days)
merged_data['Age upon Outcome'] = merged_data['Age upon Outcome'].apply(convert_to_days)
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A521520,Dog,09/07/2006,Border Terrier/Border Collie,White/Tan,Stray,Normal,Spayed Female,2555,2013-10-01 07:51:00,2013-10-01 15:39:00,Return to Owner,Not Specified,Spayed Female,2555
1,A664235,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:39:00,Transfer,Partner,Unknown,7
2,A664237,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,7
3,A664236,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,7
4,A664233,Dog,09/30/2010,Pit Bull Mix,Blue/White,Stray,Injured,Intact Female,1095,2013-10-01 08:53:00,2013-10-01 15:33:00,Euthanasia,Suffering,Intact Female,1095


In [82]:
# Dropping rows with negative ages for both Age upon Intake & Age upon Outcome

pre = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Intake'] >= 0]
post= len(merged_data)
print(f"Dropped rows due to negative Intake ages: {pre - post}")

pre2 = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Outcome'] >= 0]
post2 = len(merged_data)
print(f"Dropped rows due to negative Outcome ages: {pre - post}")

Dropped rows due to negative Intake ages: 34
Dropped rows due to negative Outcome ages: 34


In [83]:
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A521520,Dog,09/07/2006,Border Terrier/Border Collie,White/Tan,Stray,Normal,Spayed Female,2555,2013-10-01 07:51:00,2013-10-01 15:39:00,Return to Owner,Not Specified,Spayed Female,2555
1,A664235,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:39:00,Transfer,Partner,Unknown,7
2,A664237,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,7
3,A664236,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,Unknown,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,Unknown,7
4,A664233,Dog,09/30/2010,Pit Bull Mix,Blue/White,Stray,Injured,Intact Female,1095,2013-10-01 08:53:00,2013-10-01 15:33:00,Euthanasia,Suffering,Intact Female,1095


In [84]:
merged_data['Intake Condition'].value_counts()

# These are the identifier & counts for the intake condition column. Some are very large & some are only single animals.
# Depending on the question I want to answer, I might ultimately like to consolidate these, but first, I would like to do some checking to make sure they are accurate.
# For example, a neonate should not be an old animal, and a pet described as "aged" should not be young. 
# Pregnant and nursing animals should be female. 

Intake Condition
Normal        160524
Injured        10469
Sick            7448
Nursing         3942
Neonatal        1426
Aged             553
Medical          392
Other            368
Pregnant         164
Feral            151
Behavior          82
Med Attn          52
Unknown           17
Med Urgent        11
Neurologic        10
Parvo              5
Space              4
Agonal             3
Panleuk            1
Congenital         1
Name: count, dtype: int64

In [85]:
merged_data.loc[merged_data['Intake Condition'] == 'Neonatal'].sort_values(by = 'Age upon Intake', ascending = False)

# There are 2 year old animals being called "neonates," so that's not correct. Medically speaking, puppies/kittens are considered neonates when they're younger than 14 days (28 if you're lenient).
# I will drop any animals that are considered "neonatal" with an age upon intake <30 days. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
209617,A908186,Cat,06/26/2022,Domestic Shorthair,Calico,Stray,Neonatal,Intact Female,730,2024-06-26 11:18:00,2024-06-27 13:29:00,Transfer,Partner,Intact Female,730
173726,A845298,Cat,10/27/2019,Domestic Shorthair,Torbie,Abandoned,Neonatal,Intact Female,730,2021-10-27 17:24:00,2021-10-27 17:41:00,Transfer,Partner,Intact Female,730
170617,A841441,Dog,08/27/2019,Queensland Heeler,Blue,Stray,Neonatal,Spayed Female,730,2021-08-27 15:18:00,2021-09-29 18:59:00,Adoption,Not Specified,Spayed Female,730
170618,A841441,Dog,08/27/2019,Queensland Heeler,Blue,Stray,Neonatal,Spayed Female,730,2021-08-27 15:18:00,2021-10-13 17:31:00,Adoption,Not Specified,Spayed Female,730
172263,A843380,Dog,09/30/2019,Chihuahua Shorthair,Tan/White,Public Assist,Neonatal,Intact Female,730,2021-09-30 11:05:00,2021-10-04 15:58:00,Return to Owner,Not Specified,Intact Female,730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186624,A864549,Cat,08/27/2022,Domestic Shorthair,White,Owner Surrender,Neonatal,Unknown,0,2022-08-27 09:58:00,2022-08-28 14:39:00,Transfer,Partner,Unknown,1
186640,A864587,Cat,08/28/2022,Domestic Shorthair,Blue,Owner Surrender,Neonatal,Intact Female,0,2022-08-28 00:00:00,2022-08-29 12:46:00,Transfer,Partner,Intact Female,1
201464,A891068,Cat,10/16/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Neonatal,Intact Female,0,2023-10-16 12:05:00,2023-10-18 13:55:00,Transfer,Partner,Intact Female,2
201465,A891067,Cat,10/16/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Neonatal,Intact Female,0,2023-10-16 12:05:00,2023-10-18 13:55:00,Transfer,Partner,Intact Female,2


In [86]:
merged_data.loc[(merged_data['Intake Condition'] == 'Aged') & (merged_data['Age upon Intake'] < 1460)].sort_values(by = 'Age upon Intake', ascending = True)

# There is no concrete ages at which an animal becomes "aged," and an animal who has had a hard life can appear "aged" before they are really old. 
# BUT there is no situation in which a 2 day old animal should be called "aged."
# I will be lenient and say any animal under 2yr shouldn't be in the "aged" category. This also allows for exotics, which have shorter lifespans.

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
200323,A888924,Cat,09/12/2023,Domestic Shorthair,Black,Stray,Aged,Intact Male,2,2023-09-14 16:36:00,2023-09-14 18:14:00,Transfer,Partner,Intact Male,2
200322,A888923,Cat,09/12/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,2,2023-09-14 16:36:00,2023-09-14 18:14:00,Transfer,Partner,Intact Female,2
199705,A887829,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
199706,A887831,Cat,08/25/2023,Domestic Shorthair,Black/White,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
199707,A887830,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 14:37:00,2023-08-27 16:06:00,Transfer,Partner,Neutered Male,2
200274,A888856,Cat,09/03/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,7,2023-09-13 15:19:00,2023-09-13 16:17:00,Transfer,Partner,Intact Female,7
200327,A888919,Cat,08/24/2023,Domestic Shorthair,Cream Tabby/White,Stray,Aged,Intact Female,21,2023-09-14 17:21:00,2023-09-14 18:14:00,Transfer,Partner,Intact Female,21
200397,A889057,Cat,08/25/2023,Domestic Shorthair,Orange/White,Stray,Aged,Intact Male,21,2023-09-16 15:40:00,2023-09-16 17:26:00,Transfer,Partner,Intact Male,21
163399,A831545,Cat,03/01/2021,Domestic Shorthair,Black/White,Stray,Aged,Intact Female,28,2021-03-29 15:20:00,2021-07-17 09:48:00,Adoption,Foster,Spayed Female,120
163381,A831546,Cat,03/01/2021,Domestic Shorthair,Black/White,Stray,Aged,Intact Male,28,2021-03-29 12:40:00,2021-09-18 14:08:00,Adoption,Not Specified,Neutered Male,180


In [87]:
#Sanity checked - no old "neonatal" or young "aged" animals
merged_data = merged_data[(merged_data['Intake Condition'] != 'Neonatal') |
                            (merged_data['Age upon Intake'] < 30)]

merged_data = merged_data[(merged_data['Intake Condition'] != 'Aged') |
                          (merged_data['Age upon Intake'] > 730)]

##### Reproductive status - Pregnant & Nursing

In [88]:
merged_data.loc[(merged_data['Intake Condition'] == 'Pregnant') &
                (merged_data['Sex upon Intake'] != 'Intact Female')]

# There are a fair number of animals listed as pregnant that are male or spayed. 
# Since I can't trust this information, I will drop these ones too. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
34543,A701482,Cat,04/30/2014,Domestic Longhair Mix,Brown Tabby,Stray,Pregnant,Spayed Female,365,2015-04-30 11:50:00,2015-05-01 09:00:00,Transfer,SCRP,Spayed Female,365
34544,A701482,Cat,04/30/2014,Domestic Longhair Mix,Brown Tabby,Stray,Pregnant,Spayed Female,365,2015-04-30 11:50:00,2015-06-23 13:40:00,Transfer,Partner,Spayed Female,365
36678,A703551,Cat,05/26/2015,Domestic Shorthair Mix,Brown Tabby,Stray,Pregnant,Intact Male,0,2015-05-26 13:16:00,2015-05-28 10:05:00,Died,In Kennel,Intact Male,2
62163,A599849,Dog,12/12/2004,Pit Bull,Black/White,Stray,Pregnant,Spayed Female,4015,2016-06-05 18:19:00,2016-06-07 16:10:00,Return to Owner,Not Specified,Spayed Female,4015
64823,A731133,Dog,07/17/2016,Jack Russell Terrier Mix,Sable,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 14:09:00,Transfer,Partner,Intact Male,2
64824,A731134,Dog,07/17/2016,Jack Russell Terrier Mix,Black/Tan,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 14:09:00,Transfer,Partner,Intact Male,2
64826,A731136,Dog,07/17/2016,Jack Russell Terrier Mix,White/Brown,Stray,Pregnant,Intact Male,0,2016-07-17 08:42:00,2016-07-19 14:09:00,Transfer,Partner,Intact Male,2
64828,A731138,Dog,07/17/2016,Jack Russell Terrier Mix,Tricolor,Stray,Pregnant,Intact Male,0,2016-07-17 08:44:00,2016-07-19 14:10:00,Transfer,Partner,Intact Male,2
130953,A793977,Dog,05/02/2017,Australian Kelpie Mix,Brown/White,Stray,Pregnant,Spayed Female,730,2019-05-02 13:04:00,2019-05-07 15:23:00,Adoption,Not Specified,Spayed Female,730
130954,A793977,Dog,05/02/2017,Australian Kelpie Mix,Brown/White,Stray,Pregnant,Spayed Female,730,2019-05-02 13:04:00,2019-10-14 11:46:00,Return to Owner,Not Specified,Spayed Female,730


In [89]:
#Sanity checked - male/spayed female pregnant

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Pregnant') |
                (merged_data['Sex upon Intake'] == 'Intact Female')]

In [90]:
# Lastly, I want to look more closely at the "nursing" intake condition. These animals might refer to the nursing bitches (and thus should also be female) or to the nursing puppies. 

merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                (merged_data['Age upon Intake'] > 60)].sort_values(by = 'Age upon Intake')

# It looks like the shelter mostly uses "Nursing" to indicate nursing puppies/kittens. 
# HOWEVER, about 10% of the Nursing animals are over 8wks old (when puppies, kittens, and rabbits should be done nursing)
# And some are old enough that these are bitches/queens. There are also a few older males included; I will drop these. 
# I would like to split the young-nursing animals into another group (as being a cute puppy likely increases your odds of adoption)
# and keep the older-nursing females in a separate category (as being an older animal who has nursed recently might also change your odds of adoption).

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
8024,A673721,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:20:00,Transfer,Partner,Intact Female,120
133863,A796707,Cat,02/04/2019,Domestic Shorthair,Brown Tabby,Stray,Nursing,Unknown,90,2019-06-04 15:32:00,2019-06-04 18:35:00,Transfer,Partner,Unknown,90
8023,A673722,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:21:00,Transfer,Partner,Intact Female,120
8022,A673723,Other,11/01/2013,Rabbit Sh Mix,Blue/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:21:00,Transfer,Partner,Intact Female,120
8021,A673720,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 16:19:00,Transfer,Partner,Intact Female,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140985,A803389,Dog,09/05/2012,Doberman Pinsch,Brown,Stray,Nursing,Intact Female,2190,2019-08-31 13:41:00,2019-09-06 16:49:00,Adoption,Not Specified,Spayed Female,2555
164984,A775493,Dog,07/03/2012,Chihuahua Shorthair Mix,Tan/White,Stray,Nursing,Spayed Female,2920,2021-05-11 08:12:00,2021-05-11 12:38:00,Transfer,Partner,Spayed Female,2920
71981,A737659,Dog,11/02/2008,Chihuahua Shorthair Mix,Black,Stray,Nursing,Neutered Male,2920,2016-11-02 11:38:00,2017-01-08 14:59:00,Adoption,Not Specified,Neutered Male,2920
73755,A739289,Cat,11/30/2007,Domestic Shorthair Mix,Blue,Stray,Nursing,Spayed Female,3285,2016-11-30 12:38:00,2017-01-15 14:25:00,Adoption,Not Specified,Spayed Female,3285


In [91]:
# Animals younger than 60 days (aka 8.5 weeks) will now have the Intake Condition "Nursing Juvenile"

young_nursing = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                (merged_data['Age upon Intake'] <= 60)].index
merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'
merged_data.loc[(merged_data['Intake Condition'] == 'Nursing Juvenile') &
                                (merged_data['Age upon Intake'] <= 60)]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
39,A664285,Cat,10/01/2013,Domestic Shorthair Mix,Black,Stray,Nursing Juvenile,Intact Male,0,2013-10-01 15:20:00,2013-10-02 11:03:00,Died,In Foster,Intact Male,1
40,A664284,Cat,10/01/2013,Domestic Shorthair Mix,Black/White,Stray,Nursing Juvenile,Intact Male,0,2013-10-01 15:20:00,2013-10-02 11:02:00,Died,In Foster,Intact Male,1
94,A664346,Cat,09/17/2013,Domestic Shorthair Mix,Black Tabby/White,Stray,Nursing Juvenile,Intact Male,14,2013-10-02 12:21:00,2013-10-02 14:27:00,Transfer,Partner,Intact Male,14
95,A664347,Cat,09/17/2013,Domestic Shorthair Mix,Blue Tabby,Stray,Nursing Juvenile,Intact Male,14,2013-10-02 12:21:00,2013-10-02 14:28:00,Transfer,Partner,Intact Male,14
96,A664344,Cat,09/17/2013,Domestic Shorthair Mix,White/Blue,Stray,Nursing Juvenile,Intact Male,14,2013-10-02 12:21:00,2013-10-02 14:25:00,Transfer,Partner,Intact Male,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169686,A840467,Cat,08/02/2021,Domestic Shorthair,Cream/White,Abandoned,Nursing Juvenile,Intact Male,7,2021-08-09 10:41:00,2021-08-09 15:00:00,Transfer,Partner,Intact Male,7
169687,A840468,Cat,08/02/2021,Domestic Shorthair,Cream,Abandoned,Nursing Juvenile,Intact Male,7,2021-08-09 10:41:00,2021-08-09 15:00:00,Transfer,Partner,Intact Male,7
169688,A840470,Cat,08/02/2021,Domestic Shorthair,Orange,Abandoned,Nursing Juvenile,Intact Female,7,2021-08-09 10:41:00,2021-08-09 15:00:00,Transfer,Partner,Intact Female,7
169689,A840471,Cat,08/02/2021,Domestic Shorthair,Orange,Abandoned,Nursing Juvenile,Intact Male,7,2021-08-09 10:41:00,2021-08-09 15:00:00,Transfer,Partner,Intact Male,7


In [92]:
# Female animals older than 4 months (the youngest age at which a female could be nursing for dogs/cats/rabbits/guinea pigs, which are the only species represented as nursing in this data)

nursing_adults = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                 (merged_data['Age upon Intake'] > 120) &
                                 ((merged_data['Sex upon Intake'] == 'Intact Female') |
                                 (merged_data['Sex upon Intake'] == 'Spayed Female'))].index
merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'
merged_data['Intake Condition'].unique()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

array(['Normal', 'Injured', 'Nursing Juvenile', 'Nursing Adult', 'Sick',
       'Other', 'Pregnant', 'Aged', 'Feral', 'Nursing', 'Medical',
       'Behavior', 'Unknown', 'Neonatal', 'Space', 'Panleuk',
       'Med Urgent', 'Med Attn', 'Agonal', 'Neurologic', 'Parvo',
       'Congenital'], dtype=object)

In [93]:
# Any animals lefts in the 'Nursing' Intake condition are now either male adults or age-ambiguous enough that I can't tell whether they are the mothers or the juveniles. These will be dropped.

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Nursing')]
merged_data['Intake Condition'].unique()

array(['Normal', 'Injured', 'Nursing Juvenile', 'Nursing Adult', 'Sick',
       'Other', 'Pregnant', 'Aged', 'Feral', 'Medical', 'Behavior',
       'Unknown', 'Neonatal', 'Space', 'Panleuk', 'Med Urgent',
       'Med Attn', 'Agonal', 'Neurologic', 'Parvo', 'Congenital'],
      dtype=object)

#### Ease of Use - Splitting Sex & Reproductive Status

In [94]:
# The animal industry often refers to Sex and Reproductive Status in one phrase (ex. Intact Female), so it is stored as 
# one column in this data set. However, there are actually two bits of information stored in that phrase. I would prefer
# to split this over two features so we can more easily ask questions about sex and reproductive status independently.
# Sex would then be a static categorical over the pet's life (so no 'upon intake'/'upon outcome' separator necessary). 
# Since a pet might come to the shelter intact and leave as neutered, I will keep the 'upon intake'/'upon outcome' separator
# for reproductive status. I will use "Altered" in place of "spayed" or "neutered" to consolidate to a sex-neutral term.

merged_data['Sex upon Intake'].unique()

array(['Spayed Female', 'Unknown', 'Intact Female', 'Intact Male',
       'Neutered Male'], dtype=object)

In [95]:
def split_sex_and_repro(row):
    sex_upon_intake = row['Sex upon Intake']
    sex_upon_outcome = row['Sex upon Outcome']
    
    # Define a dictionary mapping intake/outcome statuses to sex and reproductive status
    status_mapping = {
        'Spayed Female': {'sex': 'Female', 'repro_status': 'Altered'},
        'Intact Female': {'sex': 'Female', 'repro_status': 'Intact'},
        'Neutered Male': {'sex': 'Male', 'repro_status': 'Altered'},
        'Intact Male': {'sex': 'Male', 'repro_status': 'Intact'},
    }
    
    # Determine sex based on intake status
    sex_info = status_mapping.get(sex_upon_intake, {'sex': 'Unknown', 'repro_status': 'Unknown'})
    sex = sex_info['sex']
    repro_status_in = sex_info['repro_status']
    
    # Determine reproductive status based on outcome status, keeping sex unchanged
    repro_status_out = status_mapping.get(sex_upon_outcome, {'sex': 'Unknown', 'repro_status': 'Unknown'}).get('repro_status', 'Unknown')
        
    return sex, repro_status_in, repro_status_out

In [96]:
merged_data['Sex'], merged_data['Intake Reproductive Status'], merged_data['Outcome Reproductive Status'] = zip(*merged_data.apply(split_sex_and_repro, axis=1))


In [97]:
merged_data = merged_data.drop(['Sex upon Intake', 'Sex upon Outcome'], axis = 1)
merged_data

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Age upon Outcome,Sex,Intake Reproductive Status,Outcome Reproductive Status
0,A521520,Dog,09/07/2006,Border Terrier/Border Collie,White/Tan,Stray,Normal,2555,2013-10-01 07:51:00,2013-10-01 15:39:00,Return to Owner,Not Specified,2555,Female,Altered,Altered
1,A664235,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,2013-10-01 08:33:00,2013-10-01 10:39:00,Transfer,Partner,7,Unknown,Unknown,Unknown
2,A664237,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,7,Unknown,Unknown,Unknown
3,A664236,Cat,09/24/2013,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,7,Unknown,Unknown,Unknown
4,A664233,Dog,09/30/2010,Pit Bull Mix,Blue/White,Stray,Injured,1095,2013-10-01 08:53:00,2013-10-01 15:33:00,Euthanasia,Suffering,1095,Female,Intact,Intact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209639,A908372,Cat,06/25/2024,Domestic Shorthair Mix,Blue,Stray,Normal,3,2024-06-28 16:29:00,2024-06-28 18:44:00,Transfer,Partner,3,Male,Intact,Intact
209640,A908375,Cat,06/25/2024,Domestic Shorthair Mix,Blue,Stray,Normal,3,2024-06-28 16:29:00,2024-06-28 18:45:00,Transfer,Partner,3,Female,Intact,Intact
209641,A908376,Cat,06/25/2024,Domestic Shorthair Mix,Black,Stray,Normal,3,2024-06-28 16:29:00,2024-06-28 18:45:00,Transfer,Partner,3,Male,Intact,Intact
209642,A908374,Cat,06/25/2024,Domestic Shorthair Mix,Blue,Stray,Normal,3,2024-06-28 16:29:00,2024-06-28 18:44:00,Transfer,Partner,3,Male,Intact,Intact


In [98]:
merged_data = merged_data.rename({'Age upon Intake': 'Intake Age', 'Age upon Outcome':'Outcome Age'}, axis='columns')
merged_data = merged_data.reindex(columns = ['Animal ID', 'Animal Type', 'Date of Birth', 'Sex', 'Breed', 'Color', 'Intake Type', 'Intake Condition', 'Intake Age', 'Intake Reproductive Status', 'Intake DateTime', 'Outcome DateTime', 'Outcome Type', 'Outcome Subtype', 'Outcome Age', 'Outcome Reproductive Status'])
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Sex,Breed,Color,Intake Type,Intake Condition,Intake Age,Intake Reproductive Status,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Outcome Age,Outcome Reproductive Status
0,A521520,Dog,09/07/2006,Female,Border Terrier/Border Collie,White/Tan,Stray,Normal,2555,Altered,2013-10-01 07:51:00,2013-10-01 15:39:00,Return to Owner,Not Specified,2555,Altered
1,A664235,Cat,09/24/2013,Unknown,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,Unknown,2013-10-01 08:33:00,2013-10-01 10:39:00,Transfer,Partner,7,Unknown
2,A664237,Cat,09/24/2013,Unknown,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,Unknown,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,7,Unknown
3,A664236,Cat,09/24/2013,Unknown,Domestic Shorthair Mix,Orange/White,Stray,Normal,7,Unknown,2013-10-01 08:33:00,2013-10-01 10:44:00,Transfer,Partner,7,Unknown
4,A664233,Dog,09/30/2010,Female,Pit Bull Mix,Blue/White,Stray,Injured,1095,Intact,2013-10-01 08:53:00,2013-10-01 15:33:00,Euthanasia,Suffering,1095,Intact


In [99]:
# At this point, I think my data is cleaned enough and complete enough to be used to answer a wide range of questions about this shelter animal population. 
# I'm going to end off here, so I can start my data exploration and analysis in a fresh notebook!