In [16]:
#Not sure exactly which packages I will likely need, but probably most of these
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans

### Import and initial description of intake data

In [17]:
#read both the intake and outcome data files
intake_data_raw = pd.read_csv('Austin_Animal_Center_Intakes_20240701.csv')
outcome_data_raw = pd.read_csv('Austin_Animal_Center_Outcomes_20240701.csv')


In [18]:
intake_data_raw.head()
intake_length = len(intake_data_raw)
#intake_length #164614
#Name and Found Location are potentially sensitive information, so I will drop these. 
#Animal ID is also potentially sensitive information, but I can use it to join the intake and outcome data sets, so I will keep it for now. It should not appear in the final data set.
#MonthYear and DateTime are redundant, so I will drop MonthYear since DateTime should be easy to convert from str to timestamp or other numerical. 

In [19]:
outcome_length = len(outcome_data_raw)
outcome_data_raw.head()
#outcome_length #164228
#Name is again potentially sensitive. The Animal ID will be used for merging.
#MonthYear and DateTime are again redundant.
#The outcome data is shorter than the intake data, likely because some pets are actively in the shelter (so they have intake but no outcome yet)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


### Duplicate Detection & Removal

In [20]:
intake_data_raw['Animal ID'].value_counts()
#Animal ID is not unique because a single animal may go through the shelter system multiple times

Animal ID
A721033    33
A718223    14
A718877    12
A705625    11
A706536    11
           ..
A714773     1
A765646     1
A732323     1
A698657     1
A855904     1
Name: count, Length: 147862, dtype: int64

In [21]:
#Looking more closely at the most frequently intaken pet, we see these are not duplicate data, but actually 33 unique visits to the shelter. 
#The other pets with multiple entries of the same Animal ID tell similar stories. 
intake_data_raw.loc[(intake_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
118002,A721033,Lil Bit,01/09/2017 02:26:00 PM,January 2017,6210 E Ben White Blvd in Austin (TX),Stray,Injured,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
125351,A721033,Lil Bit,01/26/2017 06:55:00 AM,January 2017,901 W Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
48856,A721033,Lil Bit,01/30/2017 11:05:00 PM,January 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
110181,A721033,Lil Bit,02/06/2017 10:13:00 AM,February 2017,6210 E Ben White in Austin (TX),Public Assist,Normal,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
30692,A721033,Lil Bit,02/12/2019 10:21:00 AM,February 2019,1936 East Oltorf Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
69901,A721033,Lil Bit,02/16/2019 10:30:00 AM,February 2019,1135 Airport Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
102945,A721033,Lil Bit,02/20/2016 10:44:00 AM,February 2016,2508 E Riverside Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
150333,A721033,Lil Bit,02/22/2018 10:28:00 AM,February 2018,6400 Ben White Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
5518,A721033,Lil Bit,02/24/2019 09:53:00 PM,February 2019,700 Allen St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
13100,A721033,Lil Bit,03/07/2018 08:27:00 AM,March 2018,4111 South 1St in Austin (TX),Public Assist,Normal,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [22]:
outcome_data_raw.loc[(outcome_data_raw['Animal ID'] == 'A721033')].sort_values(by = 'DateTime')
#Because these frequently returning animals do not represent the "typical" intake-outcome cycle, I considered dropping pets with more than 3-5 visits. 
#But ultimately, a pet being returned to its owner is considered a success. And I'm curious how many of these repeat offenders make up the "return to owner" outcome type. 
#So I will leave them in for now

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
10976,A721033,Lil Bit,01/10/2017 04:20:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
42029,A721033,Lil Bit,01/28/2017 03:22:00 PM,Jan 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
83153,A721033,Lil Bit,02/02/2017 11:19:00 AM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
67815,A721033,Lil Bit,02/07/2017 05:26:00 PM,Feb 2017,05/20/2015,Return to Owner,,Dog,Neutered Male,1 year,Rat Terrier Mix,Tricolor/Brown Brindle
136111,A721033,Lil Bit,02/12/2019 03:20:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
37003,A721033,Lil Bit,02/18/2019 04:46:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
74377,A721033,Lil Bit,02/20/2016 04:18:00 PM,Feb 2016,05/20/2015,Return to Owner,,Dog,Neutered Male,9 months,Rat Terrier Mix,Tricolor/Brown Brindle
90560,A721033,Lil Bit,02/23/2018 01:06:00 PM,Feb 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle
51640,A721033,Lil Bit,02/26/2019 07:00:00 PM,Feb 2019,05/20/2015,Return to Owner,,Dog,Neutered Male,3 years,Rat Terrier Mix,Tricolor/Brown Brindle
108849,A721033,Lil Bit,03/08/2018 03:04:00 PM,Mar 2018,05/20/2015,Return to Owner,,Dog,Neutered Male,2 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [23]:
#To merge the intake and outcome datasets while keeping the multiple-visit pets in the set, I will group by Animal ID to assign each visit a count, combine the Animal ID with the count, and merge along this key
#First I want to sort the datasets by DateTime, which means my current DateTime str needs to be convert to a numerical. 
intake_data_raw['DateTime'] = pd.to_datetime(intake_data_raw['DateTime'], format = '%m/%d/%Y %H:%M:%S %p')
outcome_data_raw['DateTime'] = pd.to_datetime(outcome_data_raw['DateTime'], format = '%m/%d/%Y %H:%M:%S %p')

In [24]:
#Sorting the datasets by DateTime and dropping rows that have the same Animal ID & DateTime (ie. true duplicate entries)
intake_data_raw.sort_values(by = 'DateTime')
intake_data_raw['Duplicated?'] = intake_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

outcome_data_raw.sort_values(by = 'DateTime')
outcome_data_raw['Duplicated?'] = outcome_data_raw.duplicated(subset=['Animal ID', 'DateTime'])

In [36]:
#Spotchecking that duplicates that were flagged are true duplicates and not repeat visits and checking the length of each to see how many were dropped. 
#It worked =)

#outcome_data_raw.loc[outcome_data_raw['Duplicated?'] == True] #25 count
#intake_data_raw.loc[intake_data_raw['Duplicated?'] == True] #37 count
#intake_data_raw.loc[intake_data_raw['Animal ID'] == 'A727043']

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Duplicated?
4900,A830075,Waffle,2021-03-02 01:35:00,March 2021,Onion Creek And Pleasant Valley in Austin (TX),Stray,Normal,Dog,Intact Male,1 year,Chihuahua Shorthair Mix,Tan,True
7022,A696688,Mari,2015-02-10 11:00:00,February 2015,Austin (TX),Owner Surrender,Normal,Cat,Spayed Female,3 years,Domestic Shorthair Mix,Calico/White,True
9206,A727043,*Larry,2016-05-17 03:46:00,May 2016,9515 N Lamar Blvd in Austin (TX),Stray,Normal,Cat,Intact Male,1 year,Domestic Shorthair Mix,Blue/White,True
13264,A866229,*Jones,2022-09-26 03:17:00,September 2022,9301 Hog Eye Rd in Austin (TX),Stray,Normal,Cat,Intact Male,1 month,Domestic Shorthair,Orange Tabby,True
13977,A761936,,2017-11-12 11:16:00,November 2017,8400 Old Bee Caves in Austin (TX),Stray,Normal,Dog,Intact Male,9 months,Scottish Terrier Mix,Brown Brindle,True
15336,A863452,Amber,2022-08-31 11:55:00,August 2022,4200 Brookview in Austin (TX),Stray,Normal,Dog,Intact Female,10 months,German Shepherd,White,True
18221,A561806,Dasia,2017-06-05 11:36:00,June 2017,2002 Nightview in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,Pit Bull Mix,Brown Brindle/White,True
30514,A773428,*Atreyu,2018-05-31 06:38:00,May 2018,7Th And Gonzalez in Austin (TX),Stray,Normal,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Orange Tabby,True
49996,A815987,Princess,2020-04-09 03:32:00,April 2020,Austin (TX),Public Assist,Normal,Dog,Intact Female,1 year,Cairn Terrier,White/Brown,True
52864,A876931,,2023-03-22 06:14:00,March 2023,15 Waller Street in Austin (TX),Wildlife,Normal,Other,Unknown,1 year,Bat,Brown,True


In [41]:
outcome_data_raw = outcome_data_raw.loc[outcome_data_raw['Duplicated?'] != True]
#len(outcome_data_raw) #164203 length

intake_data_raw = intake_data_raw.loc[intake_data_raw['Duplicated?'] != True]
#len(intake_data_raw) #164577 length

164577

### Merging intake & outcome data sets

In [42]:
#Group both datasets by Animal ID & use the cumulative count to organize repeat visits 
intake_data_raw['Visit Count'] = intake_data_raw.groupby(by = 'Animal ID').cumcount() +1
outcome_data_raw['Visit Count'] = outcome_data_raw.groupby(by = 'Animal ID').cumcount() +1

In [53]:
# Merge the intake and outcome data sets based on Animal ID and Visit Count
merged_data_raw = pd.merge(intake_data_raw, outcome_data_raw, on = ['Animal ID', 'Visit Count'])
len(outcome_data_raw) - len(merged_data_raw)
#len(merged_data_raw) #163263
#I expected the outcome data length to be longer than the merged data length because some pets will not have both intake and outcome data, and so would not be included in the merged set

940

In [54]:
#When I examine the animals that did not make it into the merged set from the outcome data, most of them are from the time the shelter first started recording data (Oct 1, 2013). 
#Pets admitted prior to this date, would not have intake data to match with their outcome data, so they cannot be included. 
#There are a few pets from later on that are also missing intake data for some reason. None of these pets have multiple visits, so they will not mess up the merged data set. 

outcome_data_not_merged = outcome_data_raw[~outcome_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
outcome_data_not_merged.loc[outcome_data_not_merged['Visit Count'] > 1]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Duplicated?,Visit Count


In [67]:
#When I examine the animals that did not make it into the merged set from the intake data, almost all of them (1116/1189) are from 2024, so there is a good chance, these pets do not yet have outcomes.
#Of the smaller number of animals that predate 2024, none of them have outcome data. They can be considered "lost" and excluded from the data set

intake_data_not_merged = intake_data_raw[~intake_data_raw['Animal ID'].isin(merged_data_raw['Animal ID'])]
#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] >= '2024'].sort_values(by = 'DateTime')
#intake_data_not_merged.loc[intake_data_not_merged['DateTime'] < '2023'].sort_values(by = 'DateTime')
#intake_data_not_merged.loc[intake_data_not_merged['Visit Count'] > 1]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Duplicated?,Visit Count
5597,A636629,,2014-09-07 04:31:00,September 2014,706 Terrace Mountain Drive in Austin (TX),Stray,Normal,Cat,Spayed Female,4 years,Domestic Shorthair Mix,Tortie,False,1
33381,A645513,,2015-05-18 07:30:00,May 2015,3001 Sauls in Austin (TX),Stray,Injured,Cat,Neutered Male,6 years,Domestic Longhair Mix,Cream Tabby/White,False,1
157212,A840685,*Tajin,2021-08-12 09:19:00,August 2021,1305 Walnut Avenue in Austin (TX),Stray,Normal,Dog,Intact Male,5 years,Pit Bull/Golden Retriever,White/Black,False,1
41482,A843014,*Mighty Mijo,2021-09-25 08:19:00,September 2021,4434 Frontier Trail in Austin (TX),Stray,Injured,Dog,Intact Male,2 years,Pit Bull,Brown/White,False,1
40691,A843470,Pecho Blanco,2021-10-01 12:13:00,October 2021,Manor (TX),Owner Surrender,Normal,Dog,Neutered Male,10 years,Labrador Retriever Mix,Black,False,1
25264,A846230,*Brody,2021-11-12 08:52:00,November 2021,5501 Ross Rd in Austin (TX),Public Assist,Normal,Dog,Intact Male,2 years,Pit Bull,Brown Brindle,False,1
156359,A851142,*Hotch,2022-02-07 12:46:00,February 2022,12813 Wood Lily Trail in Travis (TX),Stray,Normal,Dog,Intact Male,2 years,American Pit Bull Terrier,Black/White,False,1
55367,A852953,*Fletcher,2022-03-10 11:55:00,March 2022,Long Bow And Sherwood in Austin (TX),Stray,Normal,Dog,Intact Male,2 years,Pit Bull,Black/White,False,1
160722,A853481,*Lovebird,2022-03-19 12:05:00,March 2022,2201 East Ben White in Austin (TX),Stray,Normal,Dog,Intact Female,1 year,American Pit Bull Terrier/Labrador Retriever,Black/White,False,1
55962,A859548,*Thaddeus,2022-06-15 12:01:00,June 2022,460 Bastrop Hwy in Austin (TX),Stray,Normal,Dog,Intact Male,2 years,Pit Bull,Blue,False,1


### Cleaning the merged data set - Nulls

In [73]:
merged_data_raw.columns
# Looking at the columns of the merged data set, there are several columns that should be identical (ex. Breed_x & Breed_y),
# some columns are redundant (monthyear), and some are not helpful (Duplicated?)

Index(['Animal ID', 'Name_x', 'DateTime_x', 'MonthYear_x', 'Found Location',
       'Intake Type', 'Intake Condition', 'Animal Type_x', 'Sex upon Intake',
       'Age upon Intake', 'Breed_x', 'Color_x', 'Duplicated?_x', 'Visit Count',
       'Name_y', 'DateTime_y', 'MonthYear_y', 'Date of Birth', 'Outcome Type',
       'Outcome Subtype', 'Animal Type_y', 'Sex upon Outcome',
       'Age upon Outcome', 'Breed_y', 'Color_y', 'Duplicated?_y'],
      dtype='object')

In [90]:
#First I will make sure that the columns that should be identical actually are.

# merged_data_raw.loc[(merged_data_raw['Breed_x'] == merged_data_raw['Breed_y']) == False] 
# merged_data_raw.loc[(merged_data_raw['Animal Type_x'] == merged_data_raw['Animal Type_y']) == False]
merged_data_raw.loc[(merged_data_raw['Color_x'] == merged_data_raw['Color_y']) == False]

Unnamed: 0,Animal ID,Name_x,DateTime_x,MonthYear_x,Found Location,Intake Type,Intake Condition,Animal Type_x,Sex upon Intake,Age upon Intake,...,MonthYear_y,Date of Birth,Outcome Type,Outcome Subtype,Animal Type_y,Sex upon Outcome,Age upon Outcome,Breed_y,Color_y,Duplicated?_y


In [91]:
# Since all of those duplicate columns matched like they should, I can drop the duplicates without issue. 
# I will also drop the sensitive, redundant, and not helpful data. 

merged_data = merged_data_raw.drop(['Name_x', 'MonthYear_x', 'Found Location', 'Duplicated?_x', 'Name_y', 'MonthYear_y', 'Animal Type_y', 'Breed_y', 'Color_y', 'Duplicated?_y'], axis = 1)
merged_data.head()

Unnamed: 0,Animal ID,DateTime_x,Intake Type,Intake Condition,Animal Type_x,Sex upon Intake,Age upon Intake,Breed_x,Color_x,Visit Count,DateTime_y,Date of Birth,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,2019-01-03 04:19:00,Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,1,2019-01-08 03:11:00,01/03/2017,Transfer,Partner,Neutered Male,2 years
1,A706918,2015-07-05 12:59:00,Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,1,2015-07-05 03:13:00,07/05/2007,Return to Owner,,Spayed Female,8 years
2,A724273,2016-04-14 06:43:00,Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,1,2016-04-21 05:17:00,04/17/2015,Return to Owner,,Neutered Male,1 year
3,A857105,2022-05-12 12:23:00,Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby,1,2022-05-12 02:35:00,05/12/2020,Transfer,Partner,Neutered Male,2 years
4,A682524,2014-06-29 10:38:00,Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,1,2014-07-02 02:16:00,06/29/2010,Return to Owner,,Neutered Male,4 years


In [101]:
# And renaming some columns to improve organization. 
merged_data = merged_data.rename({'DateTime_x': 'Intake DateTime', 'Animal Type_x': 'Animal Type', 'Breed_x': 'Breed', 'Color_x':'Color', 'DateTime_y': 'Outcome DateTime'}, axis='columns')
merged_data = merged_data.reindex(columns = ['Animal ID', 'Animal Type', 'Date of Birth', 'Breed', 'Color', 'Intake Type', 'Intake Condition', 'Sex upon Intake', 'Age upon Intake', 'Intake DateTime', 'Outcome DateTime', 'Outcome Type', 'Outcome Subtype', 'Sex upon Outcome', 'Age upon Outcome'])
merged_data.head()


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,2 years,2019-01-03 04:19:00,2019-01-08 03:11:00,Transfer,Partner,Neutered Male,2 years
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,8 years,2015-07-05 12:59:00,2015-07-05 03:13:00,Return to Owner,,Spayed Female,8 years
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,11 months,2016-04-14 06:43:00,2016-04-21 05:17:00,Return to Owner,,Neutered Male,1 year
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,2 years,2022-05-12 12:23:00,2022-05-12 02:35:00,Transfer,Partner,Neutered Male,2 years
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,4 years,2014-06-29 10:38:00,2014-07-02 02:16:00,Return to Owner,,Neutered Male,4 years


In [102]:
merged_data.describe(include = 'all')
# The data set contains over 160k animals. All of this information could potentially effect outcome (adoption vs other).
# There is still some redundant information between "age upon intake," "Date of Birth," and "age upon outcome," but we can address this later

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
count,163263,163263,163263,163263,163263,163263,163263,163261,163262,163263,163263,163226,74974,163261,163253
unique,146673,5,8378,2936,648,6,20,5,55,,,11,26,5,55
top,A721033,Dog,05/01/2016,Domestic Shorthair Mix,Black/White,Stray,Normal,Intact Male,1 year,,,Adoption,Partner,Neutered Male,1 year
freq,33,89841,121,33443,16987,111908,139064,55043,26855,,,77874,38548,57065,27309
mean,,,,,,,,,,2018-07-13 17:37:30.555116800,2018-08-02 18:15:03.282923776,,,,
min,,,,,,,,,,2013-10-01 01:12:00,2013-10-01 01:00:00,,,,
25%,,,,,,,,,,2015-12-11 09:18:30,2016-01-02 02:18:00,,,,
50%,,,,,,,,,,2018-04-30 11:41:00,2018-05-17 12:26:00,,,,
75%,,,,,,,,,,2020-12-02 02:56:30,2021-01-04 08:35:30,,,,
max,,,,,,,,,,2024-06-30 01:24:00,2024-07-01 11:15:00,,,,


In [103]:
merged_data.isnull().sum()
# There are a number of null values throughout the data set, 
# The "outcome subtype" is the largest cluster because not every Outcome Type needs to be further described by a Subtype
# Options for handling the subtype nulls include: 
#      marking the nulls as "None" or similar placeholder, 
#      dropping the subtype variable altogether, 
#      combining the type-subtype data into one variable. 
# For now, I will mark the Nan Subtypes as None, but I might circle back to this later. 

Animal ID               0
Animal Type             0
Date of Birth           0
Breed                   0
Color                   0
Intake Type             0
Intake Condition        0
Sex upon Intake         2
Age upon Intake         1
Intake DateTime         0
Outcome DateTime        0
Outcome Type           37
Outcome Subtype     88289
Sex upon Outcome        2
Age upon Outcome       10
dtype: int64

In [139]:
merged_data['Outcome Subtype'] = merged_data['Outcome Subtype'].fillna(value = 'None')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data['Outcome Subtype'] = merged_data['Outcome Subtype'].fillna(value = 'None')


In [107]:
merged_data.loc[merged_data['Outcome Type'].isnull() == True]

# There are a few pets that have no recorded Outcome type, and these can't help answer my questions about what gets pets adopted, so I will drop them. 

merged_data = merged_data.loc[merged_data['Outcome Type'].isnull() != True]

In [114]:
merged_data = merged_data.loc[merged_data['Sex upon Outcome'].isnull() != True]

In [136]:
merged_data.loc[merged_data['Age upon Outcome'].isnull() == True]

# There are a few pets that have no recorded age or sex upon outcome. 
# For a couple of these, they have an "age upon intake," so "age upon outcome" 
# can easily be calculated and filled in, but because my data set is large, I'm going to drop these. 

merged_data = merged_data.dropna(subset = ['Age upon Outcome'], axis = 0)

#merged_data = merged_data.loc[merged_data['Age upon Outcome'].isnull() != True &
#                             merged_data['Sex upon Outcome'].isnull() !=]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_data['Age upon Outcome'][162151] = '1 year'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data[

In [140]:
merged_data.isnull().sum()

Animal ID           0
Animal Type         0
Date of Birth       0
Breed               0
Color               0
Intake Type         0
Intake Condition    0
Sex upon Intake     0
Age upon Intake     0
Intake DateTime     0
Outcome DateTime    0
Outcome Type        0
Outcome Subtype     0
Sex upon Outcome    0
Age upon Outcome    0
dtype: int64

In [153]:
# Since I hope this data analysis can be repeated on updated data in the future, I put the above
# null management sequence into a method that can go in a module

def drop_null_data(df):
    #identifies null data in provided data frame, drops it, and reports columns in which the data were null

    #generates a Series with index = df's columns, values = number of nulls per column
    drops = df.isnull().sum()

    #if there are no nulls in the df, print & return original df
    if drops.sum() == 0:
        print("No null values identified")
            
    #if there are nulls in the df, drop them all and print which columns contained nulls and how many were dropped; returns df without nulls    
    else:
        df = df.dropna(axis = 0)
        for index, value in drops.items():
            if value != 0:
                print('"{}" contained {} null value(s)'.format(index, value))
    return df

### Cleaning the merged data set - Sanity Checks

##### Ages - Neonates & Aged

In [144]:
# The data contains a small number of animals that have ages listed as negative numbers. 
# They are few enough that I could drop them, but I prefer to fix them. 

merged_data['Age upon Intake'].unique()

array(['2 years', '8 years', '11 months', '4 years', '4 months',
       '6 years', '6 months', '4 weeks', '5 months', '14 years',
       '1 month', '2 months', '18 years', '3 years', '1 year', '4 days',
       '9 years', '2 weeks', '15 years', '1 day', '5 years', '3 weeks',
       '9 months', '8 months', '6 days', '7 years', '12 years', '1 week',
       '10 years', '7 months', '3 months', '10 months', '1 weeks',
       '5 days', '2 days', '0 years', '11 years', '17 years', '3 days',
       '13 years', '5 weeks', '19 years', '16 years', '20 years',
       '-1 years', '22 years', '28 years', '23 years', '30 years',
       '-2 years', '21 years', '-3 years', '25 years', '24 years',
       '-4 years'], dtype=object)

In [147]:
# This method will use regex to convert the ages from strings to ints
# I leave the ages in days because the computer doesn't need years/months/weeks/days to draw conclusions, 
# but for data visualization later, they will need to be converted to more meaningful numbers.

import re

year_pattern = re.compile(r"(-?\d+)\s*(?:year|years)")
month_pattern = re.compile(r"(-?\d+)\s*(?:month|months)")
week_pattern = re.compile(r"(-?\d+)\s*(?:week|weeks)")
day_pattern = re.compile(r"(-?\d+)\s*(?:day|days)")

# Function to convert strings to days
def convert_to_days(age_str):
    if "year" in age_str:
        value = re.search(year_pattern, age_str).group(1)
        return int(value) * 365
    elif "month" in age_str:
        value = re.search(month_pattern, age_str).group(1)
        return int(value) * 30 
    elif "week" in age_str:
        value = re.search(week_pattern, age_str).group(1)
        return int(value) * 7
    elif "day" in age_str:
        value = re.search(day_pattern, age_str).group(1)
        return int(value)
    else:
        print(f"Unknown pattern: {age_str}; -1 days reported")
        return int(-1)

In [146]:
merged_data['Age upon Intake'] = merged_data['Age upon Intake'].apply(convert_to_days)
merged_data['Age upon Outcome'] = merged_data['Age upon Outcome'].apply(convert_to_days)
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,730,2019-01-03 04:19:00,2019-01-08 03:11:00,Transfer,Partner,Neutered Male,730
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,2920,2015-07-05 12:59:00,2015-07-05 03:13:00,Return to Owner,,Spayed Female,2920
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,330,2016-04-14 06:43:00,2016-04-21 05:17:00,Return to Owner,,Neutered Male,365
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,730,2022-05-12 12:23:00,2022-05-12 02:35:00,Transfer,Partner,Neutered Male,730
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,1460,2014-06-29 10:38:00,2014-07-02 02:16:00,Return to Owner,,Neutered Male,1460


In [149]:
# Dropping rows with negative ages for both Age upon Intake & Age upon Outcome

pre = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Intake'] >= 0]
post= len(merged_data)
print(f"Dropped rows due to negative Intake ages: {pre - post}")

pre2 = len(merged_data)
merged_data = merged_data.loc[merged_data['Age upon Outcome'] >= 0]
post2 = len(merged_data)
print(f"Dropped rows due to negative Outcome ages: {pre - post}")

Dropped rows due to negative Intake ages: 0
Dropped rows due to negative Outcome ages: 0


In [150]:
merged_data.head()

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
0,A786884,Dog,01/03/2017,Beagle Mix,Tricolor,Stray,Normal,Neutered Male,730,2019-01-03 04:19:00,2019-01-08 03:11:00,Transfer,Partner,Neutered Male,730
1,A706918,Dog,07/05/2007,English Springer Spaniel,White/Liver,Stray,Normal,Spayed Female,2920,2015-07-05 12:59:00,2015-07-05 03:13:00,Return to Owner,,Spayed Female,2920
2,A724273,Dog,04/17/2015,Basenji Mix,Sable/White,Stray,Normal,Intact Male,330,2016-04-14 06:43:00,2016-04-21 05:17:00,Return to Owner,,Neutered Male,365
3,A857105,Cat,05/12/2020,Domestic Shorthair,Orange Tabby,Public Assist,Normal,Neutered Male,730,2022-05-12 12:23:00,2022-05-12 02:35:00,Transfer,Partner,Neutered Male,730
4,A682524,Dog,06/29/2010,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,Stray,Normal,Neutered Male,1460,2014-06-29 10:38:00,2014-07-02 02:16:00,Return to Owner,,Neutered Male,1460


In [152]:
merged_data['Intake Condition'].value_counts()

# These are the identifier & counts for the intake condition column. Some are very large & some are only single animals.
# I would ulimately like to consolidate these, but first, I would like to do some checking to make sure they are accurate.
# For example, a neonate should not be an old animal, and a pet described as "aged" should not be young. 
# Pregnant and nursing animals should be female. 

Intake Condition
Normal        135694
Injured         8688
Sick            5524
Nursing         3821
Neonatal        1413
Aged             466
Medical          370
Other            338
Pregnant         148
Feral            139
Behavior          69
Med Attn          52
Unknown           12
Med Urgent        11
Neurologic         9
Parvo              5
Space              4
Agonal             3
Congenital         1
Panleuk            1
Name: count, dtype: int64

In [159]:
merged_data.loc[merged_data['Intake Condition'] == 'Neonatal'].sort_values(by = 'Age upon Intake', ascending = False)

# There are 2 year old animals being called "neonates," so that's not correct. Medically speaking, puppies/kittens are considered neonates when they're younger than 14 days (28 if you're lenient).
# I will drop any animals that are considered "neonatal" with an age upon intake <30 days. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
72672,A841441,Dog,08/27/2019,Queensland Heeler,Blue,Stray,Neonatal,Spayed Female,730,2021-08-27 03:18:00,2021-09-29 06:59:00,Adoption,,Spayed Female,730
162112,A908186,Cat,06/26/2022,Domestic Shorthair,Calico,Stray,Neonatal,Intact Female,730,2024-06-26 11:18:00,2024-06-27 01:29:00,Transfer,Partner,Intact Female,730
63820,A845043,Cat,10/25/2019,Domestic Shorthair,Brown Tabby/White,Public Assist,Neonatal,Unknown,730,2021-10-25 01:06:00,2021-10-27 01:16:00,Return to Owner,,Unknown,730
55242,A845299,Cat,10/27/2019,Domestic Shorthair,Torbie,Abandoned,Neonatal,Intact Female,730,2021-10-27 05:24:00,2021-10-27 05:42:00,Transfer,Partner,Intact Female,730
141172,A843380,Dog,09/30/2019,Chihuahua Shorthair,Tan/White,Public Assist,Neonatal,Intact Female,730,2021-09-30 11:05:00,2022-01-18 03:33:00,Return to Owner,,Intact Female,730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26520,A868023,Cat,10/27/2022,Domestic Shorthair,Brown Tabby,Stray,Neonatal,Intact Male,0,2022-10-27 12:43:00,2022-10-27 03:52:00,Transfer,Partner,Intact Male,0
153074,A891067,Cat,10/16/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Neonatal,Intact Female,0,2023-10-16 12:05:00,2023-10-18 01:55:00,Transfer,Partner,Intact Female,2
153073,A891065,Cat,10/16/2023,Domestic Shorthair,White,Owner Surrender,Neonatal,Intact Female,0,2023-10-16 12:05:00,2023-10-18 01:29:00,Transfer,Partner,Intact Female,2
146354,A837039,Other,06/16/2021,Rabbit Sh,Black,Owner Surrender,Neonatal,Unknown,0,2021-06-16 01:03:00,2021-06-17 01:23:00,Transfer,Partner,Unknown,1


In [167]:
merged_data.loc[(merged_data['Intake Condition'] == 'Aged') & (merged_data['Age upon Intake'] < 1460)].sort_values(by = 'Age upon Intake', ascending = True)

# There is no concrete ages at which an animal becomes "aged," and an animal who has had a hard life can appear "aged" before they are really old. 
# BUT there is no situation in which a 2 day old animal should be called "aged."
# I will be lenient and say any animal under 2yr shouldn't be in the "aged" category. This also allows for exotics, which have shorter lifespans.

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
156375,A887830,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 02:37:00,2023-08-27 04:06:00,Transfer,Partner,Neutered Male,2
156374,A887829,Cat,08/25/2023,Domestic Shorthair,Brown Tabby,Owner Surrender,Aged,Neutered Male,2,2023-08-27 02:37:00,2023-08-27 04:06:00,Transfer,Partner,Neutered Male,2
156376,A887831,Cat,08/25/2023,Domestic Shorthair,Black/White,Owner Surrender,Aged,Neutered Male,2,2023-08-27 02:37:00,2023-08-27 04:06:00,Transfer,Partner,Neutered Male,2
145208,A888924,Cat,09/12/2023,Domestic Shorthair,Black,Stray,Aged,Intact Male,2,2023-09-14 04:36:00,2023-09-14 06:14:00,Transfer,Partner,Intact Male,2
145210,A888923,Cat,09/12/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,2,2023-09-14 04:36:00,2023-09-14 06:14:00,Transfer,Partner,Intact Female,2
143828,A888856,Cat,09/03/2023,Domestic Shorthair,Tortie,Stray,Aged,Intact Female,7,2023-09-13 03:19:00,2023-09-13 04:17:00,Transfer,Partner,Intact Female,7
145212,A888919,Cat,08/24/2023,Domestic Shorthair,Cream Tabby/White,Stray,Aged,Intact Female,21,2023-09-14 05:21:00,2023-09-14 06:14:00,Transfer,Partner,Intact Female,21
147771,A889057,Cat,08/25/2023,Domestic Shorthair,Orange/White,Stray,Aged,Intact Male,21,2023-09-16 03:40:00,2023-09-16 05:26:00,Transfer,Partner,Intact Male,21
17775,A831549,Cat,03/01/2021,Domestic Shorthair,Cream Tabby,Stray,Aged,Intact Male,28,2021-03-29 12:40:00,2021-09-07 11:00:00,Adoption,Foster,Neutered Male,180
54712,A831547,Cat,03/01/2021,Domestic Shorthair,Black/White,Stray,Aged,Intact Male,28,2021-03-29 12:40:00,2021-07-17 09:48:00,Adoption,Foster,Neutered Male,120


In [None]:
#Sanity checked - no old "neonatal" or young "aged" animals
merged_data = merged_data[(merged_data['Intake Condition'] != 'Neonatal') |
                            (merged_data['Age upon Intake'] < 30)]

merged_data = merged_data[(merged_data['Intake Condition'] != 'Aged') |
                          (merged_data['Age upon Intake'] > 730)]

##### Reproductive status - Pregnant & Nursing

In [171]:
merged_data.loc[(merged_data['Intake Condition'] == 'Pregnant') &
                (merged_data['Sex upon Intake'] != 'Intact Female')]

# There are a fair number of animals listed as pregnant that are male or spayed. 
# Since I can't trust this information, I will drop these ones too. 

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
1571,A885138,Cat,07/14/2021,Domestic Shorthair,Brown Tabby/White,Owner Surrender,Pregnant,Spayed Female,730,2023-07-14 07:54:00,2023-08-19 01:27:00,Adoption,,Spayed Female,730
6181,A842769,Dog,10/06/2021,Pit Bull/Siberian Husky,Black/White,Stray,Pregnant,Intact Male,0,2021-09-21 12:40:00,2021-11-29 05:50:00,Adoption,,Neutered Male,30
9187,A701482,Cat,04/30/2014,Domestic Longhair Mix,Brown Tabby,Stray,Pregnant,Spayed Female,365,2015-04-30 11:50:00,2015-05-01 09:00:00,Transfer,SCRP,Spayed Female,365
14285,A832146,Cat,04/08/2016,Domestic Shorthair,Blue Tabby/White,Owner Surrender,Pregnant,Spayed Female,1825,2021-04-08 07:57:00,2021-04-12 03:32:00,Rto-Adopt,,Spayed Female,1825
26326,A731134,Dog,07/17/2016,Jack Russell Terrier Mix,Black/Tan,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 02:09:00,Transfer,Partner,Intact Male,2
28138,A861113,Dog,06/07/2022,Pit Bull,Brown/White,Public Assist,Pregnant,Intact Male,28,2022-07-07 11:17:00,2022-07-12 10:05:00,Disposal,,Intact Male,35
31384,A731133,Dog,07/17/2016,Jack Russell Terrier Mix,Sable,Stray,Pregnant,Intact Male,0,2016-07-17 08:41:00,2016-07-19 02:09:00,Transfer,Partner,Intact Male,2
32694,A842777,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:55:00,Adoption,Foster,Unknown,60
35022,A842775,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:54:00,Adoption,Foster,Unknown,60
36791,A842773,Dog,09/18/2021,Pit Bull,Black,Stray,Pregnant,Unknown,3,2021-09-21 12:40:00,2021-11-27 08:54:00,Adoption,Foster,Unknown,60


In [172]:
#Sanity checked - male/spayed female pregnant

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Pregnant') |
                (merged_data['Sex upon Intake'] == 'Intact Female')]

In [179]:
# Lastly, I want to look more closely at the "nursing" intake condition. These animals might refer to the nursing bitches (and thus should also be female) or to the nursing puppies. 

merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                (merged_data['Age upon Intake'] > 60)].sort_values(by = 'Age upon Intake')

# It looks like the shelter mostly uses "Nursing" to indicate nursing puppies/kittens. 
# HOWEVER, about 10% of the Nursing animals are over 8wks old (when puppies, kittens, and rabbits should be done nursing)
# And some are old enough that these are bitches/queens. There are also a few older males included; I will drop these. 
# I would like to split the young-nursing animals into another group (as being a cute puppy likely increases your odds of adoption)
# and keep the older-nursing females in a separate category (as being an older animal who has nursed recently might also change your odds of adoption).

Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
34991,A796707,Cat,02/04/2019,Domestic Shorthair,Brown Tabby,Stray,Nursing,Unknown,90,2019-06-04 03:32:00,2019-06-04 06:35:00,Transfer,Partner,Unknown,90
113683,A673723,Other,11/01/2013,Rabbit Sh Mix,Blue/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 04:21:00,Transfer,Partner,Intact Female,120
37301,A673720,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 04:19:00,Transfer,Partner,Intact Female,120
92409,A704491,Dog,02/10/2015,Labrador Retriever/Plott Hound,Black,Stray,Nursing,Intact Female,90,2015-06-05 02:05:00,2015-06-05 04:03:00,Transfer,Partner,Intact Female,90
32164,A673721,Other,11/01/2013,Rabbit Sh Mix,Black/White,Public Assist,Nursing,Intact Female,90,2014-03-01 10:12:00,2014-03-05 04:20:00,Transfer,Partner,Intact Female,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110627,A803389,Dog,09/05/2012,Doberman Pinsch,Brown,Stray,Nursing,Intact Female,2190,2019-08-31 01:41:00,2019-09-06 04:49:00,Adoption,,Spayed Female,2555
84862,A737659,Dog,11/02/2008,Chihuahua Shorthair Mix,Black,Stray,Nursing,Neutered Male,2920,2016-11-02 11:38:00,2017-01-08 02:59:00,Adoption,,Neutered Male,2920
81335,A775493,Dog,07/03/2012,Chihuahua Shorthair Mix,Tan/White,Stray,Nursing,Spayed Female,2920,2021-05-11 08:12:00,2018-07-03 04:14:00,Return to Owner,,Spayed Female,2190
59422,A701255,Dog,04/26/2006,Shih Tzu Mix,White,Stray,Nursing,Neutered Male,3285,2015-04-26 01:47:00,2015-05-07 12:00:00,Transfer,Partner,Neutered Male,3285


In [193]:
# Animals younger than 60 days (aka 8.5 weeks) will now have the Intake Condition "Nursing Juvenile"

young_nursing = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                (merged_data['Age upon Intake'] <= 60)].index
merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'
merged_data.loc[(merged_data['Intake Condition'] == 'Nursing Juvenile') &
                                (merged_data['Age upon Intake'] <= 60)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data['Intake Condition'][young_nursing] = 'Nursing Juvenile'


Unnamed: 0,Animal ID,Animal Type,Date of Birth,Breed,Color,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Intake DateTime,Outcome DateTime,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome
50,A701811,Cat,04/20/2015,Domestic Shorthair Mix,Gray Tabby,Stray,Nursing Juvenile,Unknown,14,2015-05-05 07:29:00,2015-05-05 11:45:00,Transfer,Partner,Unknown,14
137,A724640,Cat,04/05/2016,Domestic Shorthair Mix,Brown Tabby/Black,Stray,Nursing Juvenile,Intact Male,14,2016-04-20 08:19:00,2016-04-20 05:48:00,Transfer,Partner,Intact Male,14
149,A800717,Cat,07/18/2019,Domestic Shorthair,Blue/White,Stray,Nursing Juvenile,Intact Female,7,2019-07-25 01:49:00,2019-07-25 06:39:00,Transfer,Partner,Intact Female,7
183,A728810,Cat,05/24/2016,Domestic Shorthair Mix,Blue,Stray,Nursing Juvenile,Intact Male,14,2016-06-08 05:30:00,2016-06-10 03:11:00,Transfer,Partner,Intact Male,14
194,A793946,Cat,04/17/2019,Domestic Shorthair Mix,Blue Tabby,Stray,Nursing Juvenile,Intact Female,14,2019-05-02 08:21:00,2019-05-02 02:57:00,Transfer,Partner,Intact Female,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162991,A804224,Cat,08/21/2019,Domestic Shorthair Mix,Black,Stray,Nursing Juvenile,Intact Male,21,2019-09-11 09:41:00,2019-09-12 11:08:00,Transfer,Partner,Intact Male,21
162992,A812494,Dog,01/15/2020,Labrador Retriever Mix,Black/White,Owner Surrender,Nursing Juvenile,Intact Female,7,2020-01-23 02:56:00,2020-04-07 08:50:00,Adoption,Foster,Spayed Female,60
163130,A782910,Cat,09/24/2018,Domestic Shorthair Mix,Blue Tabby/White,Stray,Nursing Juvenile,Intact Male,28,2018-10-22 06:31:00,2018-12-18 07:54:00,Adoption,Foster,Neutered Male,60
163141,A808754,Cat,10/22/2019,Domestic Shorthair,Black,Stray,Nursing Juvenile,Intact Male,21,2019-11-13 01:18:00,2019-11-13 01:50:00,Transfer,Partner,Intact Male,21


In [215]:
# Female animals older than 4 months (the youngest age at which a female could be nursing for dogs/cats/rabbits/guinea pigs, which are the only species represented as nursing in this data)

nursing_adults = merged_data.loc[(merged_data['Intake Condition'] == 'Nursing') &
                                 (merged_data['Age upon Intake'] > 120) &
                                 ((merged_data['Sex upon Intake'] == 'Intact Female') |
                                 (merged_data['Sex upon Intake'] == 'Spayed Female'))].index
merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'
merged_data['Intake Condition'].unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data['Intake Condition'][nursing_adults] = 'Nursing Adult'


array(['Normal', 'Injured', 'Pregnant', 'Sick', 'Nursing Juvenile',
       'Aged', 'Nursing Adult', 'Unknown', 'Nursing', 'Congenital',
       'Medical', 'Other', 'Neonatal', 'Med Attn', 'Feral', 'Behavior',
       'Med Urgent', 'Space', 'Agonal', 'Neurologic', 'Panleuk', 'Parvo'],
      dtype=object)

In [217]:
# Any animals lefts in the 'Nursing' Intake condition are now either male adults or age-ambiguous enough that I can't tell whether they are the mothers or the juveniles. These will be dropped.

merged_data = merged_data.loc[(merged_data['Intake Condition'] != 'Nursing')]
merged_data['Intake Condition'].unique()

array(['Normal', 'Injured', 'Pregnant', 'Sick', 'Nursing Juvenile',
       'Aged', 'Nursing Adult', 'Unknown', 'Congenital', 'Medical',
       'Other', 'Neonatal', 'Med Attn', 'Feral', 'Behavior', 'Med Urgent',
       'Space', 'Agonal', 'Neurologic', 'Panleuk', 'Parvo'], dtype=object)

### Cleaning the data - intake condition organization

In [141]:
merged_data['Intake Type'].unique()
#My primary question (which animals will get adopted) cannot be answered by "wildlife" or "euthanasia request" types as these animals are not eligible for adoption.

array(['Stray', 'Public Assist', 'Owner Surrender', 'Wildlife',
       'Abandoned', 'Euthanasia Request'], dtype=object)

In [169]:
merged_data.loc[(merged_data['Animal Type'] == 'Other')]['Breed'].unique()

array(['Rabbit Sh Mix', 'Guinea Pig', 'Cinnamon', 'Rabbit Sh',
       'Dutch/Angora-Satin', 'Guinea Pig Mix', 'Ferret', 'Himalayan',
       'Cold Water', 'Bat', 'Rat Mix', 'Jersey Wooly', 'Californian',
       'Bat Mix', 'Silver Mix', 'Rex', 'Snake/Python', 'Lop-Mini/Hotot',
       'Lizard', 'Rat', 'Snake', 'Rex Mix', 'Rabbit Sh/Dwarf Hotot',
       'Cold Water Mix', 'Sugar Glider', 'Hamster Mix', 'Raccoon Mix',
       'Ferret Mix', 'Opossum Mix', 'Hedgehog', 'Lionhead',
       'Rex-Mini Mix', 'Raccoon', 'Mouse', 'Hamster', 'Hotot',
       'American Mix', 'Turtle', 'Angora-French Mix', 'Skunk',
       'Californian Mix', 'Rabbit Lh', 'Lop-Amer Fuzzy', 'Coyote',
       'Havana Mix', 'Lop-Mini Mix', 'Tortoise', 'Netherlnd Dwarf Mix',
       'Squirrel', 'Dutch', 'Turtle Mix', 'Lop-English Mix', 'Lizard Mix',
       'Hotot Mix', 'New Zealand Wht Mix', 'Lionhead Mix', 'Lop-Holland',
       'Dutch Mix', 'Rabbit Lh Mix', 'Gerbil', 'Squirrel Mix',
       'Chinchilla Mix', 'English Spot', 'Havan

In [142]:
#dropping animals with Intake Types "wildlife" & "euthanasia request"
merged_data = merged_data[(merged_data['Intake Type'] != 'Wildlife') & 
                        (merged_data['Intake Type'] != 'Euthanasia Request')]
merged_data['Intake Type'].unique()

array(['Stray', 'Public Assist', 'Owner Surrender', 'Abandoned'],
      dtype=object)

In [621]:
outcome_data_after_drops.loc[outcome_data_after_drops['Outcome Subtype'] == 'Foster']

Unnamed: 0,Animal ID,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
6,A814515,05/06/2020 07:59:00 AM,03/01/2018,Adoption,Foster,Dog,Neutered Male,2 years,American Foxhound/Labrador Retriever,White/Brown
21,A822928,01/24/2022 11:49:00 AM,09/15/2010,Adoption,Foster,Cat,Spayed Female,11 years,Domestic Shorthair,Torbie
27,A765349,06/08/2018 01:04:00 PM,01/18/2009,Adoption,Foster,Dog,Neutered Male,9 years,Chihuahua Shorthair Mix,Tricolor
35,A812473,03/05/2020 04:15:00 PM,01/26/2010,Adoption,Foster,Dog,Neutered Male,10 years,Chihuahua Shorthair,Brown
38,A789298,04/23/2019 11:59:00 AM,02/15/2019,Adoption,Foster,Dog,Neutered Male,2 months,Labrador Retriever Mix,Brown/White
...,...,...,...,...,...,...,...,...,...,...
164076,A901535,06/01/2024 01:14:00 PM,04/03/2023,Adoption,Foster,Cat,Spayed Female,1 year,Bengal Mix,Brown Tabby
164077,A903054,06/01/2024 02:38:00 PM,06/01/2023,Adoption,Foster,Dog,Spayed Female,1 year,German Shepherd,Sable
164082,A761967,06/01/2024 03:12:00 PM,11/12/2013,Adoption,Foster,Dog,Neutered Male,10 years,Labrador Retriever Mix,Black/White
164095,A887625,06/02/2024 12:42:00 PM,08/24/2018,Adoption,Foster,Dog,Neutered Male,5 years,Pit Bull,Blue/White


In [53]:
outcome_data_after_drops.loc[(outcome_data_after_drops['Outcome Type'].isnull()) & (outcome_data_after_drops['Outcome Subtype'].isnull()==False)]

Unnamed: 0,Animal ID,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
38121,A828974,Feb 2021,01/30/2019,,Snr,Cat,Spayed Female,2 years,Domestic Shorthair,Black
156108,A890645,Dec 2023,09/11/2023,,Foster,Cat,Spayed Female,3 months,Siamese Mix,Lynx Point


In [463]:
lost_to_follow_up = outcome_data_after_drops.loc[outcome_data_after_drops['Outcome Type'].isnull()]

(0, 10)