In [1]:
## Data Cleaning 

In [3]:
import numpy as np
import pandas as pd
import re

In [17]:
def drop_null_data(df):
    #identifies null data in provided data frame, drops it, and reports columns in which the data were null

    #generates a Series with index = df's columns, values = number of nulls per column
    drops = df.isnull().sum()

    #if there are no nulls in the df, print & return original df
    if drops.sum() == 0:
        print("No null values identified")
            
    #if there are nulls in the df, drop them all and print which columns contained nulls and how many were dropped; returns df without nulls    
    else:
        df = df.dropna(axis = 0)
        for index, value in drops.items():
            if value != 0:
                print('"{}" contained {} null value(s)'.format(index, value))
    return df

#Regex for age parsing
year_pattern = re.compile(r"(-?\d+)\s*(?:year|years)")
month_pattern = re.compile(r"(-?\d+)\s*(?:month|months)")
week_pattern = re.compile(r"(-?\d+)\s*(?:week|weeks)")
day_pattern = re.compile(r"(-?\d+)\s*(?:day|days)")

# Function to convert age-strings to days (int)
def convert_to_days(age_str):
    if "year" in age_str:
        value = re.search(year_pattern, age_str).group(1)
        return int(value) * 365
    elif "month" in age_str:
        value = re.search(month_pattern, age_str).group(1)
        return int(value) * 30 
    elif "week" in age_str:
        value = re.search(week_pattern, age_str).group(1)
        return int(value) * 7
    elif "day" in age_str:
        value = re.search(day_pattern, age_str).group(1)
        return int(value)
    else:
        print(f"Unknown pattern: {age_str}; -1 days reported")
        return int(-1)

#Intake
intake_data = pd.read_csv('Austin_Animal_Center_Intakes_20240701.csv')

# Dropping columns of variables not likely to be useful
intake_data = intake_data.drop(['Name', 'DateTime', 'Found Location'], axis = 1)  

# Drop null values & report from whence they came
intake_data = drop_null_data(intake_data)

# Convert age strings to days (int)
intake_data['Age upon Intake'] = intake_data['Age upon Intake'].apply(convert_to_days)

# Dropping rows with negative ages (which also includes any age strings that were not successfully parsed)
pre = len(intake_data)
intake_data = intake_data.loc[intake_data['Age upon Intake'] >= 0]
post= len(intake_data)
print(f"Dropped rows due to negative ages: {pre - len(intake_data)}")

#Sanity check - dropping pets with "Aged" Intake Condition who are < 1yr old
intake_data = intake_data[(intake_data['Intake Condition'] != 'Aged') |
                          (intake_data['Age upon Intake'] > 365)]

#Outcome Data
outcome_data = pd.read_csv('Austin_Animal_Center_Outcomes_20240701.csv')
#dropping columns of variables not likely to be useful
outcome_data = outcome_data.drop(['Name', 'DateTime'], axis = 1)

#Flag animals w/o recorded outcomes
lost_to_follow_up = outcome_data.loc[outcome_data['Outcome Type'].isnull()]
print("{} animals have no outcome recorded".format(len(lost_to_follow_up)))

#Fill null subtypes with "None"
outcome_data = outcome_data['Outcome Subtype'].fillna('None')

#drop null values & report from whence they came
outcome_data = drop_null_data(outcome_data)



"Sex upon Intake" contained 2 null value(s)
"Age upon Intake" contained 1 null value(s)
Dropped rows due to negative ages: 13
39 animals have no outcome recorded
No null values identified
