In [2]:
import pandas as pd
import numpy as np
import uuid

In [3]:
# Load the raw records from 'animal-data-1.csv' into a DataFrame and display it
# (pandas truncates the rendering of large frames).
data = pd.read_csv('animal-data-1.csv')
data

Unnamed: 0,id,intakedate,intakereason,istransfer,sheltercode,identichipnumber,animalname,breedname,basecolour,speciesname,...,movementdate,movementtype,istrial,returndate,returnedreason,deceaseddate,deceasedreason,diedoffshelter,puttosleep,isdoa
0,15801,2009-11-28 00:00:00,Moving,0,C09115463,0A115D7358,Jadzia,Domestic Short Hair,Tortie,Cat,...,2017-05-13 00:00:00,Adoption,0.0,,Stray,,Died in care,0,0,0
1,15932,2009-12-08 00:00:00,Moving,0,D09125594,0A11675477,Gonzo,German Shepherd Dog/Mix,Tan,Dog,...,2017-04-24 00:00:00,Adoption,0.0,,Stray,,Died in care,0,0,0
2,28859,2012-08-10 00:00:00,Abandoned,0,D12082309,0A13253C7B,Maggie,Shep Mix/Siberian Husky,Various,Dog,...,2017-04-15 00:00:00,Adoption,0.0,,Stray,,Died in care,0,0,0
3,30812,2013-01-11 00:00:00,Abandoned,0,C1301091,0A13403D4D,Pretty Girl,Domestic Short Hair,Dilute tortoiseshell,Cat,...,2017-04-18 00:00:00,Foster,0.0,2018-05-29 00:00:00,Stray,,Died in care,0,0,0
4,30812,2013-01-11 00:00:00,Abandoned,0,C1301091,0A13403D4D,Pretty Girl,Domestic Short Hair,Dilute tortoiseshell,Cat,...,2018-05-29 00:00:00,Adoption,0.0,,Stray,,Died in care,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10285,64584,2019-08-27 16:52:09,Litter relinquishment,0,C19081429,981020029330984,Max,Domestic Medium Hair,Orange and White,Cat,...,2019-08-29 00:00:00,Adoption,0.0,,Stray,,Court Order/ Legal,0,0,0
10286,64588,2019-08-27 18:14:11,Stray,0,C19081418,981020033133599,Punkin,Domestic Short Hair,Calico,Cat,...,2019-08-28 00:00:00,Foster,0.0,,Stray,,Court Order/ Legal,0,0,0
10287,64606,2019-08-29 12:10:21,Stray,0,D19081440,0A1243400C,Dozer,Coonhound,Red,Dog,...,2019-08-29 00:00:00,Reclaimed,0.0,,Stray,,Court Order/ Legal,0,0,0
10288,64608,2019-08-29 18:58:06,Stray,0,D19081421,982000363034690,Lucy Farmer,Beagle,Tricolour,Dog,...,2019-08-30 00:00:00,Reclaimed,0.0,,Stray,,Court Order/ Legal,0,0,0


In [4]:
# Inspect each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10290 entries, 0 to 10289
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10290 non-null  int64  
 1   intakedate        10290 non-null  object 
 2   intakereason      10288 non-null  object 
 3   istransfer        10290 non-null  int64  
 4   sheltercode       10290 non-null  object 
 5   identichipnumber  8324 non-null   object 
 6   animalname        10290 non-null  object 
 7   breedname         10245 non-null  object 
 8   basecolour        10290 non-null  object 
 9   speciesname       10290 non-null  object 
 10  animalage         10290 non-null  object 
 11  sexname           10290 non-null  object 
 12  location          10290 non-null  object 
 13  movementdate      10290 non-null  object 
 14  movementtype      10290 non-null  object 
 15  istrial           10289 non-null  float64
 16  returndate        3256 non-null   object

In [51]:
# Count the missing values in every column.
data.isna().sum()

id                     0
intakedate             0
intakereason           2
istransfer             0
sheltercode            0
identichipnumber    1966
animalname             0
breedname             45
basecolour             0
speciesname            0
animalage              0
sexname                0
location               0
movementdate           0
movementtype           0
istrial                1
returndate          7034
returnedreason         0
deceaseddate        9964
deceasedreason         0
diedoffshelter         0
puttosleep             0
isdoa                  0
dtype: int64

In [5]:
# Drop the two columns that are not needed and are mostly empty
# (returndate: 7034 missing, deceaseddate: 9964 missing out of 10290 rows).
# errors='ignore' keeps the cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['returndate', 'deceaseddate'], errors='ignore')

In [53]:
#Generated new unique identifiers for missing identichipnumber values with a 0A prefix followed by 8 random digits

import random
import string

def generate_identichipnumber(existing=None, rng=None):
    """Return a synthetic chip number: the prefix '0A' followed by 8 random digits.

    Parameters
    ----------
    existing : set of str, optional
        Chip numbers already in use. When given, candidates are redrawn until
        one is found that is not in the set, and the returned value is added to
        the set, so repeated calls with the same set never return a duplicate.
        When omitted, a single unchecked draw is returned.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator; pass ``random.Random(seed)`` for reproducible output.

    Returns
    -------
    str
        A 10-character string such as '0A12345678'.
    """
    source = random if rng is None else rng
    while True:
        candidate = '0A' + ''.join(source.choices(string.digits, k=8))
        if existing is None:
            return candidate
        if candidate not in existing:
            # Record the number so the next call cannot hand it out again.
            existing.add(candidate)
            return candidate

# Fill only the rows whose chip number is missing.
# One synthetic number is drawn per `id`, so an id that occurs on several rows
# (e.g. id 30812 appears twice in the raw data) receives the same number on
# every row, and each draw is checked against the numbers already in use so the
# generated values are unique.
# NOTE(review): assumes `id` identifies the animal — confirm against the data source.
random.seed(42)  # fixed seed so the generated numbers are reproducible across runs
missing_chip = data['identichipnumber'].isna()
used_chips = set(data.loc[~missing_chip, 'identichipnumber'])
chip_by_animal = {}
for animal_id in data.loc[missing_chip, 'id'].unique():
    chip = generate_identichipnumber()
    while chip in used_chips:  # redraw on the rare collision
        chip = generate_identichipnumber()
    used_chips.add(chip)
    chip_by_animal[animal_id] = chip
data.loc[missing_chip, 'identichipnumber'] = data.loc[missing_chip, 'id'].map(chip_by_animal)

In [54]:
# Impute the missing entries of intakereason, breedname and istrial with each
# column's most frequent value. Series.mode() returns every tied value; [0]
# takes the first one.
# NOTE(review): breedname is filled with the overall mode regardless of
# speciesname — consider a per-species mode if that matters downstream.

intakereason_mode, breedname_mode, istrial_mode = (
    data[name].mode()[0] for name in ('intakereason', 'breedname', 'istrial')
)

fill_values = {
    'intakereason': intakereason_mode,
    'breedname': breedname_mode,
    'istrial': istrial_mode,
}
for column, most_common in fill_values.items():
    data[column] = data[column].fillna(most_common)

In [55]:
# Re-count missing values per column to confirm that none remain after the
# drops and imputations above.
data.isna().sum()

id                  0
intakedate          0
intakereason        0
istransfer          0
sheltercode         0
identichipnumber    0
animalname          0
breedname           0
basecolour          0
speciesname         0
animalage           0
sexname             0
location            0
movementdate        0
movementtype        0
istrial             0
returnedreason      0
deceasedreason      0
diedoffshelter      0
puttosleep          0
isdoa               0
dtype: int64

In [56]:
# Parse intakedate and movementdate as datetimes and keep only the calendar
# date, discarding the time-of-day component.

for date_column in ('intakedate', 'movementdate'):
    parsed = pd.to_datetime(data[date_column])
    data[date_column] = parsed.dt.date

In [57]:
#Transformed animalage into a float representing years, with months expressed as decimals and rounded to one decimal place

import pandas as pd

def age_to_float(age_str):
    """Convert an age description such as '9 years 2 months.' to years as a float.

    The number directly before 'year(s)' is taken as whole years and the number
    directly before 'month(s)' as months; months are converted to a fraction of
    a year and the total is rounded to one decimal place. Any other unit is
    ignored, so a string that mentions neither years nor months yields 0.0.

    Parameters
    ----------
    age_str : str
        Age text. Text before the year count (e.g. 'Under 1 year') is
        tolerated. A value that is already numeric is returned rounded, which
        keeps the cell safe to re-run on an already converted column, and
        None becomes NaN.

    Returns
    -------
    float
        Age in years, rounded to one decimal place.
    """
    import re  # local import keeps this cell self-contained

    if age_str is None:
        return float('nan')
    if not isinstance(age_str, str):
        # Already converted (or otherwise numeric): pass through; NaN stays NaN.
        return round(float(age_str), 1)

    def _count(unit):
        # First integer that is immediately followed by the unit name.
        found = re.search(r'(\d+)\s*' + unit, age_str, flags=re.IGNORECASE)
        return int(found.group(1)) if found else 0

    return round(_count('year') + _count('month') / 12.0, 1)

# Replace every age description in the column with its value in years.
age_in_years = data['animalage'].map(age_to_float)
data['animalage'] = age_in_years

In [47]:
# Write the preprocessed data to 'animal-data-5.csv'; index=False leaves the
# DataFrame index out of the file.
data.to_csv('animal-data-5.csv', index=False)

In [6]:
#contributed by Gowthami Gokul