In [141]:
import pandas as pd
from datetime import date

In [51]:
import matplotlib.pyplot as plt
import seaborn as sns

In [144]:
# Load the dataset from project1.csv (path is relative to the notebook's
# working directory) and preview the first rows.
animals = pd.read_csv("project1.csv")
animals.head()

Unnamed: 0,Animal ID,Date of Birth,Name,DateTime,MonthYear,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A680855,5/25/14,,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
1,A680857,5/25/14,,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
2,A680858,5/25/14,,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
3,A680859,5/25/14,,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black
4,A680860,5/25/14,,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black


In [145]:
# (rows, columns) of the raw frame.
animals.shape

(131165, 12)

In [146]:
# Total number of cells, i.e. rows x columns.
animals.size

1573980

In [147]:
# info() prints the dtype and non-null count of every column; the last
# expression is what the cell displays: the number of rows that are exact
# duplicates of an earlier row.
animals.info()
animals.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131165 entries, 0 to 131164
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         131165 non-null  object
 1   Date of Birth     131165 non-null  object
 2   Name              93658 non-null   object
 3   DateTime          131165 non-null  object
 4   MonthYear         131165 non-null  object
 5   Outcome Type      131125 non-null  object
 6   Outcome Subtype   65810 non-null   object
 7   Animal Type       131165 non-null  object
 8   Sex upon Outcome  131165 non-null  object
 9   Age upon Outcome  131165 non-null  object
 10  Breed             131165 non-null  object
 11  Color             131165 non-null  object
dtypes: object(12)
memory usage: 12.0+ MB


17

In [148]:
# Missing values per column.
animals.isnull().sum()

Animal ID               0
Date of Birth           0
Name                37507
DateTime                0
MonthYear               0
Outcome Type           40
Outcome Subtype     65355
Animal Type             0
Sex upon Outcome        0
Age upon Outcome        0
Breed                   0
Color                   0
dtype: int64

In [149]:
# Findings from the inspection above, and the cleaning plan:
# - Every column has dtype object, so most need converting to numeric or categorical values before any ML.
# - 17 rows are exact duplicates of another row and are removed here.
# - Date of Birth, DateTime and MonthYear should be parsed as datetimes; Date of Birth can then be turned into an age.
# - Outcome Type, Outcome Subtype, Animal Type, Sex upon Outcome, Breed and Color are categorical and can be one-hot encoded.
# - Age upon Outcome can be reduced to a single number of days so every row uses the same unit.
# - Animal ID and Name are identifiers rather than features, so both columns are dropped.
#
# The guard keeps this cell safe to re-run. Once the identifier columns are
# gone, rows that differed only by Animal ID look identical, so running
# drop_duplicates a second time would silently delete real records (and the
# column drop would raise KeyError).
if "Animal ID" in animals.columns:
    animals = animals.drop_duplicates(ignore_index=True)
    # Verify the de-duplication before the identifier columns are removed;
    # afterwards duplicated() would also flag rows that only differed by ID.
    assert not animals.duplicated().any(), "exact duplicate rows remain after drop_duplicates"
    animals = animals.drop(columns=["Animal ID", "Name"])

In [150]:
# Re-check row count and columns after the cleaning step.
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131148 entries, 0 to 131147
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Date of Birth     131148 non-null  object
 1   DateTime          131148 non-null  object
 2   MonthYear         131148 non-null  object
 3   Outcome Type      131108 non-null  object
 4   Outcome Subtype   65802 non-null   object
 5   Animal Type       131148 non-null  object
 6   Sex upon Outcome  131148 non-null  object
 7   Age upon Outcome  131148 non-null  object
 8   Breed             131148 non-null  object
 9   Color             131148 non-null  object
dtypes: object(10)
memory usage: 10.0+ MB


In [153]:
# With duplicated rows and the identifier columns removed, start converting
# the remaining columns to useful dtypes. Date of Birth is stored as
# month/day/two-digit-year text (e.g. 5/25/14) and is parsed to datetime64 here.
# NOTE(review): two-digit years are ambiguous; confirm that births before 2000
# are mapped to the intended century.
animals["Date of Birth"] = pd.to_datetime(animals["Date of Birth"], format = '%m/%d/%y')
def calc_age(DOB, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    DOB : date-like
        Any object exposing ``year``, ``month`` and ``day`` attributes, such
        as ``datetime.date`` or ``pandas.Timestamp``.
    today : date-like, optional
        Reference date the age is measured against. Defaults to
        ``date.today()``; pass a fixed date to get results that do not
        change from one run of the notebook to the next.

    Returns
    -------
    int
        Number of birthdays that have occurred on or before ``today``.
    """
    if today is None:
        today = date.today()
    # Take the years, then remove one if this year's birthday is still ahead;
    # a plain year difference over-counts for the part of the year before it.
    had_birthday = (today.month, today.day) >= (DOB.month, DOB.day)
    return today.year - DOB.year - (0 if had_birthday else 1)
# Apply calc_age to every parsed birth date and store the result as a new column.
# NOTE(review): this is the age relative to the day the notebook is run, not the
# age at the outcome event (the preview shows "2 weeks" in Age upon Outcome next
# to a much larger Age). Confirm which of the two the analysis needs.
animals["Age"] = animals["Date of Birth"].apply(calc_age)
    


In [154]:
# Confirm that Date of Birth is now datetime64 and that the Age column was
# added, then preview the first rows.
animals.info()
animals.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131148 entries, 0 to 131147
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Date of Birth     131148 non-null  datetime64[ns]
 1   DateTime          131148 non-null  object        
 2   MonthYear         131148 non-null  object        
 3   Outcome Type      131108 non-null  object        
 4   Outcome Subtype   65802 non-null   object        
 5   Animal Type       131148 non-null  object        
 6   Sex upon Outcome  131148 non-null  object        
 7   Age upon Outcome  131148 non-null  object        
 8   Breed             131148 non-null  object        
 9   Color             131148 non-null  object        
 10  Age               131148 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 11.0+ MB


Unnamed: 0,Date of Birth,DateTime,MonthYear,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Age
0,2014-05-25,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black,11
1,2014-05-25,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black,11
2,2014-05-25,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black,11
3,2014-05-25,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black,11
4,2014-05-25,2014-06-10T00:00:00-05:00,Jun-14,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black,11
