# Čišćenje i priprema podataka

In [2]:
import pandas as pd
import numpy as np

In [4]:
train_data = pd.read_csv('data/train.csv')

In [3]:
train_data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


### Starost životinja

Ovaj stupac je tipa string i sadrži podatke poput '1 year', '3 weeks', '1 month', '3 days'.
Želimo podatke prikazati u danima.

In [5]:
def age_to_days(item):
    # convert item to list if it is one string
    if type(item) is str:
        item = [item]
    ages_in_days = []
    for i in range(len(item)):
        # check if item[i] is str
        if type(item[i]) is str:
            if 'day' in item[i]:
                ages_in_days.append(int(item[i].split(' ')[0]))
            if 'week' in item[i]:
                ages_in_days.append(int(item[i].split(' ')[0])*7)
            if 'month' in item[i]:
                ages_in_days.append(int(item[i].split(' ')[0])*30)
            if 'year' in item[i]:
                ages_in_days.append(int(item[i].split(' ')[0])*365)   
        else:
            # item[i] is not a string but a nan
            ages_in_days.append(0) 
    return ages_in_days

In [5]:
age_upon_outcome = train_data['AgeuponOutcome'].values
age_in_days = age_to_days(age_upon_outcome)
train_data['AgeuponOutcome'] = age_in_days

In [6]:
train_data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,365,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,365,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,730,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,730,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,30,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Tabby
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,21,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,150,American Pit Bull Terrier Mix,Red/White
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,365,Cairn Terrier,White


### Spol životinje

Stupac SexuponOutcome sadrži informacije o spolu životinje, ali i o tome je li životinja sterilizirana ili kastrirana. Smatramo da svaka od tih informacija bi bila važna za treniranje modela pa ih želimo razdvojiti u dva zasebna atributa.

In [6]:
def sex_and_intact(items):
    animal_sex = []
    intact_animal = []
    for i in range(len(items)):
        if (type(items[i]) is str) and items[i] != 'Unknown': 
            intact, sex = items[i].split(' ')
            if intact == 'Intact':
                intact_animal.append(1)
            else:
                intact_animal.append(0)
            animal_sex.append(sex)
        else:
            animal_sex.append('NaN')
            intact_animal.append('NaN')
            
    return animal_sex, intact_animal

In [7]:
sex_upon_outcome = train_data['SexuponOutcome'].values
animal_sex, intact_animal = sex_and_intact(sex_upon_outcome)
train_data['Sex'] = animal_sex
train_data['Intact'] = intact_animal

### Ime, vrsta i boja životinje

In [9]:
""" Set value of Name column to 1, if animal has name,
    to 0 if animal is nameless """
def hasName(names):
    hasName = []
    for name in names:
        if pd.isnull(name):
            hasName.append(0)
        else:
            hasName.append(1)
    return hasName

train_data['hasName'] = hasName(train_data['Name'])
train_data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Intact,hasName
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,Male,0,1
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,Female,0,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,Male,0,1
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,1,0
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,0,0
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,Female,1,1
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,Male,1,1
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,,,0
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,Female,0,1
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,Female,0,0


In [10]:
# Add variable called Mix to indicate those animals that are of a mixed breed
""" Set value of Mix to 1 if animal is of a mixed breed,
    to 0 if is not """
def isMixedBreed(breeds):
    mix = []
    for breed in breeds:
        if '/' not in breed:
            mix.append(1)
        else:
            mix.append(0)
    return mix;
train_data['Mix'] = isMixedBreed(train_data['Breed'])

train_data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Intact,hasName,Mix
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,Male,0,1,1
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,Female,0,1,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,Male,0,1,1
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,1,0,1
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,0,0,0
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,Female,1,1,0
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,Male,1,1,1
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,,,0,1
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,Female,0,1,1
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,Female,0,0,1


In [11]:
""" Set value of Color column to 1 if animal is one colored,
    else set it to 0 """
def isOneColored(colors):
    one_color = []
    for color in colors:
        if '/' not in color:
            one_color.append(1)
        else:
            one_color.append(0)
    return one_color

train_data['isOneColored'] = isOneColored(train_data['Color'])
train_data

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Intact,hasName,Mix,isOneColored
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,Male,0,1,1,0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,Female,0,1,1,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,Male,0,1,1,0
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,1,0,1,1
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,0,0,0,1
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,Female,1,1,0,0
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,Male,1,1,1,1
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,,,0,1,1
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,Female,0,1,1,0
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,Female,0,0,1,1


### Vremenske oznake 

Kako bismo mogli istražiti postoji li neki trend u udomljavanju životinja koji je povezan s datumima, godišnjim dobima i sl., razdvojili smo vremensku oznaku na dan, mjesec i godinu.

In [37]:
def date_spliter(row):
    row['Date'] = row['DateTime'].split(' ')[0].split('-')[2]
    row['Month'] = row['DateTime'].split(' ')[0].split('-')[1]
    row['Year'] = row['DateTime'].split(' ')[0].split('-')[0]
    return row

train_data['Date'] = train_data['DateTime']
train_data['Month'] = train_data['DateTime']
train_data['Year'] = train_data['DateTime']
train_data.update(train_data.apply(date_spliter, axis=1))

### Vrsta životinje 

Kako bismo mogli uključiti vrstu životinje u model, vektorom (1, 0) ili (0, 1) označit ćemo je li ona pas ili mačka.

In [35]:
def parse_animal_type(data_set):
    data_set['Dog'] = 0
    data_set['Cat'] = 0
    
    data_set.loc[data_set['AnimalType'] == 'Dog', 'Dog'] = 1
    data_set.loc[data_set['AnimalType'] == 'Cat', 'Cat'] = 1
    
parse_animal_type(train_data)

### Spol životinje

Svojstvo spola pripremamo slično kao i vrstu životinje. Postoje životinje kojima je spol nepoznat. To će biti označeno vektorom (0, 0).

In [38]:
def parse_animal_sex(data_set):
    data_set['Male'] = 0
    data_set['Female'] = 0
    
    data_set.loc[data_set['Sex'] == 'Male', 'Male'] = 1
    data_set.loc[data_set['Sex'] == 'Female', 'Female'] = 1
    
parse_animal_sex(train_data)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,...,hasName,Mix,isOneColored,Date,Month,Year,Dog,Cat,Male,Female
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,365,Shetland Sheepdog Mix,Brown/White,...,1,1,0,12,02,2014,1,0,1,0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,365,Domestic Shorthair Mix,Cream Tabby,...,1,1,1,13,10,2013,0,1,0,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,730,Pit Bull Mix,Blue/White,...,1,1,0,31,01,2015,1,0,1,0
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Cream,...,0,1,1,11,07,2014,0,1,1,0
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,730,Lhasa Apso/Miniature Poodle,Tan,...,0,0,1,15,11,2013,1,0,1,0
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,30,Cairn Terrier/Chihuahua Shorthair,Black/Tan,...,1,0,0,25,04,2014,1,0,0,1
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Tabby,...,1,1,1,28,03,2015,0,1,1,0
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,21,Domestic Shorthair Mix,Brown Tabby,...,0,1,1,30,04,2015,0,1,0,0
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,150,American Pit Bull Terrier Mix,Red/White,...,1,1,0,04,02,2014,1,0,0,1
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,365,Cairn Terrier,White,...,0,1,1,03,05,2014,1,0,0,1


In [39]:
#saving clean data 
train_data.to_csv('data/clean_train.csv', index=False)

In [29]:
def parse_outcome(outcome):
    if outcome == "Adoption":
        return 0
    if outcome == "Died":
        return 1
    if outcome == "Euthanasia":
        return 2
    if outcome == "Return_to_owner":
        return 3
    if outcome == "Transfer":
        return 4

    raise Exception("Unknown outcome type")
def parse_animal_type(animal):
    if animal == 'Cat':
        return 1
    if animal == 'Dog':
        return 0
    raise Exception("Unknown animal type")

def parse_sex_type(sex):
    if sex == 'Female':
        return 1 
    if sex == 'Male':
        return 0
    else: 
        return 2
    
X = pd.read_csv('data/clean_train.csv', encoding='ISO-8859-1')
X['OutcomeType'] = X['OutcomeType'].apply(lambda x:parse_outcome(x))
X['AnimalType'] = X['AnimalType'].apply(lambda x:parse_animal_type(x))
X['Sex'] = X['Sex'].apply(lambda x:parse_sex_type(x))
Y = X['OutcomeType'].values
X.drop(['AnimalID','OutcomeType','Name','DateTime','Breed','SexuponOutcome','OutcomeSubtype','Color'], axis=1, inplace=True)
X

Unnamed: 0,AnimalType,AgeuponOutcome,Sex,Intact,hasName,Mix,isOneColored
0,0,365,0,0.0,1,1,0
1,1,365,1,0.0,1,1,1
2,0,730,0,0.0,1,1,0
3,1,21,0,1.0,0,1,1
4,0,730,0,0.0,0,0,1
5,0,30,1,1.0,1,0,0
6,1,21,0,1.0,1,1,1
7,1,21,2,,0,1,1
8,0,150,1,0.0,1,1,0
9,0,365,1,0.0,0,1,1
