In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import random
import math

# Load the full training set from train.csv.
# NOTE: the random 40% subsample below is currently disabled (commented out),
# so every row of the file is used. Re-enable the four lines to train on a
# 40% sample instead.
df = pd.read_csv("train.csv")
# n = int(max(df.count()))
# s = int(n * .4)
# skip = sorted(random.sample(range(1, n+1), n-s))
# df = pd.read_csv("train.csv", skiprows=skip)

# Dog group keyword lists.
# Maps a group name to lowercase keywords; the isTerrier/isToy/... feature
# functions flag a breed when any keyword occurs in the lowercased breed
# string, so every keyword here must be lowercase.
# NOTE: some keywords appear in more than one group ('spaniel', 'shepherd',
# 'water', 'terrier'), so a single breed can set several group flags.
dogGroups = {
    'terrier': ['terrier'],
    # Fixed: a missing comma used to fuse 'pug' and 'poodle' into the single
    # keyword 'pugpoodle', so neither breed ever matched.
    'toy': ['shih', 'tzu', 'toy', 'pug', 'poodle', 'pomeranian', 'papillon', 'pekingese',
            'maltese', 'spaniel', 'chihuahua', 'havanese'],
    'working': ['akita', 'malamute', 'shepherd', 'bernese', 'boerboel', 'boxer', 'mastiff',
                'chinook', 'cane corso', 'pinscher', 'schnauzer', 'dane', 'pyrenees', 'swiss mountain',
                'komondor', 'kuvasz', 'leonberger', 'water', 'rottweiler', 'samoyed', 'husky', 'bernard'],
    'sporting': ['spaniel', 'brittany', 'retriever', 'setter', 'pointer', 'lagotto', 'spinone', 'vizsla',
                 'weimaraner', 'wirehaired'],
    'hound': ['hound', 'basenji', 'beagle', 'borzoi', 'cirneco', 'dachshund', 'harrier', 'plott',
              'podengo', 'ridgeback', 'saluki', 'sloughi', 'whippet'],
    # Fixed: 'schiperke' was misspelled and could never match "Schipperke".
    'nonSporting': ['eskimo', 'bichon', 'shar-pei', 'chow', 'tulear', 'dalmatian', 'spitz', 'keeshond',
                    'apso', 'lowchen', 'lundehund', 'shiba inu', 'schipperke', 'xoloitzcuintli', 'terrier'],
    'herding': ['cattle', 'shepherd', 'collie', 'malinois', 'beauceron', 'sheepdog', 'tervuren', 'bergamasco',
                'picard', 'flandres', 'briard', 'canaan', 'corgi', 'entlebucher', 'lapphund', 'buhund', 'puli',
                'water', 'vallhund'],
}


#Features
#--------

def hasName(x):
    """Return 1 when x is a string (a name is present), otherwise 0.

    Any non-string value (e.g. a NaN float or None) counts as "no name".
    """
    return int(isinstance(x, str))
    
def intact(x):
    """Encode the sterilisation status found in a sex-upon-outcome string.

    Matching is case-insensitive. Returns 0 when the whole string is
    "unknown", -1 when it mentions "spayed" or "neutered", and 1 for
    anything else.
    """
    status = x.lower()
    if status == "unknown":
        return 0
    altered = any(word in status for word in ("spayed", "neutered"))
    return -1 if altered else 1

def gender(x):
    """Encode the sex found in a sex-upon-outcome string.

    The match is case-sensitive: returns 1 if x contains "Male", -1 if it
    contains "Female", and 0 when neither label is present.
    """
    for label, code in (("Male", 1), ("Female", -1)):
        if label in x:
            return code
    return 0

def convertAgeToMonths(x):
    """Convert an age string such as "2 years" or "3 weeks" to months.

    Parameters:
        x: string of the form "<integer> <unit>", where the unit is
           year(s), month(s), week(s) or day(s) (case-insensitive).

    Returns:
        The age in months: years * 12, weeks / 4, days / 30, months as is.
        Returns -1 when the string has no unit (fewer than two tokens).
        The "-1 months" placeholder used for missing ages also yields -1.
        Any other unit is treated as months.

    Raises:
        ValueError: if the first token is not an integer.
    """
    parts = x.split()
    # Check the shape before parsing so an empty string yields the -1
    # sentinel instead of an IndexError.
    if len(parts) < 2:
        return -1

    count = int(parts[0])
    # Normalise "year"/"years" etc. so singular units are not mistaken for
    # months.
    unit = parts[1].lower().rstrip("s")

    if unit == "year":
        # A year is twelve months (the previous code divided by 12).
        return count * 12
    if unit == "week":
        return count / 4
    if unit == "day":
        return count / 30
    return count

def isShihTzu(x):
    """Return 1 if the breed string x contains "Shih Tzu" (case-sensitive), else 0."""
    return int("Shih Tzu" in x)

def isAggressive(x):
    """Return 1 if the breed string x contains an "aggressive" keyword, else 0.

    The check is a case-insensitive substring match, so a keyword also
    matches inside a longer word.
    """
    breed = x.lower()
    for keyword in ("rottweiler", "pit", "bull", "siberian", "husky"):
        if keyword in breed:
            return 1
    return 0

def isMix(x):
    """Return 1 if the breed string x contains "mix" (case-insensitive), else 0."""
    return int("mix" in x.lower())

#Dog groups. One 0/1 column per group; 0 means no keyword of that group
#matched the breed (e.g. unknown breed or a cat).
def _inDogGroup(breed, group):
    """Return 1 if any keyword of dogGroups[group] occurs in breed, else 0.

    The breed is lowercased once and compared by substring against the
    group's keywords. Raises KeyError if group is not a key of dogGroups.
    """
    breed = breed.lower()
    return 1 if any(keyword in breed for keyword in dogGroups[group]) else 0


def isTerrier(x):
    """Return 1 if breed string x matches a 'terrier' group keyword, else 0."""
    return _inDogGroup(x, 'terrier')


def isToy(x):
    """Return 1 if breed string x matches a 'toy' group keyword, else 0."""
    return _inDogGroup(x, 'toy')


def isWorking(x):
    """Return 1 if breed string x matches a 'working' group keyword, else 0."""
    return _inDogGroup(x, 'working')


def isSporting(x):
    """Return 1 if breed string x matches a 'sporting' group keyword, else 0."""
    return _inDogGroup(x, 'sporting')


def isHound(x):
    """Return 1 if breed string x matches a 'hound' group keyword, else 0."""
    return _inDogGroup(x, 'hound')


def isNonSporting(x):
    """Return 1 if breed string x matches a 'nonSporting' group keyword, else 0."""
    return _inDogGroup(x, 'nonSporting')


def isHerding(x):
    """Return 1 if breed string x matches a 'herding' group keyword, else 0."""
    return _inDogGroup(x, 'herding')

def isShortHair(x):
    """Return 1 if the breed string x contains "short" (case-insensitive), else 0."""
    return int("short" in x.lower())
def isMediumHair(x):
    """Return 1 if the breed string x contains "medium" (case-insensitive), else 0."""
    return int("medium" in x.lower())
def isLongHair(x):
    """Return 1 if the breed string x contains "long" (case-insensitive), else 0."""
    return int("long" in x.lower())

def isMultiColour(x):
    """Placeholder for a multi-colour coat feature; not implemented yet.

    Ignores x and returns None.
    """
    # TODO: implement the colour check.
    return None

def isTabby(x):
    """Placeholder for a tabby coat feature; not implemented yet.

    Ignores x and returns None.
    """
    # TODO: implement the tabby check.
    return None

def prepare_features(df):
    """Add the engineered feature columns to df and drop raw columns, in place.

    Missing 'SexuponOutcome' values become 'Unknown' and missing
    'AgeuponOutcome' values become "-1 months" before the features are
    derived. New columns are computed from 'Name', 'SexuponOutcome',
    'AgeuponOutcome' and 'Breed'. Finally 'AnimalID', 'DateTime',
    'OutcomeType', 'SexuponOutcome' and 'Name' are dropped.

    Parameters:
        df: pandas DataFrame holding the columns named above; it is
            modified in place.

    Returns:
        None.

    Raises:
        KeyError: if a required or dropped column is missing from df.
    """
    # Assign the filled Series back to the frame. Calling
    # fillna(inplace=True) on the df[...] selection is chained assignment:
    # it warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
    df['SexuponOutcome'] = df['SexuponOutcome'].fillna('Unknown')
    df['AgeuponOutcome'] = df['AgeuponOutcome'].fillna("-1 months")

    df['hasName'] = df['Name'].apply(hasName)

    df['intact'] = df['SexuponOutcome'].apply(intact)
    df['gender'] = df['SexuponOutcome'].apply(gender)
    df['age'] = df['AgeuponOutcome'].apply(convertAgeToMonths)

    # Every remaining feature is a 0/1 flag derived from the 'Breed' string;
    # the order here fixes the order of the new columns.
    breedFeatures = (
        ('isShihTzu', isShihTzu),
        ('isAggressive', isAggressive),
        ('isMix', isMix),
        ('isTerrier', isTerrier),
        ('isToy', isToy),
        ('isWorking', isWorking),
        ('isSporting', isSporting),
        ('isHound', isHound),
        ('isNonSporting', isNonSporting),
        ('isHerding', isHerding),
        ('isShortHair', isShortHair),
        ('isMediumHair', isMediumHair),
        ('isLongHair', isLongHair),
    )
    for column, feature in breedFeatures:
        df[column] = df['Breed'].apply(feature)

    df.drop(['AnimalID',
             'DateTime',
             'OutcomeType',
             'SexuponOutcome',
             'Name'], axis=1, inplace=True)

# df['isMultiColour'] = df['Color'].apply(lambda x: isMultiColour(x))
# df['isTabby'] = df['Color'].apply(lambda x: isTabby(x))



#prepare_features(df)
#print(df)
# Exploration: print how many rows carry each distinct value of the raw
# 'Color' column (feature preparation above is currently disabled).
print(df['Color'].value_counts())


Black/White                2824
Black                      2292
Brown Tabby                1635
Brown Tabby/White           940
White                       931
Brown/White                 884
Orange Tabby                841
Tan/White                   773
Tricolor                    752
Blue/White                  702
Black/Tan                   672
White/Black                 643
Brown                       639
Tan                         628
White/Brown                 569
Tortie                      530
Calico                      517
Orange Tabby/White          455
Brown Brindle/White         450
Blue                        450
Black/Brown                 436
Blue Tabby                  433
White/Tan                   389
Red                         337
Torbie                      335
Brown/Black                 333
Red/White                   331
Blue Tabby/White            241
Brown Brindle               232
Chocolate/White             224
                           ... 
Black/Bl