In [None]:
import pandas as pd
import numpy as np
# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
# import graphviz
# Scaling with Minmax-scaler
from sklearn.preprocessing import MinMaxScaler

# from ydata_profiling import ProfileReport

# import custom functions
from custom_functions import our_metrics

pd.set_option('display.max_rows', 10)

In [None]:
df_raw = pd.read_csv("../data/petfinder-adoption-prediction/train/train.csv")
df_raw.columns = df_raw.columns.str.lower() #make column names lowercase

In [None]:
df_raw.head().T

In [None]:
# #eda_report = ProfileReport(df_raw, title="Eda report")
# eda_report.to_file("eda_report.html")

# Creates initial basic EDA report. Basically an increased .describe() and .info()

In [None]:
df_dropped = df_raw.drop(["state","petid","rescuerid","name"],axis=1) # Drop some columns that are not needed (yet)


In [None]:
df_dropped.adoptionspeed.value_counts(normalize=True) #Distribution of our classes. Relatively balanced! 
# Class 0 (Adopted on day of listing) will later be merged into class 1 (Adopted within the first week), since there are only a few instances and the distinction is not so important for our stakeholder.
# Also, it is unlikely that pets with certain properties are predictably adopted on day 1 (instead of day 2-7). It's more likely that this is just by chance.

In [None]:
#pd.crosstab([df_dropped["color1"],df_dropped["color2"],df_dropped["type"]],df_dropped["adoptionspeed"],normalize="index")

# Delete this?

In [None]:
# Row-normalised crosstab of every column against the adoption speed, printed one after another.
adoption_target = df_dropped["adoptionspeed"]
for column_name, column_values in df_dropped.items():
    table = pd.crosstab(column_values, adoption_target, normalize="index")
    print(table)
    print("____________")

# All features two-way influence on the target. There are some patterns but there are no clear and easy univariate ways to determine the adoption speed.
# 
# A few questions and feature engineering ideas arose:
# Some patterns we find might be different for cats and dogs.
# Check if a pure-breed vs mixed breed feature can be created
# Shall we bin the age to get rid of the outliers?
# The quantity column is tricky. If there are more animals listed in one profile (at a certain point) the adoption speed will decrease because it takes time until they all are adopted. 
# However, dropping all rows with quantity > 1 would mean that we lose 23% of data and would alter the age distribution (many profiles of quantity > 1 animals are newborn litters with very low age).
# Speaking of age: There seem to be many interactive effects with age. For example, we thought that vaccinated and dewormed animals would have a higher adoption speed. That did not hold true, though, since young animals have a high adoption rate but are often not (yet) vaccinated.


In [None]:
pd.crosstab(df_dropped.quantity, df_dropped.adoptionspeed)#, normalize="index")

In [None]:
df_color_breed = df_dropped[["color1", "color2", "color3", "breed1", "breed2", "type"]]

In [None]:
sns.pairplot(df_color_breed, hue="type")
# If an animal has the color black (1), the colorcode (1) is always used in the color1 column. Or generally for a multi-color animal the lowest colorcode goes into column1. This explains the triangular shapes below. 

color feature is filled in in numerical order: color1 < color2 < color3 (except for 0), not sorted by primary color
dark: (color1,color2,color3)= (1,0,0), (2,0,0), (6,0,0), (1,2,0), (1,6,0), (1,2,6), (2,6,0)
mixed: else
light: (3,0,0), (4,0,0), (5,0,0), (7,0,0), (3,4,0), (3,5,0), (3,7,0), (3,4,5), (3,4,7), (3,5,7), (4,5,0), (4,5,7), (4,7,0), (5,7,0)

1,"Black"
2,"Brown"
3,"Golden"
4,"Yellow"
5,"Cream"
6,"Gray"
7,"White"

breed: mixed breed/no breed 2 dominant, supporting the idea of mixed vs. pure breed feature (stumpy tail breed not in data set)
cat mixed breed: domestic short hair, domestic medium hair, domestic long hair are considered mixed breeds, breed1 != breed2
dog mixed breed: category 307 or different entries in breed1 and breed2 column. Pure breed: entry != 307 in breed1 and breed2 = 0


# Idea to create color_pattern column encoding whether an animal is 'dark', "light" or "mixed" colored.

In [None]:
# Classify each animal's colour combination as dark (0), light (1) or mixed (2).
# The three colour codes are concatenated into one string key, e.g. (1, 2, 0) -> "120".
dark_codes = ["100", "200", "600", "120", "160", "126", "260"]
light_codes = ["300", "400", "500", "700", "340", "350", "370", "345", "347", "357", "450", "457", "470", "570"]

color_code = (
    df_dropped["color1"].astype(str)
    + df_dropped["color2"].astype(str)
    + df_dropped["color3"].astype(str)
)

# Vectorised lookup instead of a Python-level loop that appends one label per row.
df_dropped["color_pattern"] = np.select(
    [color_code.isin(dark_codes), color_code.isin(light_codes)],
    [0, 1],     # 0 = dark color pattern, 1 = light color pattern
    default=2,  # 2 = mixed color pattern (every other combination)
)

In [None]:
df_dropped[["color_pattern"]].value_counts()

In [None]:
pd.crosstab(df_dropped["color_pattern"],df_dropped["adoptionspeed"],normalize='index')
#pd.crosstab([df_dropped["color_pattern"],df_dropped["type"]],df_dropped["adoptionspeed"],normalize='index')
# mixed > light > black animals adoptionspeed. Only true for dogs, though. Presumably because black/dark dogs are intimidating

# Create pure-breed vs mixed breed feature

In [None]:
#df_dropped.breed2.value_counts()

In [None]:
# breed: mixed breed/no breed 2 dominant, supporting the idea of mixed vs. pure breed feature (stumpy tail breed not in data set)
# cat mixed breed: domestic short hair, domestic medium hair, domestic long hair are considered mixed breeds, breed1=!breed2
# dog mixed breed: category 307 or different entries in breed1 and breed2 column. Pure breed: entry =!307 in breed 1 and breed2=0

#264,2,"Domestic Long Hair"
#265,2,"Domestic Medium Hair"
#266,2,"Domestic Short Hair"

#pure_bred: breed2=0 & breed1 != 307, 264, 265, 266



In [None]:
df_dropped.query('breed1 == breed2 and breed1 != 307 and breed1 != 264 and breed1 != 265 and breed1 != 266').breed1.value_counts()

# There are some instances where pure bred animals have the same entry in breed1 and breed2 column. This needs to be taken into account.    

In [None]:
# breed_type: 0 = pure-bred, 1 = mixed-bred.
# Pure-bred means a single breed entry (breed2 is 0 or repeats breed1) whose breed1 is not one
# of the generic codes 307 (mixed breed) or 264/265/266 (domestic long/medium/short hair).
single_breed_entry = df_dropped["breed2"].eq(0) | df_dropped["breed1"].eq(df_dropped["breed2"])
generic_breed_code = df_dropped["breed1"].isin([307, 264, 265, 266])
df_dropped['breed_type'] = np.where(single_breed_entry & ~generic_breed_code, 0, 1)

# Complicated but it works :) 0 = pure-bred 1 = mixed-bred

In [None]:
df_dropped.breed_type.value_counts()

In [None]:
#we can use this graph in the presentation but separately for cats and dogs 

pd.crosstab(df_dropped["adoptionspeed"],df_dropped["breed_type"],normalize="columns").plot.bar()

# Pure-bred animals have a faster adoption speed.
# However, ...

In [None]:
pd.crosstab([df_dropped.type, df_dropped.breed_type], df_dropped.adoptionspeed,normalize='index')

# ... this is only true for dogs and quite the opposite for cats!

In [None]:
# Code 3 ("not known") is folded into code 2 ("no") for both columns: an animal with unknown
# status has to be vaccinated / dewormed anyway, so it is treated like an untreated one.
treatment_columns = ['vaccinated', 'dewormed']
df_dropped[treatment_columns] = df_dropped[treatment_columns].replace(3, 2)

In [None]:
# Merge vaccinated and dewormed into a single feature; the two are highly correlated, so it
# most likely makes sense to keep only the combined one.
# Categories: both vaccinated and dewormed (0), exactly one of the two (1), neither (2).

# named masks (1 = yes, 2 = no in both source columns)
is_vaccinated = df_dropped['vaccinated'].eq(1)
not_vaccinated = df_dropped['vaccinated'].eq(2)
is_dewormed = df_dropped['dewormed'].eq(1)
not_dewormed = df_dropped['dewormed'].eq(2)

# conditions and the value assigned for each of them, in matching order
conditions = [
    is_vaccinated & is_dewormed,
    is_vaccinated & not_dewormed,
    not_vaccinated & is_dewormed,
    not_vaccinated & not_dewormed,
]
values = [0, 1, 1, 2]

# np.select picks the value of the first matching condition per row.
# NOTE: rows matching no condition receive np.select's default of 0.
df_dropped['vaccinated_dewormed'] = np.select(conditions, values)

In [None]:
df_dropped['vaccinated_dewormed'].value_counts()

The new feature vaccinated_dewormed suggests that adoption speed is not drastically affected by it, but this needs to be investigated further. It is still an important feature for further analysis, because vaccinated and dewormed were highly correlated.

In [None]:
# Adoption speed by vaccinated_dewormed category, restricted to cats (type == 2) younger than 4 months.
young_cats = df_dropped.query('age < 4 & type == 2')
pd.crosstab(
    young_cats['adoptionspeed'],
    young_cats["vaccinated_dewormed"],
    normalize="columns",
).plot.bar()

# Age


In [None]:
# due to a very low number of instances in adoption speed 0, we will combine cat. 0 and 1
df_dropped.adoptionspeed = df_dropped.adoptionspeed.replace(0,1)

In [None]:
sns.catplot(data=df_dropped, x='adoptionspeed', y='age', hue='type', jitter=0.3)

hypothesis: cats will be adopted faster, regardless of their age, but old dogs will remain in the shelter longer. 
- younger animals are adopted faster
- there are more dogs than cats among the older animals in the shelter (adult or old dogs are in the shelter more frequently than adult or old cats)

In [None]:
# Plot distribution of features 
features = ['type', 'age', 'gender', 'maturitysize', 'furlength', 'sterilized', 'health', 'quantity', 'fee', 'videoamt', 'photoamt', 'color_pattern', 'breed_type', 'vaccinated_dewormed']

# One count plot per feature, split by adoption speed, filling a 7x2 grid row by row.
fig, ax = plt.subplots(7, 2, figsize=(34, 30))
for item, panel in zip(features, ax.ravel()):
    sns.countplot(df_dropped, x=item, ax=panel, hue='adoptionspeed').set(title=item, xlabel='')
fig.tight_layout(pad=3)

#fig.savefig('images/feature_histogram.jpg')

In [None]:
df_dropped.videoamt.value_counts()

In [None]:
#pd.crosstab(df_dropped["videoamt"], df_dropped["adoptionspeed"],normalize="index")

In [None]:
# drop video column: hardly any animals with videos, from above crosstab/count plot: for those with video all adoption speeds apparent
df_dropped.drop(["videoamt","color1", "color2", "color3","breed1","breed2", "vaccinated", "dewormed"], axis=1, inplace=True)

In [None]:
# bin fee: 0 = no fee, 1 = any non-zero fee
# hardly any animals carry a fee, so all paid listings are binned together
df_dropped["fee_bin"] = df_dropped["fee"].ne(0).astype(int)


In [None]:
df_dropped.fee_bin.value_counts()

In [None]:
# bin age feature (age in months):
# cats sterilized at around 4 months old, dogs >= 6 months
# newborn: 0-3 months, higher adoption speeds up to this age on average (0)
# puppy/kitten: 4-12 months (1)
# adult: 13-72 months (2)
# senior: >= 73 months (3)

age_months = df_dropped['age']

# one boolean mask per age group; between() is inclusive on both ends
conditions_age = [
    age_months.between(0, 3),
    age_months.between(4, 12),
    age_months.between(13, 72),
    age_months.ge(73),
]

# bin label assigned for each mask, in matching order
values_age = [0, 1, 2, 3]

# np.select assigns the label of the first matching mask to each row
df_dropped['age_bin'] = np.select(conditions_age, values_age)

In [None]:
df_dropped.age_bin.value_counts()

In [None]:
sns.countplot(df_dropped, x="age_bin", hue='adoptionspeed')


In [None]:
df_dropped.drop(["age","fee"], axis=1, inplace=True) #dropping age and fee column because new binned column was created

In [None]:
# Correlation heatmap over every column except the free-text description.
# Filtering (instead of list.remove) keeps the cell re-runnable once description is gone.
features = [column for column in df_dropped.columns if column != "description"]

corr_matrix = df_dropped[features].corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once.
# Only the shape of corr_matrix is needed here; correlating the correlation matrix
# a second time, as before, was unnecessary work.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# plotting a triangle correlation heatmap
plt.figure(figsize=(15,7))
sns.heatmap(corr_matrix, annot=True ,cmap="magma", mask=mask)
plt.show()

In [None]:
pd.crosstab(df_dropped["photoamt"], df_dropped["adoptionspeed"],normalize="index")

In [None]:
pd.crosstab(df_dropped["photoamt"], df_dropped["adoptionspeed"]#,normalize="index"
).plot.line()

In [None]:
# what about age bin >=1 (i.e., old enough to be sterilized) and sterilization?
sns.countplot(df_dropped.query("age_bin != 0"), x="sterilized", hue='adoptionspeed').set(title="Sterilization for age_bin >=1")
# it does not matter that much whether animals are sterilized or not

In [None]:
# Counterpart to the age_bin >= 1 plot: sterilization among age_bin == 0 (newborns, 0-3 months).
sns.countplot(df_dropped.query("age_bin == 0"), x="sterilized", hue='adoptionspeed').set(title="Sterilization for age_bin =0")
# reading of the plot: for this youngest group it does not matter whether animals are sterilized or not

In [None]:
df_dropped.query("photoamt >10").adoptionspeed.value_counts().sum()/df_dropped.shape[0]

In [None]:
# Cap the photo count to get rid of the long tail: counts below 11 are kept,
# everything else is set to 11.
photo_count = df_dropped["photoamt"]
df_dropped["photoamt_11"] = photo_count.where(photo_count < 11, 11)

In [None]:
df_dropped.photoamt_11.value_counts()

In [None]:
pd.crosstab(df_dropped["photoamt_11"], df_dropped["adoptionspeed"]#,normalize="index"
).plot.line()

## Recommendation to increase Adoption Speed

* Add pictures, ideally $\geq 4$ (starting from that amount, Adoption speed 4 is not the largest subset anymore)

# Feature Description Length
to utilize the description column, we will create a feature holding the length of the description string, the hypothesis being
that animals with a more detailed description will be adopted faster. 

In [None]:
#check where description col. contains missing values
(df_dropped[df_dropped['description'].isnull()])

In [None]:
# create feature of description length (character count)
df_dropped['description_char'] = df_dropped.description.str.len()

#check if this worked
df_dropped.description_char.value_counts()

In [None]:
# the new feature should have same amount of NaN

(df_dropped[df_dropped['description_char'].isnull()])

In [None]:
# fill missing char count with 0
df_dropped['description_char'] = df_dropped['description_char'].fillna(0)

In [None]:
# create feature of description length (word count)
# Words are counted as whitespace-separated tokens. Counting spaces + 1 over-counts on
# repeated spaces, ignores newlines/tabs, and reports 1 word for an empty description.
# Missing descriptions stay NaN here and are filled with 0 in the next cell.
df_dropped['description_words'] = df_dropped['description'].str.split().str.len()

#check if this worked
df_dropped.description_words.value_counts()

In [None]:
# fill missing word count with 0
df_dropped['description_words'] = df_dropped['description_words'].fillna(0)

In [None]:
sns.histplot(data=df_dropped, x='description_words')

In [None]:
#sns.histplot(data=df_dropped, x='description_len')

the distribution is extremely skewed and something must be done with it

In [None]:
#sns.catplot(data=df_dropped, x='adoptionspeed', y='description_len', jitter=0.2)

In [None]:
sns.catplot(data=df_dropped, x='adoptionspeed', y='description_words', jitter=0.2)

In [None]:
# drop description column
#df_dropped.drop(['description'], inplace=True, axis=1)

the description length does not seem to influence the adoption speed

In [None]:
# plot the heatmap with final features:

#features3 = df_dropped.columns.tolist()

#corr_matrix = df_dropped[features3].corr()

# creating mask
#mask = np.triu(np.ones_like(corr_matrix.corr()))
# plotting a triangle correlation heatmap

#plt.figure(figsize=(15,7))
#sns.heatmap(corr_matrix, annot=True ,cmap="magma", mask=mask)
#plt.show()

# How to handle multiple animal listings
Some observations do not refer to one single animal but to a group (assumption: most often one litter, which is supported by the finding that most of these listings have animals in age category 0 or 1, i.e. 0-3 months and 4-12 months). Possible strategies:
- drop the listings as faulty data (+ the faulty features of these observations would not influence the model; - we would lose mostly young animals which might introduce an age bias)
- treat these observations as one single animal, introduce another category for misleading features (like mix for colorpattern)
- split from the data and model separately in a later iteration

In [None]:
df_dropped.query("quantity>1").age_bin.value_counts()

In [None]:
pd.crosstab([df_dropped["quantity"],df_dropped["age_bin"]], df_dropped["adoptionspeed"],normalize="index")

In [None]:
# how much data of each age group would we lose by dropping the multiple listings?

# Share of multi-animal listings (quantity > 1) within each age bin, looked up by the
# age_bin label. value_counts() is ordered by frequency, so positional .iloc[i] does not
# reliably refer to age_bin i, and the subset and the full data can be ordered differently.
multi_listing_share = df_dropped["quantity"].gt(1).groupby(df_dropped["age_bin"]).mean()

age_bin_names = {0: "newborns", 1: "kittens/puppies", 2: "adults", 3: "seniors"}
for age_bin, group_name in age_bin_names.items():
    # an age bin that does not occur in the data yields NaN instead of raising
    share = multi_listing_share.get(age_bin, np.nan)
    # the share is a fraction; scale it so the printed number really is a percentage
    print(f'we would loose {share * 100:.1f} percent of {group_name}.')


# looking at the percentages, we would lose many more newborns and puppies/kittens if we dropped multiple listings
# but does their distribution actually differ from the entire data (meaning: would dropping them skew the distribution differently than it was before)?

In [None]:
# comparing the age distribution of the entire dataset with the multi-listings we would drop
# and the single-listings we would keep:

age_panels = [
    ('age distribution across all data', df_dropped),
    ('age distribution across multi-listings', df_dropped.query('quantity > 1')),
    ('age distribution across single-listings', df_dropped.query('quantity == 1')),
]

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
for panel, (panel_title, panel_data) in zip(ax, age_panels):
    sns.histplot(ax=panel, data=panel_data, x='age_bin', stat='percent', bins=4)
    panel.set_title(panel_title)
fig.tight_layout(pad=3)



In [None]:
# comparing the adoption speed distribution of the entire dataset with the multi-listings
# we would drop and the single-listings we would keep:

speed_panels = [
    ('adoptionspeed distribution across all data', df_dropped),
    ('adoptionspeed distribution across multi-listings', df_dropped.query('quantity > 1')),
    ('adoptionspeed distribution across single-listings', df_dropped.query('quantity == 1')),
]

fig, ax = plt.subplots(1, 3, figsize=(15, 5))
for panel, (panel_title, panel_data) in zip(ax, speed_panels):
    sns.histplot(ax=panel, data=panel_data, x='adoptionspeed', stat='percent', bins=4)
    panel.set_title(panel_title)
fig.tight_layout(pad=3)

In [None]:
pd.crosstab(df_dropped.quantity, df_dropped.adoptionspeed, normalize='index')

In [None]:
pd.crosstab([df_dropped.quantity, df_dropped.age_bin], df_dropped.adoptionspeed, normalize='index')

In [None]:
df_dropped.query('quantity > 1').adoptionspeed.value_counts()

## Baseline Model

We decided on a decision tree with a limited number of features as a baseline model. 

Features considered:

* age_bin
* photoamt_11
* breed_type
* type
* color_pattern

"young animals that are pure bred and of a light color pattern listed with photos are adopted fastest"

In [None]:
# Plot distribution of features (binned / engineered versions)
features2 = ['type', 'age_bin', 'gender', 'maturitysize', 'furlength', 'sterilized', 'health', 'quantity', 'fee_bin', 'photoamt_11', 'color_pattern', 'breed_type', 'vaccinated_dewormed']

# One count plot per feature, split by adoption speed, filling a 7x2 grid row by row.
fig, ax = plt.subplots(7, 2, figsize=(34, 30))
panels = ax.ravel()
for item, panel in zip(features2, panels):
    sns.countplot(df_dropped, x=item, ax=panel, hue='adoptionspeed').set(title=item, xlabel='')

# 13 features on 14 panels: hide whatever is left over instead of showing an empty frame.
for unused_panel in panels[len(features2):]:
    unused_panel.set_visible(False)

fig.tight_layout(pad=3)

#fig.savefig('../images/feature_histogram2.jpg')

# Baseline moved to the bottom after final cleaning and dummy creation.

# Dummies for different models

- type: change type to 0,1 (subtract 1)
- gender: change gender to 0,1,2 (subtract 1), get dummy according to your model
- maturity size: ordinal, subtract 1, get dummy according to your model
- fur length: ordinal, subtract 1, get dummy according to your model
- sterilization: replace unknown (3) with 2 (no), subtract 1
- health: ordinal, subtract 1, get dummy according to your model
- quantity: rescale
- description: drop for MVP
- photoamt: drop 
- colorpattern: get dummies 
- breed type: done
- vaccinated_dewormed: could be ordinal if you really want it to :)
- fee_bin: done
- age_bin: ordinal, rescale for distance-based models
- photoamt_11: ordinal, rescale
- description_len: drop
- description_char/description_words: ordinal, rescale if you want to include in model 


In [None]:
# change the type column to 0 and 1
df_dropped['type'] = df_dropped['type'] - 1

In [None]:
# change gender to 0, 1,2 from 1, 2, 3 
df_dropped['gender'] = df_dropped['gender'] - 1

In [None]:
# change the maturity size
df_dropped['maturitysize'] = df_dropped['maturitysize'] -1 

In [None]:
# convert the fur size
df_dropped['furlength'] = df_dropped['furlength'] -1 

In [None]:
# convert the health to 0,1,2
df_dropped['health'] = df_dropped['health'] -1 

In [None]:
# replace the 3 which is unknown with no
df_dropped['sterilized'] = df_dropped['sterilized'].replace(3,2)

In [None]:
# convert the sterilized to 0,1
df_dropped['sterilized'] = df_dropped['sterilized'] - 1

In [None]:
# drop columns that we donot need in the models
df_dropped = df_dropped.drop(['photoamt', 'description', 'description_words'], axis=1, errors='ignore')

# Create dummies and scale

In [None]:
# dummies for some columns: gender, maturity size, fur length, health, color_pattern (first column is not dropped from the dummies)

dummy_list = ['gender', 'maturitysize', 'furlength', 'health', 'color_pattern']

df_dummies = pd.get_dummies(df_dropped[dummy_list], columns=dummy_list, dtype=int)


In [None]:
# first dummy column dropped for the distance based algorithms
df_dummies_distance = pd.get_dummies(df_dropped[dummy_list], columns=dummy_list, dtype=int, drop_first=True)

In [None]:
# create the dataframe with the dropped first dummy column
df_processed_dropped_first =  pd.concat([df_dropped.drop(dummy_list,axis=1), df_dummies_distance],axis=1)

In [None]:
#processed dataframe with dummy columns where first column was not dropped
df_processed =  pd.concat([df_dropped.drop(dummy_list,axis=1), df_dummies],axis=1)

In [None]:
# define X with processed data that still has the all the dummy columns
X_processed = df_processed.drop('adoptionspeed', axis=1)


In [None]:
# define X with data that has the first dummy column dropped
X_dropped_first = df_processed_dropped_first.drop('adoptionspeed', axis=1)

In [None]:
#train test split with X_processed
y = df_processed["adoptionspeed"]
X_train_processed, X_test_processed, y_train, y_test = train_test_split(X_processed, y, random_state=42, stratify=y)

In [None]:
# train test split with the dataframe that has first dummy column removed
X_train_dropped_first, X_test_dropped_first, y_train, y_test = train_test_split(X_dropped_first, y, random_state=42, stratify=y)

In [None]:
minmax_scaler = MinMaxScaler()
# scaled the 4 features 
num_features = ['quantity', 'photoamt_11', 'age_bin', 'description_char']
#scaling with the processed data with all dummy columns
X_train_minmax_scaled_processed_1 = minmax_scaler.fit_transform(X_train_processed[num_features])
X_test_minmax_scaled_processed_1 = minmax_scaler.transform(X_test_processed[num_features])

X_train_minmax_scaled_processed = pd.concat([X_train_processed.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_train_minmax_scaled_processed_1,columns= num_features)],axis=1)
X_test_minmax_scaled_processed = pd.concat([X_test_processed.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_test_minmax_scaled_processed_1,columns= num_features)],axis=1)

In [None]:
# scaling the processed data with first dummy column removed
X_train_minmax_scaled_dropped_first_1 = minmax_scaler.fit_transform(X_train_dropped_first[num_features])
X_test_minmax_scaled_dropped_first_1 = minmax_scaler.transform(X_test_dropped_first[num_features])

X_train_minmax_scaled_dropped_first = pd.concat([X_train_dropped_first.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_train_minmax_scaled_dropped_first_1,columns= num_features)],axis=1)
X_test_minmax_scaled_dropped_first = pd.concat([X_test_dropped_first.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_test_minmax_scaled_dropped_first_1,columns= num_features)],axis=1)

# Export Processed Data 

In [None]:
# export processed data set to csv

# export features for tree-based models
pd.DataFrame(X_train_minmax_scaled_processed).to_csv('../data/petfinder-adoption-prediction/train/X_train_minmax_scaled_processed.csv', index=False)
pd.DataFrame(X_test_minmax_scaled_processed).to_csv('../data/petfinder-adoption-prediction/train/X_test_minmax_scaled_processed.csv', index=False)

# export features for distance-based models
pd.DataFrame(X_train_minmax_scaled_dropped_first).to_csv('../data/petfinder-adoption-prediction/train/X_train_minmax_scaled_dropped_first.csv', index=False)
pd.DataFrame(X_test_minmax_scaled_dropped_first).to_csv('../data/petfinder-adoption-prediction/train/X_test_minmax_scaled_dropped_first.csv', index=False)

# export target
pd.DataFrame(y_train).to_csv('../data/petfinder-adoption-prediction/train/y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('../data/petfinder-adoption-prediction/train/y_test.csv', index=False)


# Create separate cat / dog dataframes before train test split

In [None]:
df_dogs = df_dropped[df_dropped["type"]==0].reset_index(drop=True)
df_cats = df_dropped[df_dropped["type"]==1].reset_index(drop=True)

In [None]:
dummy_list = ['gender', 'maturitysize', 'furlength', 'health', 'color_pattern']


df_dog_dummies = pd.get_dummies(df_dogs[dummy_list], columns=dummy_list, dtype=int)
df_dogs =  pd.concat([df_dogs.drop(dummy_list,axis=1), df_dog_dummies],axis=1)

df_cat_dummies = pd.get_dummies(df_cats[dummy_list], columns=dummy_list, dtype=int)
df_cats =  pd.concat([df_cats.drop(dummy_list,axis=1), df_cat_dummies],axis=1)

In [None]:
X_dogs = df_dogs.drop('adoptionspeed', axis=1)
y_dogs = df_dogs["adoptionspeed"]
X_train_dogs, X_test_dogs, y_train_dogs, y_test_dogs = train_test_split(X_dogs, y_dogs, random_state=42, stratify=y_dogs)

In [None]:
X_cats = df_cats.drop('adoptionspeed', axis=1)
y_cats = df_cats["adoptionspeed"]
X_train_cats, X_test_cats, y_train_cats, y_test_cats = train_test_split(X_cats, y_cats, random_state=42, stratify=y_cats)

In [None]:
minmax_scaler = MinMaxScaler()
# scaled the 4 features 
num_features = ['quantity', 'photoamt_11', 'age_bin', 'description_char']
#scaling with the processed data with all dummy columns
X_train_dogs_sc = minmax_scaler.fit_transform(X_train_dogs[num_features])
X_test_dogs_sc = minmax_scaler.transform(X_test_dogs[num_features])

X_train_dogs = pd.concat([X_train_dogs.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_train_dogs_sc,columns= num_features)],axis=1)
X_test_dogs = pd.concat([X_test_dogs.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_test_dogs_sc,columns= num_features)],axis=1)

In [None]:
minmax_scaler = MinMaxScaler()
#scaling with the processed data with all dummy columns
X_train_cats_sc = minmax_scaler.fit_transform(X_train_cats[num_features])
X_test_cats_sc = minmax_scaler.transform(X_test_cats[num_features])

X_train_cats = pd.concat([X_train_cats.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_train_cats_sc,columns= num_features)],axis=1)
X_test_cats = pd.concat([X_test_cats.drop(num_features,axis=1).reset_index(drop=True), pd.DataFrame(X_test_cats_sc,columns= num_features)],axis=1)

In [None]:
# export the per-species data sets to csv
# (both species use the full dummy encoding with no first column dropped, plus min-max scaled numeric features)

# export dog features
pd.DataFrame(X_train_dogs).to_csv('../data/petfinder-adoption-prediction/train/X_train_dogs.csv', index=False)
pd.DataFrame(X_test_dogs).to_csv('../data/petfinder-adoption-prediction/train/X_test_dogs.csv', index=False)

# export cat features
pd.DataFrame(X_train_cats).to_csv('../data/petfinder-adoption-prediction/train/X_train_cats.csv', index=False)
pd.DataFrame(X_test_cats).to_csv('../data/petfinder-adoption-prediction/train/X_test_cats.csv', index=False)

# export targets (dogs, then cats)
pd.DataFrame(y_train_dogs).to_csv('../data/petfinder-adoption-prediction/train/y_train_dogs.csv', index=False)
pd.DataFrame(y_test_dogs).to_csv('../data/petfinder-adoption-prediction/train/y_test_dogs.csv', index=False)

pd.DataFrame(y_train_cats).to_csv('../data/petfinder-adoption-prediction/train/y_train_cats.csv', index=False)
pd.DataFrame(y_test_cats).to_csv('../data/petfinder-adoption-prediction/train/y_test_cats.csv', index=False)

# Baseline model:

In [None]:
# Baseline: a shallow decision tree on a handful of engineered features
# (color_pattern enters through its three dummy columns).
baseline_features = ["age_bin", "photoamt_11", "breed_type", "type", "color_pattern_0","color_pattern_1","color_pattern_2"]
X_baseline = df_processed[baseline_features]
y = df_processed["adoptionspeed"]
X_train_bl, X_test_bl, y_train_bl, y_test_bl = train_test_split(X_baseline, y, random_state=42, stratify=y)

# random_state pins the tree's internal feature shuffling, so ties between equally good
# splits are broken the same way on every run and the baseline metrics are reproducible.
baseline = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 15, random_state=42)
baseline.fit(X_train_bl, y_train_bl)

y_pred_bl = baseline.predict(X_test_bl)
our_metrics(y_test_bl, y_pred_bl, normalize=True)

# Graphviz commented out:

In [None]:
# fig = plt.figure(figsize=(35,20))
# baseline_plot = plot_tree(baseline, filled=True, fontsize=12, feature_names=["age_bin", "photoamt_11", "breed_type", "type", "color_pattern_0","color_pattern_1","color_pattern_2"], class_names=True)
# plt.savefig('../images/baseline_plot.jpg')

In [None]:
# dot_data = tree.export_graphviz(baseline, out_file=None, feature_names=["age_bin", "photoamt_11", "breed_type", "type", "color_pattern"], class_names=['1','2','3','4'], rounded=True, special_characters=True, filled=True) 
# baseline_plot_2 = graphviz.Source(dot_data)

# baseline_plot_2.render('../images/baseline_plot_2')
# baseline_plot_2