# Preprocessing & Training

### Goal:
<p>Create a cleaned development dataset you can use to complete the modeling step of your project.</p>

### Steps:
<ul><li>Create dummy or indicator features for categorical variables</li><li>Standardize the magnitude of numeric features using a scaler</li><li>Split into testing and training datasets</li></ul>
Review the following questions and apply them to your dataset:<ul><li>Does my data set have any categorical data, such as Gender or day of the week?</li><li>Do my features have data values that range from 0 - 100 or 0-1 or both and more?  </li></ul>

In [55]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve

from library.sb_utils import save_file

In [56]:
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [57]:
adopted = pd.read_csv('data/dogs_trimmed.csv')
adopted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6970 entries, 0 to 6969
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    6970 non-null   object 
 1   gender                 6970 non-null   object 
 2   size                   6970 non-null   object 
 3   coat                   6970 non-null   object 
 4   distance               6970 non-null   float64
 5   spayed_neutered        6970 non-null   bool   
 6   house_trained          6970 non-null   bool   
 7   special_needs          6970 non-null   bool   
 8   shots_current          6970 non-null   bool   
 9   breed_primary          6970 non-null   object 
 10  breed_secondary        6970 non-null   object 
 11  breed_mixed            6970 non-null   bool   
 12  color_primary          6970 non-null   object 
 13  color_secondary        6970 non-null   object 
 14  color_tertiary         6970 non-null   object 
 15  good

## Dummies!
### After converting bools to ints, of course

In [58]:
df = adopted[['gender', 'size', 'coat', 'duration_as_adoptable', 'hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current', 'goodwith_children', 'goodwith_dogs', 'goodwith_cats']]
df.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current']] = adopted.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current']].astype('int64')
df = pd.get_dummies(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [59]:
# drop one of each of the dummy category columns so those features don't double-weight anything
df.drop(['size_Extra Large', 'gender_Female', 'coat_Hairless', 'goodwith_children_False', 'goodwith_dogs_False', 'goodwith_cats_False'], axis=1, inplace=True)

## Scaling using StandardScaler()

In [60]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

In [61]:
scaled_df = pd.DataFrame(scaled, columns=df.columns)
scaled_df.describe()

Unnamed: 0,duration_as_adoptable,hasimage,hasvideo,spayed_neutered,house_trained,special_needs,shots_current,gender_Male,size_Large,size_Medium,size_Small,coat_Curly,coat_Long,coat_Medium,coat_Short,coat_Wire,coat_unknown,goodwith_children_True,goodwith_children_unknown,goodwith_dogs_True,goodwith_dogs_unknown,goodwith_cats_True,goodwith_cats_unknown
count,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0,6970.0
mean,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
std,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007
min,-0.84461,-8.12522,-0.20113,-1.75115,-0.82999,-0.16876,-3.45418,-1.05366,-0.59293,-0.72238,-0.78656,-0.12663,-0.2561,-0.54629,-1.05366,-0.14107,-0.41428,-0.85714,-0.99114,-1.261,-0.74459,-0.62814,-1.24324
25%,-0.65483,0.12307,-0.20113,0.57105,-0.82999,-0.16876,0.2895,-1.05366,-0.59293,-0.72238,-0.78656,-0.12663,-0.2561,-0.54629,-1.05366,-0.14107,-0.41428,-0.85714,-0.99114,-1.261,-0.74459,-0.62814,-1.24324
50%,-0.37422,0.12307,-0.20113,0.57105,-0.82999,-0.16876,0.2895,0.94907,-0.59293,-0.72238,-0.78656,-0.12663,-0.2561,-0.54629,0.94907,-0.14107,-0.41428,-0.85714,-0.99114,0.79302,-0.74459,-0.62814,0.80435
75%,0.2552,0.12307,-0.20113,0.57105,1.20484,-0.16876,0.2895,0.94907,1.68655,1.38431,1.27136,-0.12663,-0.2561,-0.54629,0.94907,-0.14107,-0.41428,1.16667,1.00894,0.79302,1.34302,1.59201,0.80435
max,4.37501,0.12307,4.97188,0.57105,1.20484,5.92571,0.2895,0.94907,1.68655,1.38431,1.27136,7.89707,3.90475,1.83052,0.94907,7.08872,2.41384,1.16667,1.00894,0.79302,1.34302,1.59201,0.80435


## Split into training and test sets

In [62]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df.drop(columns='duration_as_adoptable'), 
                                                    scaled_df.duration_as_adoptable, test_size=0.3, 
                                                    random_state=192)

In [63]:
X_train.shape, X_test.shape

((4879, 22), (2091, 22))

In [64]:
y_train.shape, y_test.shape

((4879,), (2091,))

In [65]:
# save training and test sets
datapath = 'data/tt_sets'
save_file(X_train, 'dogs_X_train.csv', datapath)
save_file(X_test, 'dogs_X_test.csv', datapath)
save_file(y_train, 'dogs_y_train.csv', datapath)
save_file(y_test, 'dogs_y_test.csv', datapath)

Directory data/tt_sets was created.
Writing file.  "data/tt_sets/dogs_X_train.csv"
Writing file.  "data/tt_sets/dogs_X_test.csv"
Writing file.  "data/tt_sets/dogs_y_train.csv"
Writing file.  "data/tt_sets/dogs_y_test.csv"
