# Preprocessing & Training

### Goal:
<p>Create a cleaned development dataset you can use to complete the modeling step of your project.</p>

### Steps:
<ul><li>Create dummy or indicator features for categorical variables</li><li>Standardize the magnitude of numeric features using a scaler</li><li>Split into testing and training datasets</li></ul>
Review the following questions and apply them to your dataset:<ul><li>Does my data set have any categorical data, such as Gender or day of the week?</li><li>Do my features have data values that range from 0 - 100 or 0-1 or both and more?  </li></ul>

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve

from library.sb_utils import save_file

In [2]:
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
adopted = pd.read_csv('data/cats_trimmed.csv')
adopted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6489 entries, 0 to 6488
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    6489 non-null   object 
 1   gender                 6489 non-null   object 
 2   size                   6489 non-null   object 
 3   coat                   6489 non-null   object 
 4   distance               6489 non-null   float64
 5   spayed_neutered        6489 non-null   bool   
 6   house_trained          6489 non-null   bool   
 7   declawed               6489 non-null   bool   
 8   special_needs          6489 non-null   bool   
 9   shots_current          6489 non-null   bool   
 10  breed_primary          6489 non-null   object 
 11  breed_secondary        6489 non-null   object 
 12  breed_mixed            6489 non-null   bool   
 13  breed_unknown          6489 non-null   bool   
 14  color_primary          6489 non-null   object 
 15  colo

## Dummies!
### After converting bools to ints, of course

In [4]:
df = adopted[['gender', 'size', 'coat', 'duration_as_adoptable', 'hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'declawed', 'special_needs', 'shots_current', 'goodwith_children', 'goodwith_dogs', 'goodwith_cats']]
df.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'declawed', 'special_needs', 'shots_current']] = adopted.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'declawed', 'special_needs', 'shots_current']].astype('int64')
df = pd.get_dummies(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [5]:
# drop one of each of the dummy category columns so those features don't double-weight anything
df.drop(['size_Extra Large', 'gender_Female', 'coat_Hairless', 'goodwith_children_False', 'goodwith_dogs_False', 'goodwith_cats_False'], axis=1, inplace=True)

## Scaling using StandardScaler()

In [6]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

In [8]:
scaled_df = pd.DataFrame(scaled, columns=df.columns)
scaled_df.describe()

Unnamed: 0,duration_as_adoptable,hasimage,hasvideo,spayed_neutered,house_trained,declawed,special_needs,shots_current,gender_Male,size_Large,size_Medium,size_Small,coat_Long,coat_Medium,coat_Short,coat_unknown,goodwith_children_True,goodwith_children_unknown,goodwith_dogs_True,goodwith_dogs_unknown,goodwith_cats_True,goodwith_cats_unknown
count,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0,6489.0
mean,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0
std,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008,1.00008
min,-0.6901,-5.38903,-0.19213,-2.86887,-1.26894,-0.09497,-0.139,-4.48741,-0.9573,-0.29454,-1.9057,-0.38825,-0.30067,-0.37793,-1.76561,-0.18909,-0.6034,-1.36886,-0.49136,-1.71113,-0.98213,-0.92987
25%,-0.59502,0.18556,-0.19213,0.34857,-1.26894,-0.09497,-0.139,0.22285,-0.9573,-0.29454,0.52474,-0.38825,-0.30067,-0.37793,0.56638,-0.18909,-0.6034,-1.36886,-0.49136,-1.71113,-0.98213,-0.92987
50%,-0.38727,0.18556,-0.19213,0.34857,0.78806,-0.09497,-0.139,0.22285,-0.9573,-0.29454,0.52474,-0.38825,-0.30067,-0.37793,0.56638,-0.18909,-0.6034,0.73053,-0.49136,0.58441,-0.98213,-0.92987
75%,0.12802,0.18556,-0.19213,0.34857,0.78806,-0.09497,-0.139,0.22285,1.04461,-0.29454,0.52474,-0.38825,-0.30067,-0.37793,0.56638,-0.18909,1.65727,0.73053,-0.49136,0.58441,1.0182,1.07542
max,5.46,0.18556,5.20489,0.34857,0.78806,10.52992,7.19417,0.22285,1.04461,3.39515,0.52474,2.57568,3.32586,2.64598,0.56638,5.28855,1.65727,0.73053,2.03515,0.58441,1.0182,1.07542


## Split into training and test sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df.drop(columns='duration_as_adoptable'), 
                                                    scaled_df.duration_as_adoptable, test_size=0.3, 
                                                    random_state=192)

In [11]:
X_train.shape, X_test.shape

((4542, 21), (1947, 21))

In [12]:
y_train.shape, y_test.shape

((4542,), (1947,))

In [13]:
# save training and test sets
datapath = 'data/tt_sets'
save_file(X_train, 'cats_X_train.csv', datapath)
save_file(X_test, 'cats_X_test.csv', datapath)
save_file(y_train, 'cats_y_train.csv', datapath)
save_file(y_test, 'cats_y_test.csv', datapath)

Writing file.  "data/tt_sets/cats_X_train.csv"
Writing file.  "data/tt_sets/cats_X_test.csv"
Writing file.  "data/tt_sets/cats_y_train.csv"
Writing file.  "data/tt_sets/cats_y_test.csv"
