In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(2012)

# Configure visual settings:
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use(['bmh'])

# Load the dataframe
data = pd.read_pickle('assets/engineered_data2.p')

In [2]:
data.iloc[0,:]

help_packing                                                            Yes
boxes                                                                    45
furniture                 Sectional Sofa (11'6"x7'10"x2'9"), Desk (5'x2'...
reference                       Good reviews online. Seen truck everywhere.
mention                   Would like to receive two estimates: one for p...
est_hours                                                              3.25
truck                                                              Behemoth
num_movers                                                                3
rate                                                                    200
travel_fee                                                                1
min_hours                                                                 3
loc1.lengthOfWalkOptID                                                    6
loc1.sqFt                                                               800
loc1.elevato

In [3]:
# Final cleaning. 
# Remove moves that were done with NAT (not a truck):
data = data[data['truck_type'] != 'NAT']

In [4]:
# Set 'help packing' to 1 if answer is yes, else 0:

data['help_packing'] = [1 if x == 'Yes' else 0 for x in data['help_packing']]

In [5]:
# Turn placeType into two new variables: number of rooms and type of place

data['loc1.placeType'] = data['loc1.placeType'].replace(to_replace='1 bedroom apartment + den/office', value = '2 bedroom apartment')

data['loc2.placeType'] = data['loc2.placeType'].replace(to_replace='1 bedroom apartment + den/office', value = '2 bedroom apartment')

data['loc2.placeType'] = data['loc2.placeType'].replace(to_replace = '5 bedroom townhouse apartment', value = '5 bedroom apartment')
data['loc2.placeType'] = data['loc2.placeType'].replace(to_replace = '1 bedroom condo apartment', value = '1 bedroom apartment')

data['loc1.placeType'].value_counts()

import re


def find_rooms(room_string):
    
    pattern = re.compile('[\d+]')

    foo = re.match(pattern, room_string)

    try:
        return(foo.group())
    except:
        return(1)


data['loc1.rooms'] = [find_rooms(x) for x in data['loc1.placeType']]
data['loc2.rooms'] = [find_rooms(x) for x in data['loc2.placeType']]

def get_place_type(placeType_string):
    if 'apartment' in placeType_string:
        return('Apartment')
    if 'house' in placeType_string:
        return('House')
    if 'Storage' in placeType_string:
        return('Storage')
    else:
        return('Other')

data['loc1.type'] = [get_place_type(x) for x in data['loc1.placeType']]
data['loc2.type'] = [get_place_type(x) for x in data['loc2.placeType']]

In [6]:
# Set up target variables:

target_variables = ['num_movers','est_hours','truck_type','rate','travel_fee','min_hours','cc_hours']

data.loc[0,target_variables]

num_movers       3
est_hours     3.25
truck_type     Big
rate           200
travel_fee       1
min_hours        3
cc_hours       5.5
Name: 0, dtype: object

In [7]:
# Set up predictor variables:

predictor_variables = ['help_packing','boxes','furniture','reference','mention','loc1.lengthOfWalkOptID',
 'loc1.sqFt', 'loc1.elevatorType', 'loc1.stairs', 'loc1.parkingType', 'loc1.type',
 'loc2.lengthOfWalkOptID', 'loc2.sqFt', 'loc2.elevatorType', 'loc2.stairs',
 'loc2.parkingType', 'loc2.type','driving_distance', 'loc1.rooms','loc2.rooms']

data.loc[0,predictor_variables]

help_packing                                                              1
boxes                                                                    45
furniture                 Sectional Sofa (11'6"x7'10"x2'9"), Desk (5'x2'...
reference                       Good reviews online. Seen truck everywhere.
mention                   Would like to receive two estimates: one for p...
loc1.lengthOfWalkOptID                                                    6
loc1.sqFt                                                               800
loc1.elevatorType                                                     Large
loc1.stairs                                                               0
loc1.parkingType                                               Loading Dock
loc1.type                                                         Apartment
loc2.lengthOfWalkOptID                                                    6
loc2.sqFt                                                               700
loc2.elevato

In [8]:
# Identify miscellaneous variables:

misc_variables = [x for x in [x for x in data.columns if x not in predictor_variables] if x not in target_variables]
data.loc[0, misc_variables]

truck                                                            Behemoth
loc1.placeType                                        1 bedroom apartment
loc1.lat                                                          38.8339
loc1.lng                                                         -77.0624
loc1.geocodedAddress    3201 Landover St #1723, Alexandria, VA 22305, USA
loc2.placeType                                        1 bedroom apartment
loc2.lat                                                          38.8573
loc2.lng                                                         -77.0604
loc2.geocodedAddress         815 18th St S #205, Arlington, VA 22202, USA
actualRate                                                            200
pre_move_notes          - yelper \n- travel fee: 0.5 hr\n- 1 br apt to...
estSeconds1                                                          1958
estSeconds2                                                           585
loc1.fullAddress        3201 Landover 

In [9]:
categorical_variables = ['loc1.elevatorType','loc1.parkingType','loc1.type','loc2.elevatorType','loc2.parkingType','loc2.type']

In [10]:
data[categorical_variables].head(10)

Unnamed: 0,loc1.elevatorType,loc1.parkingType,loc1.type,loc2.elevatorType,loc2.parkingType,loc2.type
0,Large,Loading Dock,Apartment,,Parking Lot,Apartment
1,,Street,Apartment,,Street,Apartment
3,Medium,Loading Dock,Apartment,,Parking Lot,Apartment
4,Medium,Driveway,Apartment,,Driveway,House
7,Small,Driveway,Apartment,,Street,House
8,,Street,Apartment,Medium,Parking Lot,Storage
10,,Parking Lot,House,,Driveway,House
12,Medium,Loading Dock,Apartment,,Street,House
13,Medium,Loading Dock,Apartment,,Street,Other
14,,Driveway,Apartment,,Driveway,House


In [11]:
numerical_variables = ['help_packing','boxes','loc1.sqFt','loc2.sqFt','loc1.stairs','loc2.stairs','driving_distance','loc1.rooms','loc2.rooms']

In [12]:
data[predictor_variables].dtypes

help_packing                int64
boxes                     float64
furniture                  object
reference                  object
mention                    object
loc1.lengthOfWalkOptID    float64
loc1.sqFt                 float64
loc1.elevatorType          object
loc1.stairs                 int64
loc1.parkingType           object
loc1.type                  object
loc2.lengthOfWalkOptID    float64
loc2.sqFt                 float64
loc2.elevatorType          object
loc2.stairs                 int64
loc2.parkingType           object
loc2.type                  object
driving_distance          float64
loc1.rooms                 object
loc2.rooms                 object
dtype: object

In [13]:
text_variables = ['furniture','reference','mention']

In [14]:
ordinal_variables = ['loc1.lengthOfWalkOptID','loc2.lengthOfWalkOptID']

In [15]:
assert len(predictor_variables) == len(categorical_variables + numerical_variables + text_variables + ordinal_variables)

In [16]:
model_data = data.drop(misc_variables, axis=1, inplace=False)
model_data = pd.get_dummies(model_data, columns = categorical_variables)
categorical_to_drop = ['loc1.elevatorType_None','loc2.elevatorType_None',
                       'loc1.parkingType_Not specified','loc2.parkingType_Not specified',
                      'loc1.type_Other','loc2.type_Other']
model_data.drop(categorical_to_drop, axis=1, inplace=True)


model_data.to_pickle('assets\model_data.p')

In [17]:
variable_names = (target_variables, predictor_variables, categorical_variables, numerical_variables, text_variables, ordinal_variables)

In [19]:
import pickle
with open('var_names.p', 'wb') as f:
    pickle.dump(variable_names, f)

In [23]:
#
categorical_variables

['loc1.elevatorType',
 'loc1.parkingType',
 'loc1.type',
 'loc2.elevatorType',
 'loc2.parkingType',
 'loc2.type']