In [1]:
import os
import sys

# Expose the `src` folder that sits next to the parent of the current working
# directory, so the project packages can be imported. The membership guard
# keeps the entry from being appended again when the cell is re-run.
setup_dir = os.path.join(os.getcwd(), '..')
src_dir = os.path.join(setup_dir, 'src')
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

In [2]:
# Folders holding the cleaned inputs and the intermediate (feature-engineered)
# outputs, relative to the notebook's working directory.
CLEANED_DATA_FOLDER = '../data/cleaned/'
# Backward-compatible alias: the constant was originally spelled with a
# lowercase "l"; keep the old name so existing references still resolve.
ClEANED_DATA_FOLDER = CLEANED_DATA_FOLDER
INTERMEDIATE_DATA_FOLDER = '../data/intermediate'

# Cleaned CSVs read by this notebook.
train_data_cleaned_path = os.path.join(CLEANED_DATA_FOLDER, 'train_data.csv')
test_data_cleaned_path = os.path.join(CLEANED_DATA_FOLDER, 'test_data.csv')

# Destination CSVs for the engineered data.
train_data_int_path = os.path.join(INTERMEDIATE_DATA_FOLDER, 'train_data.csv')
test_data_int_path = os.path.join(INTERMEDIATE_DATA_FOLDER, 'test_data.csv')

### Import packages

In [3]:
import numpy as np
import pandas as pd

from data_preparation.data_utils import load_data
from modeling.feature_engineering import *

### Load Cleaned Data

In [4]:
# Read the cleaned train and test splits (train first, then test); the
# recorded output shows record and dimension counts for each file.
train_data, test_data = map(
    load_data,
    (train_data_cleaned_path, test_data_cleaned_path),
)

num records:  11381
num dimensions:  27
num records:  4402
num dimensions:  26


### New Features

In [5]:
# House age, derived on both the train and the test split.
train_data = train_data.pipe(add_house_age)
test_data = test_data.pipe(add_house_age)

In [6]:
# Transaction 'age' (applied to the training split only in this cell).
train_data = train_data.pipe(add_transaction_age)

In [7]:
# Price per square foot (applied to the training split only in this cell).
train_data = train_data.pipe(add_price_per_sqft)

In [8]:
# Residential, garage and view flags (training split only in this cell).
residential_codes = ['R{}'.format(code) for code in range(9)]  # 'R0' ... 'R8'
train_data = (
    train_data
    .pipe(add_residential, residential_codes)
    .pipe(add_has_garage)
    .pipe(add_has_view)
)

In [9]:
# Distance to the city center (training split only in this cell).
train_data = train_data.pipe(add_distance_to_center)

### Categorical Features

In [10]:
# Expand ViewType into indicator columns; the recorded column listing shows
# the resulting ViewType_<code> features.
train_data = add_dummies_categorical(
    data=train_data,
    train_data=train_data,
    feature='ViewType',
)

In [11]:
# Target-encode ZoneCodeCounty against the sale price.
# NOTE(review): the training frame is encoded with itself as the reference;
# unless the helper smooths or uses out-of-fold statistics this can leak the
# target into the feature — confirm against the helper's implementation.
train_data = target_encode_categorical(
    feature='ZoneCodeCounty',
    target_feature='SaleDollarCnt',
    data=train_data,
    train_data=train_data,
)

### Drop Columns

In [12]:
# Columns removed from the training frame before it is written out.
drop_columns = ['PropertyID', 'TransDate', 'censusblockgroup', 'Usecode', 'BGMedYearBuilt']

train_data = train_data.drop(labels=drop_columns, axis='columns')

In [13]:
# Inspect the final set of training columns before the frame is saved.
# NOTE(review): the recorded output lists both 'residential' and 'Residential';
# confirm whether the duplicate is intended or a leftover from the cleaned data
# or an earlier run of the cell.
train_data.columns

Index(['SaleDollarCnt', 'BedroomCnt', 'BathroomCnt', 'FinishedSquareFeet',
       'GarageSquareFeet', 'LotSizeSquareFeet', 'StoryCnt', 'Latitude',
       'Longitude', 'BGMedHomeValue', 'BGMedRent', 'BGPctOwn', 'BGPctVacant',
       'BGMedIncome', 'BGPctKids', 'BGMedAge',
       'BGMedHomeValue_missing_replaced', 'BGMedRent_missing_replaced',
       'BGMedYearBuilt_missing_replaced', 'HomeAge', 'DaysToTrans',
       'BGSqFtPrice_lot', 'BGSqFtPrice_house', 'residential', 'Residential',
       'HasGarage', 'HasView', 'distance_to_center', 'ViewType_0.0',
       'ViewType_78.0', 'ViewType_79.0', 'ViewType_82.0', 'ViewType_241.0',
       'ViewType_244.0', 'ViewType_246.0', 'ViewType_247.0',
       'target_encoded_ZoneCodeCounty'],
      dtype='object')

### Save Intermediate Data

In [14]:
# Persist the engineered training frame. Create the intermediate folder first:
# to_csv does not create missing directories and would fail on a fresh checkout.
# NOTE(review): test_data_int_path is defined but test_data is never written in
# the visible cells — confirm whether the test split should also be transformed
# and saved.
os.makedirs(INTERMEDIATE_DATA_FOLDER, exist_ok=True)
train_data.to_csv(train_data_int_path, index=False)