In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## 1. Loading the Data

### 1.1 Combined DataSet

In [2]:
# Load the combined dataset from the local pickle file named 'df'.
# NOTE(review): unpickling can execute arbitrary code; only load a file you created or trust.
# NOTE(review): a later cell (`df_ord.to_pickle('df')`) writes back to this same path, so the
# contents loaded here depend on whether the notebook has already been run — confirm intended.
df = pd.read_pickle('df')
df.head()  # preview the first rows

Unnamed: 0,Postal_Code,Unit_Type,Address,Cost,Home_Type,Bedroom_Num,Bathroom_Num,Area_SqFt,Neighborhood,boro_names,median_rent,median_income,car_free_commute,population_density,poverty_rate,crime,subway,park,income_diversity_ratio,labor_force_rate
0,11378.0,1,"6155 60th Pl, Maspeth, NY 11378",2650.0,1,3.0,1.0,1375.0,1,1,2543.07,54870.01,0.78,22.53,0.14,1.766784,0.89,0.79,4.07,0.67
1,11361.0,2,"217th St, Bayside, NY 11361",1250.0,2,0.0,1.0,550.0,2,1,1962.32,61084.74,0.34,12.87,0.07,1.088525,0.21,0.69,4.25,0.63
2,10001.0,1,"247 W 26th St APT 5B, New York, NY 10001",2500.0,3,1.0,1.0,500.0,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76
3,10001.0,1,"358 W 30th St APT 2B, New York, NY 10001",2350.0,3,0.0,1.0,,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76
4,10001.0,1,"420 W 25th St APT 7K, New York, NY 10001",5900.0,3,1.0,1.0,893.0,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76


In [3]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5140 entries, 0 to 5322
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Postal_Code             5140 non-null   float64
 1   Unit_Type               5140 non-null   int32  
 2   Address                 5140 non-null   object 
 3   Cost                    5140 non-null   float64
 4   Home_Type               5140 non-null   int32  
 5   Bedroom_Num             5140 non-null   float64
 6   Bathroom_Num            5071 non-null   float64
 7   Area_SqFt               2652 non-null   float64
 8   Neighborhood            5140 non-null   int32  
 9   boro_names              5140 non-null   int32  
 10  median_rent             5140 non-null   float64
 11  median_income           5140 non-null   float64
 12  car_free_commute        5140 non-null   float64
 13  population_density      5140 non-null   float64
 14  poverty_rate            5140 non-null   

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5140, 20)

## 2. Encoding the Data

In [5]:
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

In [6]:
# Ordinal encoder restricted to the four categorical columns.
# NOTE(review): the mapping printed further down appears to be an identity mapping
# (1 -> 1, 2 -> 2, ...), which suggests these columns are already integer codes —
# confirm this encoding step is still needed.
ce_ord = ce.OrdinalEncoder(
    cols=[
        'Unit_Type',
        'Home_Type',
        'Neighborhood',
        'boro_names',
    ],
)

In [7]:
# Fit the ordinal encoder on `df` and store the transformed frame as `df_ord`.
# NOTE(review): the encoder is fit on the full dataset, before the train/test split below;
# confirm that is acceptable or move the encoding into a pipeline fit on the training set only.
df_ord = ce_ord.fit_transform(df)
df_ord.head()  # preview the encoded frame

Unnamed: 0,Postal_Code,Unit_Type,Address,Cost,Home_Type,Bedroom_Num,Bathroom_Num,Area_SqFt,Neighborhood,boro_names,median_rent,median_income,car_free_commute,population_density,poverty_rate,crime,subway,park,income_diversity_ratio,labor_force_rate
0,11378.0,1,"6155 60th Pl, Maspeth, NY 11378",2650.0,1,3.0,1.0,1375.0,1,1,2543.07,54870.01,0.78,22.53,0.14,1.766784,0.89,0.79,4.07,0.67
1,11361.0,2,"217th St, Bayside, NY 11361",1250.0,2,0.0,1.0,550.0,2,1,1962.32,61084.74,0.34,12.87,0.07,1.088525,0.21,0.69,4.25,0.63
2,10001.0,1,"247 W 26th St APT 5B, New York, NY 10001",2500.0,3,1.0,1.0,500.0,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76
3,10001.0,1,"358 W 30th St APT 2B, New York, NY 10001",2350.0,3,0.0,1.0,,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76
4,10001.0,1,"420 W 25th St APT 7K, New York, NY 10001",5900.0,3,1.0,1.0,893.0,3,2,3545.57,82393.55,0.89,47.01,0.13,4.959078,0.97,0.71,7.32,0.76


In [8]:
# Inspect the fitted category -> integer mapping for each encoded column.
ce_ord.category_mapping

[{'col': 'Unit_Type',
  'mapping': 1.0    1
  2.0    2
  3.0    3
  4.0    4
  5.0    5
  NaN   -2
  dtype: int64,
  'data_type': dtype('int32')},
 {'col': 'Home_Type',
  'mapping': 1.0    1
  2.0    2
  3.0    3
  4.0    4
  5.0    5
  NaN   -2
  dtype: int64,
  'data_type': dtype('int32')},
 {'col': 'Neighborhood',
  'mapping': 1.0      1
  2.0      2
  3.0      3
  4.0      4
  5.0      5
  6.0      6
  7.0      7
  8.0      8
  9.0      9
  10.0    10
  11.0    11
  12.0    12
  13.0    13
  14.0    14
  15.0    15
  16.0    16
  17.0    17
  18.0    18
  19.0    19
  20.0    20
  21.0    21
  22.0    22
  23.0    23
  24.0    24
  25.0    25
  26.0    26
  27.0    27
  28.0    28
  29.0    29
  30.0    30
  31.0    31
  32.0    32
  33.0    33
  34.0    34
  35.0    35
  36.0    36
  37.0    37
  38.0    38
  39.0    39
  40.0    40
  41.0    41
  42.0    42
  43.0    43
  44.0    44
  45.0    45
  46.0    46
  47.0    47
  48.0    48
  49.0    49
  50.0    50
  51.0    51
  52.0 

In [9]:
# Persist the encoded frame to disk.
# NOTE(review): this writes to the same path ('df') that the load cell at the top reads,
# so the original input file is overwritten and a re-run starts from already-encoded data.
# Consider saving under a distinct filename if the raw input must stay recoverable.
df_ord.to_pickle('df')

## 3. Missing Data

- **BATHROOM MISSING DATA**
- **AREA MISSING DATA**

Unsure whether to fill in (impute) these missing values now or later, inside a machine learning pipeline.

## 4. Train / Test Split

In [10]:
# Feature matrix: every column except the free-text address and the target.
X = df_ord.drop(['Address', 'Cost'], axis='columns')
# Target vector: the listing cost.
y = df_ord['Cost']
X.shape, y.shape

((5140, 18), (5140,))

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)

In [12]:
# Feature partitions: row counts should reflect the 70/30 split and column counts should match.
X_train.shape, X_test.shape

((3598, 18), (1542, 18))

In [13]:
# Target partitions: lengths should match the row counts of X_train / X_test.
y_train.shape, y_test.shape

((3598,), (1542,))

### 4.1 Scaling the Data

In [14]:
# Standardize the features with scikit-learn's StandardScaler.
scaler = StandardScaler()

# Learn the scaling statistics from the training set ONLY, then reuse those same
# statistics to transform the test set. Calling fit_transform on X_test would refit
# the scaler on held-out data: that leaks test-set information and leaves train and
# test on different scales, so a model fit on X_tr_scaled would see inconsistent inputs.
X_tr_scaled = scaler.fit_transform(X_train)
X_te_scaled = scaler.transform(X_test)

# When should I be scaling my data, encoding my data, filling in missing values? Is there a "correct order"?