In [1]:
LOCAL = '../data/nyc/nyc-rent-2016.csv'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

import pandas as pd
df = pd.read_csv(WEB)
assert df.shape == (48300, 34)

## Data Exploration

In [2]:
target = 'price'
df.isna().sum()

bathrooms                  0
bedrooms                   0
created                    0
description             1421
display_address          132
latitude                   0
longitude                  0
price                      0
street_address            10
interest_level             0
elevator                   0
cats_allowed               0
hardwood_floors            0
dogs_allowed               0
doorman                    0
dishwasher                 0
no_fee                     0
laundry_in_building        0
fitness_center             0
pre-war                    0
laundry_in_unit            0
roof_deck                  0
outdoor_space              0
dining_room                0
high_speed_internet        0
balcony                    0
swimming_pool              0
new_construction           0
exclusive                  0
terrace                    0
loft                       0
garden_patio               0
common_outdoor_space       0
wheelchair_access          0
dtype: int64

## Basic Wrangle

In [3]:
def basic_wrangle(df):
    df = df.copy()
    
    #Create an is_missing column for every column with any NANs
    for col in df.columns:
        if df[col].isna().sum():
            df[f'{col}_missing'] = df[col].isna()
            
    #Fill numeric columns with mean
    for col in df.select_dtypes(include='number').columns:
        df[col] = df[col].fillna(value=df[col].mean)
    
    #Fill non-numeric columns with 'MISSING'
    for col in df.select_dtypes(exclude='number').columns:
        df[col] = df[col].fillna(value='MISSING')
    return df
df_w = basic_wrangle(df)
df_w.isna().sum()

bathrooms                  0
bedrooms                   0
created                    0
description                0
display_address            0
latitude                   0
longitude                  0
price                      0
street_address             0
interest_level             0
elevator                   0
cats_allowed               0
hardwood_floors            0
dogs_allowed               0
doorman                    0
dishwasher                 0
no_fee                     0
laundry_in_building        0
fitness_center             0
pre-war                    0
laundry_in_unit            0
roof_deck                  0
outdoor_space              0
dining_room                0
high_speed_internet        0
balcony                    0
swimming_pool              0
new_construction           0
exclusive                  0
terrace                    0
loft                       0
garden_patio               0
common_outdoor_space       0
wheelchair_access          0
description_mi

## Train/Test Split

In [4]:
df_w.head(1)

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,...,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access,description_missing,display_address_missing,street_address_missing
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,...,0,0,0,0,0,0,0,False,False,False


In [5]:
df_w['dt'] = pd.to_datetime(df_w['created'])
df_w['month'] = df_w['dt'].dt.month
df_w['year'] = df_w['dt'].dt.year
df_w.isna().sum()

bathrooms                  0
bedrooms                   0
created                    0
description                0
display_address            0
latitude                   0
longitude                  0
price                      0
street_address             0
interest_level             0
elevator                   0
cats_allowed               0
hardwood_floors            0
dogs_allowed               0
doorman                    0
dishwasher                 0
no_fee                     0
laundry_in_building        0
fitness_center             0
pre-war                    0
laundry_in_unit            0
roof_deck                  0
outdoor_space              0
dining_room                0
high_speed_internet        0
balcony                    0
swimming_pool              0
new_construction           0
exclusive                  0
terrace                    0
loft                       0
garden_patio               0
common_outdoor_space       0
wheelchair_access          0
description_mi

In [6]:
test = df_w[df_w['month'].isin([6])].copy()
train = df_w[df_w['month'].isin([4, 5])].copy()
assert test.shape[0] + train.shape[0] == df.shape[0]

## Baseline

In [7]:
test['pred'] = train[target].mean()

In [8]:
from sklearn.metrics import mean_absolute_error as mae
mae(test['pred'], test[target])

1052.6798017842948

## Linear Regression Baseline

In [9]:
from sklearn.linear_model import LinearRegression as lr

In [10]:
drop_me = list(train.select_dtypes(exclude='number').columns) + ['price']
#drops all non-numeric columns
features = train.drop(columns=drop_me).columns
model = lr()
model.fit(train[features], train[target])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
test_pred = model.predict(test[features])
mae(test_pred, test[target])

624.7001872025197

## Linear Regression Ordinal Encoding

### Feature Engineering

In [12]:
df = pd.read_csv(WEB)
df['description_length'] = len(df['description'])
df_w = basic_wrangle(df)
df_w['dt'] = pd.to_datetime(df_w['created'])
df_w['month'] = df_w['dt'].dt.month
df_w['year'] = df_w['dt'].dt.year

df.shape, df_w.shape

((48300, 35), (48300, 41))

In [13]:
test = df_w[df_w['month'].isin([6])].copy()
train = df_w[df_w['month'].isin([4, 5])].copy()
assert test.shape[0] + train.shape[0] == df.shape[0]

train.describe(exclude='number')

Unnamed: 0,created,description,display_address,street_address,interest_level,description_missing,display_address_missing,street_address_missing,dt
count,31515,31515,31515,31515,31515,31515,31515,31515,31515
unique,31116,25483,6493,11248,3,2,2,2,31116
top,2016-05-14 05:23:52,MISSING,Broadway,505 West 37th Street,low,False,False,False,2016-05-02 03:41:36
freq,3,966,268,120,21613,30549,31447,31509,3
first,,,,,,,,,2016-04-01 22:12:41
last,,,,,,,,,2016-05-31 23:10:48


In [14]:
drop_me_ordinal = [target, 'created', 'description', 'dt', 'street_address']

x_test = test.drop(columns=drop_me)
x_train = train.drop(columns=drop_me)
y_test = test[target]
y_train = train[target]

x_test.shape

(16785, 31)

In [15]:
from sklearn.pipeline import make_pipeline
import category_encoders as ce

def pre_processing(train, test):
    pipeline = make_pipeline(ce.OrdinalEncoder())
    pipeline.fit(train)
    
    x_train = pd.DataFrame(pipeline.transform(train))
    x_train.columns = train.columns
   
    x_test = pd.DataFrame(pipeline.transform(test))
    x_test.columns = test.columns
    
    return x_train, x_test

In [16]:
features = x_train.columns

pp_train, pp_test=pre_processing(x_train[features], x_test[features])
pp_train.columns

Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'elevator',
       'cats_allowed', 'hardwood_floors', 'dogs_allowed', 'doorman',
       'dishwasher', 'no_fee', 'laundry_in_building', 'fitness_center',
       'pre-war', 'laundry_in_unit', 'roof_deck', 'outdoor_space',
       'dining_room', 'high_speed_internet', 'balcony', 'swimming_pool',
       'new_construction', 'exclusive', 'terrace', 'loft', 'garden_patio',
       'common_outdoor_space', 'wheelchair_access', 'description_length',
       'month', 'year'],
      dtype='object')

In [24]:
model = lr(normalize=True)
model.fit(pp_train, y_train)
dir(model)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_decision_function',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_preprocess_data',
 '_residues',
 '_set_intercept',
 'coef_',
 'copy_X',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'n_jobs',
 'normalize',
 'predict',
 'rank_',
 'score',
 'set_params',
 'singular_']

In [30]:
feature_importance = pd.DataFrame({'features':pp_train.columns, 'coef':model.coef_, 'inter':model.intercept_})
f_im = feature_importance.sort_values(by=['coef'])
f_im

Unnamed: 0,features,coef,inter
16,high_speed_internet,-300.203742,737.051694
9,laundry_in_building,-236.954555,737.051694
19,new_construction,-172.385869,737.051694
4,hardwood_floors,-163.762838,737.051694
8,no_fee,-147.070832,737.051694
13,roof_deck,-114.767954,737.051694
23,garden_patio,-67.57966,737.051694
14,outdoor_space,-61.282337,737.051694
11,pre-war,-54.648445,737.051694
3,cats_allowed,-54.141596,737.051694


In [19]:
test_pred = model.predict(pp_test)
mae(test_pred, test[target])

624.7001872025243

### Linear Regression post-feature importances

Attempting to remove noise

In [23]:
noise = ['latitude', 'longitude', 'description_length']


features = x_train.drop(columns=noise).columns

pp_train, pp_test=pre_processing(x_train[features], x_test[features])
model = lr(normalize=True)
model.fit(pp_train, y_train)

test_pred = model.predict(pp_test)
mae(test_pred, test[target])

686.1394864316793

## Gradient Boosting

In [134]:
from xgboost import XGBRegressor as xgb

model = xgb()
model.fit(pp_train, y_train)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [135]:
test_pred = model.predict(pp_test)
mae(test_pred, test[target])

479.17352403647