In [1]:
# importing data...

LOCAL = '../data/nyc/nyc-rent-2016.csv'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

import pandas as pd
df = pd.read_csv(WEB)

In [2]:
# getting data ready for test/train split...

df['created'] = pd.to_datetime(df['created'], infer_datetime_format=True)
df['month'] = df['created'].dt.month

In [3]:
# test/train split...

train = df.query('month < 6')
test = df.query('month == 6')

In [4]:
# making a baseline model and calculating baseline error...

import numpy as np
from sklearn.metrics import mean_absolute_error

y_test = test['price']
y_pred = np.full_like(y_test, fill_value = train['price'].mean())

mean_absolute_error(y_test, y_pred)

1052.5193327375632

In [5]:
# some initial data exploration...

import pandas_profiling
df.profile_report()



In [26]:
# looking at features correlated with price...

df.corr()['price'].sort_values()

longitude              -0.315100
latitude               -0.074995
pre-war                -0.032525
loft                   -0.030919
exclusive              -0.020510
laundry_in_building    -0.010473
month                   0.011583
common_outdoor_space    0.023859
wheelchair_access       0.057820
new_construction        0.059502
cats_allowed            0.071227
dogs_allowed            0.079009
garden_patio            0.093430
high_speed_internet     0.097360
hardwood_floors         0.103603
swimming_pool           0.121068
terrace                 0.125204
balcony                 0.133662
roof_deck               0.136020
no_fee                  0.143255
outdoor_space           0.146489
dining_room             0.195146
elevator                0.221581
dishwasher              0.239491
fitness_center          0.240836
laundry_in_unit         0.245388
doorman                 0.289534
bedrooms                0.559335
bathrooms               0.627443
price                   1.000000
Name: pric

In [24]:
# looking at features with highest absolute correlation with price...

df.corr().abs()['price'].sort_values(ascending=False)

price                   1.000000
bathrooms               0.627443
bedrooms                0.559335
longitude               0.315100
doorman                 0.289534
laundry_in_unit         0.245388
fitness_center          0.240836
dishwasher              0.239491
elevator                0.221581
dining_room             0.195146
outdoor_space           0.146489
no_fee                  0.143255
roof_deck               0.136020
balcony                 0.133662
terrace                 0.125204
swimming_pool           0.121068
hardwood_floors         0.103603
high_speed_internet     0.097360
garden_patio            0.093430
dogs_allowed            0.079009
latitude                0.074995
cats_allowed            0.071227
new_construction        0.059502
wheelchair_access       0.057820
pre-war                 0.032525
loft                    0.030919
common_outdoor_space    0.023859
exclusive               0.020510
month                   0.011583
laundry_in_building     0.010473
Name: pric

In [91]:
# doing linear regression with multiple features and one-hot encoding on 
# interest_level, printing mean absolute error, coefficients and intercept...

from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
import warnings; warnings.simplefilter('ignore')

features = [
    'bedrooms',
    'bathrooms',
    'doorman', 
    'laundry_in_unit',
    'dishwasher',
    'elevator',
    'longitude',
    'pre-war',
    'exclusive',
    'laundry_in_building',
    'interest_level',
]
target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Test MAE:', mean_absolute_error(y_test, y_pred), '\n')
print('Intercept\t ', model.intercept_)

features.remove('interest_level')
features.append('interest_level_high')
features.append('interest_level_low')
features.append('interest_level_medium')

print(pd.Series(model.coef_, features))

Test MAE: 599.4899541709211 

Intercept	  3432.7534190069305
bedrooms                 553.813681
bathrooms                523.917951
doorman                  181.321281
laundry_in_unit          114.748209
dishwasher                 1.063386
elevator                  74.662149
longitude               -362.374467
pre-war                  -36.317437
exclusive                 20.706651
laundry_in_building      -31.280502
interest_level_high     -111.508530
interest_level_low       135.086621
interest_level_medium    -76.815826
dtype: float64
