In [61]:
# Load the nyc-rent-2016 CSV into the DataFrame used by every later cell.

import pandas as pd

# Two locations are recorded for the same file name; only the hosted copy
# (WEB) is actually read below, LOCAL is kept for reference.
LOCAL = '../data/nyc/nyc-rent-2016.csv'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

df = pd.read_csv(WEB)

In [62]:
# Parse the 'created' timestamps and derive a numeric 'month' column; the
# train/test split further down filters on it.

# `infer_datetime_format` is deprecated as of pandas 2.0 (a strict version of
# that inference is now the default behaviour), so it is no longer passed.
df['created'] = pd.to_datetime(df['created'])
df['month'] = df['created'].dt.month

In [63]:
# Summarise every column that is not numeric, to see which of them could be
# turned into model features.

df.describe(exclude=['number'])

Unnamed: 0,created,description,display_address,street_address,interest_level
count,48300,46879.0,48168,48290,48300
unique,47643,37490.0,8691,15093,3
top,2016-06-12 13:20:45,,Broadway,3333 Broadway,low
freq,3,1614.0,424,174,33270
first,2016-04-01 22:12:41,,,,
last,2016-06-29 21:41:47,,,,


In [64]:
# Encode interest_level as an ordinal number: low=1, medium=2, high=3.

interest_codes = {'low': 1, 'medium': 2, 'high': 3}
df['interest_level'] = df['interest_level'].replace(interest_codes)

In [65]:
# Add the character count of each description as a feature; rows with a
# missing description get a length of 0.

df['description_length'] = df['description'].str.len().fillna(0)

In [66]:
# Collapse cats_allowed and dogs_allowed into a single 0/1 pets_allowed flag
# (1 when either source column equals 1), then drop the two source columns.

import numpy as np

pet_columns = ['cats_allowed', 'dogs_allowed']
allows_any_pet = df[pet_columns].eq(1).any(axis=1)

df['pets_allowed'] = np.where(allows_any_pet, 1, 0)

df = df.drop(columns=pet_columns)

In [76]:
# Feature search: starting from a fixed list of "keeper" features, try adding
# every combination of N_NEW_FEATURES of the remaining numeric columns, keep
# the combination with the lowest mean absolute error, then print that
# model's MAE, intercept and coefficients.

import itertools
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# NOTE(review): this silences every warning for the rest of the session,
# deprecation warnings included; consider narrowing the filter.
warnings.simplefilter('ignore')

# How many extra columns are added on top of keeper_features in each trial.
N_NEW_FEATURES = 1


def fit_and_score(train_df, test_df, columns, target_column):
    """Fit a linear regression on standardized features and score it.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to fit on and to evaluate on, respectively.
    columns : list of str
        Feature columns to select from both frames.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    dict
        'scaler'  - the StandardScaler fitted on the training features,
        'model'   - the fitted LinearRegression,
        'X_train' - scaled training features,
        'X_test'  - test features scaled with the training statistics,
        'y_pred'  - predictions for test_df,
        'mae'     - mean absolute error of y_pred against the test target.
    """
    scaler = StandardScaler()
    # The scaler is fitted on the training rows only and then reused as-is
    # for the evaluation rows.
    X_train = scaler.fit_transform(train_df[columns])
    X_test = scaler.transform(test_df[columns])

    model = LinearRegression()
    model.fit(X_train, train_df[target_column])
    y_pred = model.predict(X_test)

    return {
        'scaler': scaler,
        'model': model,
        'X_train': X_train,
        'X_test': X_test,
        'y_pred': y_pred,
        'mae': mean_absolute_error(test_df[target_column], y_pred),
    }


target = 'price'

# Candidate features: every numeric column except the target.
features = df.drop(
    columns=target).describe(
    include='number').columns.tolist()

# Rows with month < 6 are used for fitting, rows with month == 6 for scoring.
train = df.query('month < 6')
test = df.query('month == 6')

# The targets do not depend on the feature subset, so they are built once.
y_train = train[target]
y_test = test[target]

keeper_features = ['swimming_pool',
                   'exclusive',
                   'balcony',
                   'outdoor_space',
                   'wheelchair_access',
                   'terrace',
                   'dining_room',
                   'pets_allowed',
                   'pre-war',
                   'new_construction',
                   'fitness_center',
                   'roof_deck',
                   'laundry_in_building',
                   'dishwasher',
                   'hardwood_floors',
                   'elevator',
                   'high_speed_internet',
                   'doorman',
                   'bathrooms',
                   'laundry_in_unit',
                   'interest_level',
                   'longitude',
                   'bedrooms']
other_features = [name for name in features if name not in keeper_features]

other_features_combo = list(
    itertools.combinations(other_features, N_NEW_FEATURES))

# Maps each candidate combination (a tuple of column names) to its MAE.
# NOTE(review): candidates are ranked by their error on `test`, so the MAE
# printed below is measured on the same rows that chose the features and is
# likely optimistic; a separate validation split would avoid that.
dictionary = {
    combo: fit_and_score(
        train, test, list(combo) + keeper_features, target)['mae']
    for combo in other_features_combo
}

best_new_features_combo = min(dictionary, key=dictionary.get)

# Refit on the winning combination and expose the fit under the same
# notebook-level names as before.
best_columns = list(best_new_features_combo) + keeper_features
best_fit = fit_and_score(train, test, best_columns, target)

scaler = best_fit['scaler']
model = best_fit['model']
X_train = best_fit['X_train']
X_test = best_fit['X_test']
y_pred = best_fit['y_pred']

print('Test MAE:', mean_absolute_error(y_test, y_pred), '\n')
print('Intercept\t ', model.intercept_)

print(pd.Series(model.coef_, best_columns))

Test MAE: 592.1191937354383 

Intercept	  3432.7534190069314
description_length      -8.398076
loft                     4.945850
swimming_pool            7.311458
exclusive               11.041873
balcony                  0.985995
outdoor_space          -27.642459
wheelchair_access       19.313444
terrace                 18.931583
dining_room             26.139830
pets_allowed            26.673724
pre-war                -34.155127
new_construction       -35.900885
fitness_center          31.812507
roof_deck              -44.282109
laundry_in_building    -41.131670
dishwasher              46.914727
hardwood_floors        -71.535258
elevator                93.867707
high_speed_internet    -73.229465
doorman                189.251386
bathrooms              519.323862
laundry_in_unit        133.340064
interest_level        -233.987455
longitude             -364.771990
bedrooms               555.907008
dtype: float64


In [70]:
# Fit a linear regression on a fixed, hand-listed set of features and report
# its test MAE, intercept and per-feature coefficients.

target = 'price'
features = [
    'loft', 'swimming_pool', 'exclusive', 'balcony', 'outdoor_space',
    'wheelchair_access', 'terrace', 'dining_room', 'pets_allowed',
    'pre-war', 'new_construction', 'fitness_center', 'roof_deck',
    'laundry_in_building', 'dishwasher', 'hardwood_floors', 'elevator',
    'high_speed_internet', 'doorman', 'bathrooms', 'laundry_in_unit',
    'interest_level', 'longitude', 'bedrooms',
]

y_train, y_test = train[target], test[target]

# Standardize with statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(train[features])
X_test = scaler.transform(test[features])

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Test MAE:', mean_absolute_error(y_test, y_pred), '\n')
print('Intercept\t ', model.intercept_)

print(pd.Series(model.coef_, index=features))

Test MAE: 592.0555564164101 

Intercept	  3432.7534190069314
loft                     4.940963
swimming_pool            7.216925
exclusive               10.740470
balcony                  1.115794
outdoor_space          -28.337991
wheelchair_access       19.238219
terrace                 18.922734
dining_room             26.110976
pets_allowed            26.397336
pre-war                -34.446897
new_construction       -36.508813
fitness_center          31.342189
roof_deck              -44.467090
laundry_in_building    -41.258366
dishwasher              45.773369
hardwood_floors        -72.375870
elevator                93.924344
high_speed_internet    -73.447628
doorman                188.329471
bathrooms              519.235898
laundry_in_unit        132.553367
interest_level        -234.341445
longitude             -364.838457
bedrooms               555.570428
dtype: float64
