<a href="https://colab.research.google.com/github/rgolds5/DS-Unit-2-Linear-Models/blob/master/Module_2_Doing_Linear_Regression_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
[K     |████████████████████████████████| 92kB 3.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.0.0


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

model = LinearRegression()
encoder = ce.OneHotEncoder(use_cat_names = True)
scaler = StandardScaler()

data_url = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

df = pd.read_csv(data_url)
assert df.shape == (48300, 34)



In [0]:
df.created = pd.to_datetime(df.created, infer_datetime_format = True)
df.month = df.created.dt.month

train = df[df.month < 6]
test = df[df.month == 6]

train.price.mean()

  


3432.7534190068222

In [0]:
features = [
    'bathrooms',
    'bedrooms',
    'longitude',
    ]

target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_absolute_error(y_test, y_pred)

667.293314729857

In [0]:
features = [
    'bathrooms',
    'bedrooms',
    'doorman',
    'longitude',    
    'interest_level'
  ]

target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'Intercept: {model.intercept_}')
model.coef_
              

MAE: 606.4804449587706
Intercept: 3431.9438196502865


array([ 5.46283072e+02,  5.62402433e+02,  2.38591837e+02, -3.72466875e+02,
       -6.65512642e+15, -1.14270815e+16, -1.04346006e+16])

I ran the above code with multiple sets of feature combinations. The five features I have finished with, _bathrooms_, _bedrooms_, _longitude_, _interest_level_, and _doorman_ seemed to be the ones that gave me the most _bang for my buck_.

In [0]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha = 0.5)

features = [
    'bathrooms',
    'bedrooms',
    'doorman',
    'longitude',
    'interest_level'
      ]

target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]


X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'Intercept: {reg.intercept_}')
reg.coef_

MAE: 606.3352938459542
Intercept: 3432.7534190069337


array([ 543.16250263,  559.54176157,  237.21277821, -372.80510009,
       -110.29890353,  131.10550301,  -73.22753903])

Using ridge regression, I started the code with every feature and only seen an 11 point improvment in the MAE of the model. I then removed features one-by-one, each time removing the feature with the lowest absolute coefficient, seeing barely a change in the model each time, unless i removed one of the remaining features. With the same features as with OLS, Ridge regression showed no significant improvement.

In [0]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

encoder = ce.OneHotEncoder(use_cat_names = True)
scaler = StandardScaler()

features = [
    'balcony',
    'bathrooms',
    'bedrooms',
    'cats_allowed',
    'common_outdoor_space',
    'dining_room',
    'dishwasher',
    'doorman',
    'exclusive',
    'elevator',
    'fitness_center',
    'garden_patio',
    'hardwood_floors',
    'high_speed_internet',
    'latitude',
    'laundry_in_building',
    'laundry_in_unit',
    'loft',
    'longitude',
    'new_construction',
    'no_fee',
    'outdoor_space',
    'roof_deck',
    'swimming_pool',
    'terrace',
    'wheelchair_access',
    'interest_level',
       ]

target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

estimator = LinearRegression()
selector = RFE(estimator, 7, step = 1)
selector = selector.fit(X_train, y_train)


In [0]:
df_2 = (pd.DataFrame(list(zip(features, selector.support_, selector.ranking_)), 
                     columns = ['Feature', 'Selected Feature', 'Feature Ranking'])
        .sort_values(['Feature Ranking', 'Selected Feature']))
df_2.head(8)

Unnamed: 0,Feature,Selected Feature,Feature Ranking
1,bathrooms,True,1
2,bedrooms,True,1
7,doorman,True,1
18,longitude,True,1
26,interest_level,True,1
16,laundry_in_unit,False,2
13,high_speed_internet,False,3
9,elevator,False,4


In [0]:
#df_lat_long = df[['latitude', 'longitude']]


In [0]:
#from google.colab import files

#df_lat_long.to_csv('lat_long.csv')
#files.download('lat_long.csv')

In [3]:
lat_long_url = 'https://raw.githubusercontent.com/rgolds5/DS-Unit-2-Linear-Models/master/module2-doing-linear-regression/lat_long%20(1).csv'

df_lat_long = pd.read_csv('https://raw.githubusercontent.com/rgolds5/DS-Unit-2-Linear-Models/master/module2-doing-linear-regression/lat_long%20(1).csv')

df_lat_long.head()

Unnamed: 0,the_geom,cartodb_id,field_1,latitude,longitude,neighborhood
0,0101000020E6100000BE30992A187952C0265305A39262...,3070,3069,40.7701,-73.8921,Ditmars Steinway
1,0101000020E610000085EB51B81E7952C0ED9E3C2CD462...,3165,3164,40.7721,-73.8925,Ditmars Steinway
2,0101000020E61000009A081B9E5E7952C035EF38454762...,19413,19412,40.7678,-73.8964,Ditmars Steinway
3,0101000020E610000005C58F31777952C07CF2B0506B62...,31513,31512,40.7689,-73.8979,Ditmars Steinway
4,0101000020E610000013F241CF667952C0E0BE0E9C3362...,37568,37567,40.7672,-73.8969,Ditmars Steinway


In [4]:
df_nhood = df.merge(df_lat_long, on = ['latitude', 'longitude'])
          
df_nhood.head()

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,elevator,cats_allowed,hardwood_floors,dogs_allowed,doorman,dishwasher,no_fee,laundry_in_building,fitness_center,pre-war,laundry_in_unit,roof_deck,outdoor_space,dining_room,high_speed_internet,balcony,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access,the_geom,cartodb_id,field_1,neighborhood
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0101000020E6100000B81E85EB517C52C0FA7E6ABC745B...,1,0,Williamsburg
1,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0101000020E6100000B81E85EB517C52C0FA7E6ABC745B...,15473,15472,Williamsburg
2,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0101000020E6100000B81E85EB517C52C0FA7E6ABC745B...,21119,21118,Williamsburg
3,1.0,1,2016-06-14 15:32:30,Enjoy These Following Apartment Features As Yo...,Metropolitan Avenue,40.7145,-73.9425,2500,792 Metropolitan Avenue,low,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0101000020E6100000B81E85EB517C52C0FA7E6ABC745B...,1,0,Williamsburg
4,1.0,1,2016-06-14 15:32:30,Enjoy These Following Apartment Features As Yo...,Metropolitan Avenue,40.7145,-73.9425,2500,792 Metropolitan Avenue,low,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0101000020E6100000B81E85EB517C52C0FA7E6ABC745B...,15473,15472,Williamsburg


In [5]:
import pandas as pd

df_nhood.created = pd.to_datetime(df_nhood.created, infer_datetime_format = True)
df_nhood.month = df_nhood.created.dt.month

train = df_nhood[df_nhood.month < 6]
test = df_nhood[df_nhood.month == 6]

train.price.mean()

  after removing the cwd from sys.path.


3646.4900870275774

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

ce_hash = ce.HashingEncoder(cols = ['neighborhood'])
encoder = ce.OneHotEncoder(use_cat_names = True)
scaler = StandardScaler()

model = LinearRegression()

features = [
    'bathrooms',
    'bedrooms',
    'doorman',
    'interest_level',
    'neighborhood',
    'longitude'
  ]

target = 'price'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

X_train = ce_hash.fit_transform(X_train)
X_test = ce_hash.transform(X_test)

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'Intercept: {model.intercept_}')
model.coef_
             

MAE: 545.2087217484531
Intercept: 3646.0911539204367


array([ 3.11320843e+14,  2.88293012e+14,  8.58445430e+13,  1.46262929e+14,
        2.39886517e+14,  8.13741924e+13,  3.62501537e+14,  2.42331580e+14,
        5.52679688e+02,  5.35496094e+02,  8.89375000e+01,  6.86729893e+14,
        6.38468721e+14,  3.59591481e+14, -3.01125000e+02])