In [1]:
# Let's do our imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


In [2]:
!pip install category_encoders



In [3]:
import category_encoders as ce

In [4]:
!pip install -U pandas-profiling

Requirement already up-to-date: pandas-profiling in c:\programdata\anaconda3\lib\site-packages (2.1.2)


In [5]:
# Now let's import our csv into a dataframe
url = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,...,high_speed_internet,balcony,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,40.7947,-73.9667,5465,808 Columbus Avenue,low,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,40.7388,-74.0018,2850,241 W 13 Street,high,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,40.7539,-73.9677,3275,333 East 49th Street,low,...,0,0,0,0,0,0,0,0,0,0
4,1.0,4,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,40.8241,-73.9493,3350,500 West 143rd Street,low,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df.shape # should be (48300, 34)

(48300, 34)

In [7]:
# 'created' should be in the correct format
df['created'] = pd.to_datetime(df['created'], infer_datetime_format=True)
df['created'].head()

0   2016-06-24 07:54:24
1   2016-06-12 12:19:27
2   2016-04-17 03:26:41
3   2016-04-18 02:22:02
4   2016-04-28 01:32:41
Name: created, dtype: datetime64[ns]

In [8]:
df['created'].sample(10)

6557    2016-04-06 02:26:43
40488   2016-05-29 01:21:04
46884   2016-04-24 03:54:37
40157   2016-05-08 02:28:27
22091   2016-06-11 01:23:55
41857   2016-04-02 06:03:35
12912   2016-06-16 06:52:31
5776    2016-04-04 02:13:30
15549   2016-06-13 02:17:47
22390   2016-06-16 06:35:14
Name: created, dtype: datetime64[ns]

In [9]:
# Now let's get a month column
df['month'] = df['created'].dt.month
df['month'].head()

0    6
1    6
2    4
3    4
4    4
Name: month, dtype: int64

In [10]:
df['month'].unique()

array([6, 4, 5], dtype=int64)

In [11]:
# We'll use months 4 & 5 to train and month 6 to test
train = df[df['month'] < 6]
test = df[df['month'] == 6]

In [12]:
# Let's make sure we didn't lose any data
train.shape[0] + test.shape[0]

48300

In [13]:
tpm = train['price'].mean()
tpm

3432.7534190068222

In [14]:
# Now let's get our baseline
ytest = test['price']
ypred = np.full_like(ytest, fill_value=tpm)
print(len(ytest), len(ypred))
print(ypred)
print(f"MAE: {mean_absolute_error(ytest, ypred)}")

16785 16785
[3432 3432 3432 ... 3432 3432 3432]
MAE: 1052.5193327375632


In [15]:
# Let's see all the features so we can choose some for our model
list(df)

['bathrooms',
 'bedrooms',
 'created',
 'description',
 'display_address',
 'latitude',
 'longitude',
 'price',
 'street_address',
 'interest_level',
 'elevator',
 'cats_allowed',
 'hardwood_floors',
 'dogs_allowed',
 'doorman',
 'dishwasher',
 'no_fee',
 'laundry_in_building',
 'fitness_center',
 'pre-war',
 'laundry_in_unit',
 'roof_deck',
 'outdoor_space',
 'dining_room',
 'high_speed_internet',
 'balcony',
 'swimming_pool',
 'new_construction',
 'exclusive',
 'terrace',
 'loft',
 'garden_patio',
 'common_outdoor_space',
 'wheelchair_access',
 'month']

In [16]:
# We'll go with the same number of features as the lecture
feats = [
    'bedrooms',
    'latitude',
    'doorman',
    'laundry_in_unit',
    'balcony'
]

target = 'price'
xtrain = train[feats]
ytrain = train[target]

xtest = test[feats]
ytest = test[target]

In [17]:
xtrain.shape, ytrain.shape

((31515, 5), (31515,))

In [18]:
# Now we'll do the one hot encoding
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain = encoder.fit_transform(xtrain)
xtest = encoder.fit_transform(xtest)

In [19]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.fit_transform(xtest)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [20]:
xtrain

array([[-0.45807512, -0.30435554, -0.85677923, -0.45019882, -0.25008427],
       [-0.45807512,  0.07968984, -0.85677923, -0.45019882, -0.25008427],
       [ 2.28697859,  1.86511936, -0.85677923, -0.45019882, -0.25008427],
       ...,
       [-0.45807512,  0.23737735,  1.16716181, -0.45019882, -0.25008427],
       [-1.37309302, -1.12331324,  1.16716181, -0.45019882, -0.25008427],
       [ 0.45694278,  3.02997224, -0.85677923, -0.45019882, -0.25008427]])

In [21]:
# Ok, now let's use our model
model = LinearRegression()
model.fit(xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
ypred = model.predict(xtest)
ypred

array([4144.83579264, 4107.28292324, 2443.38548612, ..., 2593.17835091,
       4105.36003665, 3852.96279607])

In [23]:
# Let's see the MAE
mean_absolute_error(ytest, ypred)

771.1981260334376

In [24]:
# Now the coefficient
model.coef_

array([791.33792681, -76.93359624, 413.77943184, 182.89486926,
        10.82489064])

In [25]:
# and the intercept
model.intercept_

3432.753419006835

In [26]:
# Now let's try some different features to improve our MAE
# We'll go with the same number of features as the lecture
feats = [
    'bedrooms',
    'outdoor_space',
    'bathrooms',
    'dining_room',
    'loft'
]

target = 'price'
xtrain = train[feats]
ytrain = train[target]

xtest = test[feats]
ytest = test[target]

In [27]:
# Now we'll do the one hot encoding
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain = encoder.fit_transform(xtrain)
xtest = encoder.fit_transform(xtest)

In [28]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.fit_transform(xtest)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [29]:
model = LinearRegression()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
ypred

array([4501.19200955, 3332.35744273, 2911.92962418, ..., 2911.92962418,
       3332.35744273, 3332.35744273])

In [30]:
# Now let's see if we get a better MAE
mean_absolute_error(ytest, ypred)

751.0437102311328

In [31]:
# Slightly better. Let's give it one more try
feats = [
    'bedrooms',
    'hardwood_floors',
    'bathrooms',
    'fitness_center',
    'swimming_pool'
]


target = 'price'
xtrain = train[feats]
ytrain = train[target]

xtest = test[feats]
ytest = test[target]

# Now we'll do the one hot encoding
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain = encoder.fit_transform(xtrain)
xtest = encoder.fit_transform(xtest)

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.fit_transform(xtest)

model = LinearRegression()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
ypred

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([4393.39286199, 3778.22781201, 2790.89574528, ..., 2790.89574528,
       3236.24310828, 3264.99906125])

In [32]:
# Now let's see if we get a better MAE
mean_absolute_error(ytest, ypred)

723.8899901597703

In [33]:
# Let's create an amenities feature that lets us know how many ameninities the listing has
# First, let's list the features we have 
list(df)

['bathrooms',
 'bedrooms',
 'created',
 'description',
 'display_address',
 'latitude',
 'longitude',
 'price',
 'street_address',
 'interest_level',
 'elevator',
 'cats_allowed',
 'hardwood_floors',
 'dogs_allowed',
 'doorman',
 'dishwasher',
 'no_fee',
 'laundry_in_building',
 'fitness_center',
 'pre-war',
 'laundry_in_unit',
 'roof_deck',
 'outdoor_space',
 'dining_room',
 'high_speed_internet',
 'balcony',
 'swimming_pool',
 'new_construction',
 'exclusive',
 'terrace',
 'loft',
 'garden_patio',
 'common_outdoor_space',
 'wheelchair_access',
 'month']

In [34]:
# Now let's make our amenities list and check their format
amlist = ['elevator', 'doorman', 'laundry_in_building', 'fitness_center', 'laundry_in_unit', 'high_speed_internet', 
          'swimming_pool', 'common_outdoor_space', 'roof_deck']
df[amlist].head()

Unnamed: 0,elevator,doorman,laundry_in_building,fitness_center,laundry_in_unit,high_speed_internet,swimming_pool,common_outdoor_space,roof_deck
0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0


In [35]:
# Let's add up the ameninities
df['amenities'] = df[amlist].sum(axis=1, skipna=True)
df['amenities'].head()

0    0
1    3
2    1
3    0
4    0
Name: amenities, dtype: int64

In [36]:
# Let's take a sample
df.sample(5)

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,...,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access,month,amenities
26642,2.0,2,2016-05-04 05:29:33,BEAUTIFUL 2BED 2 Bath with W/D in unit!!! Will...,North 10th St.,40.7176,-73.9531,5213,250 North 10th St.,low,...,0,1,0,0,0,0,0,0,5,6
5042,1.0,1,2016-04-17 02:43:59,*ACTUAL PHOTOS OF APT*\r\rGenerously sized 1 B...,E 56 Street,40.7575,-73.9626,3400,405 E 56 Street,low,...,0,0,0,0,0,0,0,0,4,2
10885,1.0,3,2016-06-09 04:21:51,This beautiful renovated 3 bedroom 2 bath apar...,Lexington Avenue,40.7443,-73.9809,5100,175 Lexington Avenue,medium,...,0,0,0,0,0,0,0,0,6,1
38061,2.0,2,2016-05-26 04:20:42,HUGE 1550sqft 2 BED 2 BATH JUST HIT THE MARKET...,E 58th St.,40.7606,-73.966,7000,245 E 58th St.,low,...,1,0,0,1,0,1,0,0,5,5
1387,1.0,1,2016-04-29 03:28:12,Heaven on the Hudson! This phenomenal one-bed...,W 37th St.,40.7568,-73.9982,3200,505 W 37th St.,low,...,1,0,0,1,0,0,0,0,4,5


In [37]:
# We need to refresh some values since we added a feature
train = df[df['month'] < 6]
test = df[df['month'] == 6]
tpm = train['price'].mean()
tpm

3432.7534190068222

In [38]:
# Now let's use amenities to hopefully give usour best MAE
feats = [
    'bedrooms',
    'hardwood_floors',
    'bathrooms',
    'latitude',    'amenities']


target = 'price'
xtrain = train[feats]
ytrain = train[target]

xtest = test[feats]
ytest = test[target]

# Now we'll do the one hot encoding
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain = encoder.fit_transform(xtrain)
xtest = encoder.fit_transform(xtest)

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.fit_transform(xtest)

model = LinearRegression()
model.fit(xtrain, ytrain)
ypred = model.predict(xtest)
ypred

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([4367.72364474, 3593.64119158, 2518.80639551, ..., 2670.41974998,
       3244.26303403, 3439.23510759])

In [39]:
# Now let's get our MAE
mean_absolute_error(ytest, ypred)

707.1581075770334