<a href="https://colab.research.google.com/github/ewuerfel66/DS-Unit-2-Linear-Models/blob/master/LinearRegression_EricWuerfel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression with SciKitLearn

## Imports

In [0]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import category_encoders as ce

In [0]:
LOCAL = '../data/nyc/nyc-rent-2016.csv'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

df = pd.read_csv(WEB)
assert df.shape == (48300, 34)

In [140]:
df.head()

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,elevator,cats_allowed,hardwood_floors,dogs_allowed,doorman,dishwasher,no_fee,laundry_in_building,fitness_center,pre-war,laundry_in_unit,roof_deck,outdoor_space,dining_room,high_speed_internet,balcony,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,40.7947,-73.9667,5465,808 Columbus Avenue,low,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.0,1,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,40.7388,-74.0018,2850,241 W 13 Street,high,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.0,1,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,40.7539,-73.9677,3275,333 East 49th Street,low,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,4,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,40.8241,-73.9493,3350,500 West 143rd Street,low,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Train/Test Split

In [0]:
df['created'] = pd.to_datetime(df['created'], infer_datetime_format=True)

In [0]:
# Extract the month that the apartment listing was created
df['month'] = df['created'].dt.month

In [0]:
# Here's the actual split into TRAIN/TEST
train = df.query('month < 6')
test = df.query('month == 6')

In [144]:
train.shape, test.shape, df.shape

((31515, 35), (16785, 35), (48300, 35))

## Baseline Model

**MAE: $1020.06**

In [145]:
# A baseline regression
train['price'].median()

3100.0

In [0]:
# How far off are our baseline predictions on average?
y_test = test['price']
# y_pred = train['price'].median() * len(test)
y_pred = np.full_like(y_test, fill_value=train['price'].median())

In [147]:
# lots of room for improvement
mean_absolute_error(y_test, y_pred)

1020.0558236520703

## Linear Regression

**MAE: $624.07**

* Several Features
* One-Hot Encoding

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms', 'fitness_center', 'interest_level', 'longitude']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# One-Hot Encoder
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [149]:
mean_absolute_error(y_test, y_pred)

624.0727086075918

In [150]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [   488.4683177    1323.94778987    312.17629928   -329.5442508
    397.79431216    -68.25006136 -13553.01162863]
Intercept: -1001725.2550012637


## With PCA

**MAE: $674.33**

(not worth it)

In [0]:
# Imports
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms', 'fitness_center', 'longitude']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# Encode Data
# encoder = ce.OneHotEncoder(use_cat_names=True)
# X_train = encoder.fit_transform(X_train)
# X_test = encoder.transform(X_test)

#features = ['bedrooms', 'fitness_center', 'interest_level_high', 'interest_level_medium', 'interest_level_low', 'longitude']

In [153]:
df.shape, X_train.shape, X_test.shape

((48300, 35), (31515, 4), (16785, 4))

In [0]:
# Get the data ready to be Standardized
X_train = X_train.loc[:, features].values
X_test = X_test.loc[:, features].values

In [0]:
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [0]:
X_train = pd.DataFrame(data = X_train, columns = features)
X_test = pd.DataFrame(data = X_test, columns = features)

In [0]:
# Do some PCA
pca = PCA(n_components=2)
train_PCs = pca.fit_transform(X_train)
test_PCs = pca.fit_transform(X_test)

In [0]:
train_PCdf = pd.DataFrame(data = train_PCs, columns = ['PC 1', 'PC 2'])
test_PCdf = pd.DataFrame(data = test_PCs, columns = ['PC 1', 'PC 2'])

In [159]:
train_PCdf.shape, test_PCdf.shape

((31515, 2), (16785, 2))

In [0]:
# Create & fit the model
model = LinearRegression()
model.fit(train_PCdf, y_train)
y_pred = model.predict(test_PCdf)

In [161]:
mean_absolute_error(y_test, y_pred)

674.3324627315778

## Amenities feature

**MAE: $653.52**

In [0]:
df['amenities'] = True
df['amenities'] = (df['elevator']+
                   df['cats_allowed']+
                   df['dogs_allowed']+
                   df['hardwood_floors']+
                   df['doorman']+
                   df['dishwasher']+
                   df['no_fee']+
                   df['laundry_in_building']+
                   df['fitness_center']+
                   df['laundry_in_unit']+
                   df['roof_deck']+
                   df['outdoor_space']+
                   df['dining_room']+
                   df['high_speed_internet']+
                   df['balcony']+
                   df['swimming_pool']+
                   df['new_construction']+
                   df['terrace'])

In [163]:
df.head()

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,elevator,cats_allowed,hardwood_floors,dogs_allowed,doorman,dishwasher,no_fee,laundry_in_building,fitness_center,pre-war,laundry_in_unit,roof_deck,outdoor_space,dining_room,high_speed_internet,balcony,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access,month,amenities
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0
1,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,40.7947,-73.9667,5465,808 Columbus Avenue,low,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,5
2,1.0,1,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,40.7388,-74.0018,2850,241 W 13 Street,high,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3
3,1.0,1,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,40.7539,-73.9677,3275,333 East 49th Street,low,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2
4,1.0,4,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,40.8241,-73.9493,3350,500 West 143rd Street,low,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0


**gotta redo the train/test split now**

In [0]:
# Here's the actual split into TRAIN/TEST
train = df.query('month < 6')
test = df.query('month == 6')

In [165]:
train.shape, test.shape, df.shape

((31515, 36), (16785, 36), (48300, 36))

Now the model

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms', 'amenities', 'longitude']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [167]:
mean_absolute_error(y_test, y_pred)

653.5166391920351