<a href="https://colab.research.google.com/github/ewuerfel66/DS-Unit-2-Linear-Models/blob/master/UnderstandingLinearRegression_EricWuerfel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression with SciKitLearn

## Imports

In [0]:
# !pip install category_encoders

In [0]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import category_encoders as ce

In [0]:
LOCAL = '../data/nyc/nyc-rent-2016.csv'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/nyc/nyc-rent-2016.csv'

df = pd.read_csv(WEB)
assert df.shape == (48300, 34)

In [0]:
df.head()

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,elevator,cats_allowed,hardwood_floors,dogs_allowed,doorman,dishwasher,no_fee,laundry_in_building,fitness_center,pre-war,laundry_in_unit,roof_deck,outdoor_space,dining_room,high_speed_internet,balcony,swimming_pool,new_construction,exclusive,terrace,loft,garden_patio,common_outdoor_space,wheelchair_access
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,medium,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,40.7947,-73.9667,5465,808 Columbus Avenue,low,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.0,1,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,40.7388,-74.0018,2850,241 W 13 Street,high,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.0,1,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,40.7539,-73.9677,3275,333 East 49th Street,low,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.0,4,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,40.8241,-73.9493,3350,500 West 143rd Street,low,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Train/Test Split

In [0]:
df['created'] = pd.to_datetime(df['created'], infer_datetime_format=True)

In [0]:
# Extract the month that the apartment listing was created
df['month'] = df['created'].dt.month

In [0]:
# Here's the actual split into TRAIN/TEST
train = df.query('month < 6')
test = df.query('month == 6')

In [0]:
train.shape, test.shape, df.shape

((31515, 35), (16785, 35), (48300, 35))

## Baseline Model

In [0]:
# A baseline regression
train['price'].median()

3100.0

In [0]:
# How far off are our baseline predictions on average?
y_test = test['price']
# y_pred = train['price'].median() * len(test)
y_pred = np.full_like(y_test, fill_value=train['price'].median())

In [0]:
# lots of room for improvement
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

MAE: 1020.0558236520703
MSE: 2101068.4048257372
RMSE: 1449.5062624306725
R^2: -0.06142924192681343


## 2 Feature Linear Regression

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [0]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

MAE: 754.5508722934989
MSE: 1066040.6702140642
RMSE: 1032.4924552819086
R^2: 0.4614517367213872


In [0]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [ 426.71051253 1499.03032982]
Intercept: 1022.9589927686366


## 3D Plot

* The html file should appear in the 'Files' tab in CoLab
* You may have to download the file to view it

In [0]:
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [0]:
trace = go.Mesh3d(
    x=test[features[0]],
    y=test[features[1]],
    z=y_pred,
    opacity=0.8
    )

In [0]:
plot([trace])

'file:///content/temp-plot.html'

## Multiple Linear Regression

* Features
  * Bedrooms, Bathrooms, Fitness Center, Interest Level, Longitude
* One-Hot Encoding

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms', 'fitness_center', 'interest_level', 'longitude']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# One-Hot Encoder
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [0]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

MAE: 624.0727086075918
MSE: 769963.4552933192
RMSE: 877.4756152129353
R^2: 0.6110256454353178


In [0]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [   488.4683177    1323.94778987    312.17629928   -329.5442508
    397.79431216    -68.25006136 -13553.01162863]
Intercept: -1001725.2550012637


## Amenities feature

In [0]:
df['amenities'] = True
df['amenities'] = (df['elevator']+
                   df['cats_allowed']+
                   df['dogs_allowed']+
                   df['hardwood_floors']+
                   df['doorman']+
                   df['dishwasher']+
                   df['no_fee']+
                   df['laundry_in_building']+
                   df['fitness_center']+
                   df['laundry_in_unit']+
                   df['roof_deck']+
                   df['outdoor_space']+
                   df['dining_room']+
                   df['high_speed_internet']+
                   df['balcony']+
                   df['swimming_pool']+
                   df['new_construction']+
                   df['terrace'])

**gotta redo the train/test split now**

In [0]:
# Here's the actual split into TRAIN/TEST
train = df.query('month < 6')
test = df.query('month == 6')

In [0]:
train.shape, test.shape, df.shape

((31515, 36), (16785, 36), (48300, 36))

Now the model

In [0]:
# Select features and target
features = ['bedrooms', 'bathrooms', 'amenities', 'longitude', 'interest_level']
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# One-Hot Encoder
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [0]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

MAE: 617.6979910224458
MSE: 762460.2451447417
RMSE: 873.1896959680306
R^2: 0.6148161582247778


In [0]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [   478.23651654   1289.04776435     55.12684404 -13101.25251737
   -337.28284225    420.15052098    -82.86767873]
Intercept: -968416.2947589218


## Compare R^2 Values for Different Features

### *Best Model*

* Created a linear model for each feature
* Only kept features whose model's R^2 > 0.01
* Created the final model with only those features

In [0]:
r2_scores = []

In [0]:
def LinModel_1d(feature):
  # Select features and target
  features = [feature]
  target = 'price'

  # Create train and test dataframes/vectors
  X_train = train[features]
  y_train = train[target]

  X_test = test[features]
  y_test = test[target]

  # Create & fit the model
  model = LinearRegression()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  
  # Return the R^2 Value
  r2_scores.append((feature, r2_score(y_test, y_pred)))

In [0]:
# Select all of the numerical features
my_feats = ['bedrooms', 'bathrooms']

# Amenities
for x in range(10, 34):
  my_feats.append(all_features[x])

# Longitude / Latitude
for x in range(5, 7):
  my_feats.append(all_features[x])

In [0]:
# Create a model for each numerical feature
for feature in my_feats:
  LinModel_1d(feature)

In [0]:
# Only keep those features whose R^2 is greater than 0.01
for score in r2_scores:
  if score[1] < 0.01:
    r2_scores.remove(score)

In [0]:
# Our final features
features = []

for score in r2_scores:
  features.append(score[0])
  
features.append('interest_level')

In [0]:
# Select features and target
target = 'price'

# Create train and test dataframes/vectors
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

# One-Hot Encoder
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Create & fit the model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [0]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

MAE: 593.2060868641156
MSE: 715189.646032596
RMSE: 845.6888588793139
R^2: 0.6386965783318959


In [0]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [   513.04668314   1209.60748244    179.19445646   -129.49229121
    392.67248037    110.72722029    -75.07939851     71.15325928
    353.81171068   -134.58747844   -100.60150455     99.53590305
   -281.80257601     24.76409322     33.50965178     71.76950918
    112.56188005    -20.63421386     85.32446476 -12297.2899395
   -300.87105917    370.52226469    -69.65120552]
Intercept: -908901.2225445384
