<a href="https://colab.research.google.com/github/JenBanks8585/DS-Unit-2-Linear-Models/blob/master/module2-regression-2/2.2_Assignments_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
%%capture
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Applied-Modeling/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'
    
# Ignore this Numpy warning when using Plotly Express:
# FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning, module='numpy')

In [149]:
import numpy as np
import pandas as pd

# Read New York City apartment rental listing data
df = pd.read_csv(DATA_PATH+'apartments/renthop-nyc.csv', parse_dates=['created'])#, index_col='created')
assert df.shape == (49352, 34)

# Remove the most extreme 1% prices,
# the most extreme .1% latitudes, &
# the most extreme .1% longitudes
df = df[(df['price'] >= np.percentile(df['price'], 0.5)) & 
        (df['price'] <= np.percentile(df['price'], 99.5)) & 
        (df['latitude'] >= np.percentile(df['latitude'], 0.05)) & 
        (df['latitude'] < np.percentile(df['latitude'], 99.95)) &
        (df['longitude'] >= np.percentile(df['longitude'], 0.05)) & 
        (df['longitude'] <= np.percentile(df['longitude'], 99.95))]

In [150]:
df.columns

Index(['bathrooms', 'bedrooms', 'created', 'description', 'display_address',
       'latitude', 'longitude', 'price', 'street_address', 'interest_level',
       'elevator', 'cats_allowed', 'hardwood_floors', 'dogs_allowed',
       'doorman', 'dishwasher', 'no_fee', 'laundry_in_building',
       'fitness_center', 'pre-war', 'laundry_in_unit', 'roof_deck',
       'outdoor_space', 'dining_room', 'high_speed_internet', 'balcony',
       'swimming_pool', 'new_construction', 'terrace', 'exclusive', 'loft',
       'garden_patio', 'wheelchair_access', 'common_outdoor_space'],
      dtype='object')

In [None]:
df.info()

## 1. Engineer Features

In [None]:
#df['apt_desc_yes_or_no'] = df['description'].apply(lambda x: 0 if df.loc[df['description']== np.NaN] else 1 )

In [18]:
#cond = df['description']== np.NaN
#df.loc[df['description']== np.NaN,'apt_desc_yes_or_no' ] = 0


In [151]:
# Adding description length
df['apt_desc_len'] = len(df['description'])

# All perks
df['perks'] = df.iloc[:, 9:-2].sum(axis = 1)

# dogs and cats
df['dogs and cats'] = df['dogs_allowed'] + df['cats_allowed']

df['dogs or cats'] = [1 if x == 1 else 0 for x in df['dogs and cats']]
df['dogs_and_cats'] = [1 if x == 2 else 0 for x in df['dogs and cats']]
df['pets'] = [0 if x == 0 else 1 for x in df['dogs and cats']]
df= df.drop('dogs and cats', axis= 1)

# Total rooms
df['total_rooms'] = df['bedrooms'] + df['bathrooms']

# Room ratio
df['bath_per_bed'] = df['bathrooms']/ df['bedrooms'] 

# Add month colum
df['month'] = df['created'].dt.month

df = df.replace({'interest_level':{"medium": 2, "low": 1, "high": 3}})


df.head(2)

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,latitude,longitude,price,street_address,interest_level,elevator,cats_allowed,hardwood_floors,dogs_allowed,doorman,dishwasher,no_fee,laundry_in_building,fitness_center,pre-war,laundry_in_unit,roof_deck,outdoor_space,dining_room,high_speed_internet,balcony,swimming_pool,new_construction,terrace,exclusive,loft,garden_patio,wheelchair_access,common_outdoor_space,apt_desc_len,perks,dogs or cats,dogs_and_cats,pets,total_rooms,bath_per_bed,month
0,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,40.7145,-73.9425,3000,792 Metropolitan Avenue,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48817,0,0,0,0,4.5,0.5,6
1,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,40.7947,-73.9667,5465,808 Columbus Avenue,1,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48817,5,0,1,1,3.0,0.5,6


## Train/Test Split

In [None]:
df = df.dropna()
df.isnull().sum()

In [155]:
mask = df.month == 6
train = df.loc[mask]
test = df.loc[~mask]

train.shape, test.shape

((16402, 42), (30707, 42))

## Split feature and target matrices

In [156]:
target = 'price'
y_train = train[target]
X_train = train.drop(['price', 'created', 'description', 'display_address', 'street_address'], axis = 1)

y_test = test[target]
X_test = test.drop(['price', 'created', 'description', 'display_address', 'street_address'], axis = 1)


print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(16402, 37) (16402,)
(30707, 37) (30707,)


## Baseline

In [157]:
baseline = y_train.mean()
print(f'baseline price is: ${baseline}')

baseline price is: $3588.801731496159


In [158]:
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

In [159]:
# baseline metrics

y_pred = [baseline] * len(y_train)
baseline_MAE = mean_absolute_error(y_train, y_pred)
baseline_R2 = r2_score(y_train, y_pred)
baseline_MSE = mean_squared_error(y_train, y_pred)
print(f'Baseline MAE: ${baseline_MAE}')
print(f'Baseline R2: ${baseline_R2}')
print(f'Baseline RMSE: ${baseline_MSE}')

Baseline MAE: $1204.6323797195153
Baseline R2: $0.0
Baseline RMSE: $3123768.66377459


## Build a Model

In [160]:
X_train.columns

Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'interest_level',
       'elevator', 'cats_allowed', 'hardwood_floors', 'dogs_allowed',
       'doorman', 'dishwasher', 'no_fee', 'laundry_in_building',
       'fitness_center', 'pre-war', 'laundry_in_unit', 'roof_deck',
       'outdoor_space', 'dining_room', 'high_speed_internet', 'balcony',
       'swimming_pool', 'new_construction', 'terrace', 'exclusive', 'loft',
       'garden_patio', 'wheelchair_access', 'common_outdoor_space',
       'apt_desc_len', 'perks', 'dogs or cats', 'dogs_and_cats', 'pets',
       'total_rooms', 'bath_per_bed', 'month'],
      dtype='object')

In [161]:
from sklearn.linear_model import LinearRegression

model_all = LinearRegression()

model_all.fit(X_train.iloc[:, 0:-3], y_train)

pred_all = model_all.predict(X_test.iloc[:, 0:-3])

In [142]:
len(pred_all)

30707

In [143]:
len(y_test)

30707

## Metrics

In [162]:
# test metrics

#y_test_pred = [pred_all] * len(y_test)
test_MAE = mean_absolute_error(y_test, pred_all)
test_R2 = r2_score(y_test, pred_all)
test_MSE = mean_squared_error(y_test, pred_all)
print(f'Test MAE: ${test_MAE}')
print(f'Test R2: ${test_R2}')
print(f'Test RMSE: ${test_MSE}')

Test MAE: $678.3709491276674
Test R2: $0.6384645209125762
Test RMSE: $1125446.3657726229


In [165]:
train_pred = model_all.predict(X_train.iloc[:, 0:-3])
train_R2 = r2_score(y_train, train_pred)
train_R2

0.6579577765091398

In [166]:
train_MAE = mean_absolute_error(y_train, train_pred)
train_MAE

674.8188549795757