# Modeling

## Imports

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

## Load Data

In [2]:
df = pd.read_csv('../listings_ready.csv')

In [3]:
df.head()

Unnamed: 0,description,neighborhood_overview,host_about,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,latitude,...,Terra Nova,Tierrasanta,Tijuana River Valley,Torrey Pines,University City,Valencia Park,Webster,West University Heights,Wooded Area,Yosemite Dr
0,1,1,1,1.0,1.0,0,3.0,1,1,32.7843,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1.0,0.5,0,7.0,1,1,32.80724,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0.0,0.86,0,3.0,1,1,32.74202,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1.0,0.75,1,5.0,1,1,32.81301,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1.0,0.96,1,2.0,1,1,32.80734,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9268 entries, 0 to 9267
Columns: 217 entries, description to Yosemite Dr
dtypes: float64(12), int64(205)
memory usage: 15.3 MB


In [5]:
df.describe()

Unnamed: 0,description,neighborhood_overview,host_about,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,latitude,...,Terra Nova,Tierrasanta,Tijuana River Valley,Torrey Pines,University City,Valencia Park,Webster,West University Heights,Wooded Area,Yosemite Dr
count,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,...,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0,9268.0
mean,0.982844,0.717091,0.699612,0.833221,0.80146,0.412603,147.242016,0.997734,0.817976,32.769449,...,0.000647,0.001511,0.000324,0.001618,0.016077,0.001834,0.001079,0.018666,0.003776,0.000539
std,0.129859,0.450437,0.458452,0.346571,0.319024,0.492329,880.329771,0.04755,0.385885,0.06477,...,0.025437,0.038839,0.01799,0.0402,0.125778,0.042791,0.032832,0.135351,0.06134,0.023222
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.53251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.94,0.75,0.0,1.0,1.0,1.0,32.72728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,1.0,0.97,0.0,3.0,1.0,1.0,32.760985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,17.0,1.0,1.0,32.79845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,7584.0,1.0,1.0,33.08582,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Train/Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='price'), 
                                                    df.price, test_size=0.3, 
                                                    random_state=42)

In [7]:
X_train.shape, X_test.shape

((6487, 216), (2781, 216))

In [8]:
y_train.shape, y_test.shape

((6487,), (2781,))

In [9]:
X_train.dtypes

description                  int64
neighborhood_overview        int64
host_about                   int64
host_response_rate         float64
host_acceptance_rate       float64
                            ...   
Valencia Park                int64
Webster                      int64
West University Heights      int64
Wooded Area                  int64
Yosemite Dr                  int64
Length: 216, dtype: object

In [10]:
X_test.dtypes

description                  int64
neighborhood_overview        int64
host_about                   int64
host_response_rate         float64
host_acceptance_rate       float64
                            ...   
Valencia Park                int64
Webster                      int64
West University Heights      int64
Wooded Area                  int64
Yosemite Dr                  int64
Length: 216, dtype: object

In [11]:
#check to see if there is any missing data
df.isnull().sum().sort_values().head()

description        0
Del Mar Heights    0
East Lake          0
East Village       0
Eastlake Trails    0
dtype: int64

## Dummy Model

In [12]:
train_mean = y_train.mean()
train_mean

227.7963619546786

In [13]:
#initiate dummy regressor for baseline
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[227.79636195]])

In [14]:
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred

array([227.79636195, 227.79636195, 227.79636195, ..., 227.79636195,
       227.79636195, 227.79636195])

In [15]:
y_te_pred = train_mean * np.ones(len(y_test))

In [16]:
#R-Squared
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -0.0008742204571565715)

In [17]:
#Mean absolute error
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(168.5274535505418, 196.0639762084399)

In [18]:
#Mean squared error
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(408946.0807760354, 1026448.1131555447)

As expected, the dummy model did very poorly even on the training set

## Simple Linear Regression

In [36]:
#scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_scaled = scaler.transform(X_train)
X_te_scaled = scaler.transform(X_test)

In [37]:
#initialize linear regression model
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [43]:
#making prediction
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [44]:
#R-squareed
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.12862894701802297, -3.6656376633072083e+19)

In [45]:
#Mean absolute error - MAE
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(135.96310742674748, 335151440322.6142)

In [46]:
#Mean squared error
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(356343.77701866656, 3.759300405694329e+25)