# UW - Regression Course - WK2 Assignment 1 - Multiple Regression 

###### Load Libraries 

In [119]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import log

###### Load Dataset 

###### 1.  Load in the house data 

In [48]:
df=pd.read_csv('home_data.csv')

###### 2.Split into training and test data Use this command to set the same seed for everyone 

In [49]:
train_data, test_data=train_test_split(df, test_size=0.2, random_state=0)

###### 3. Although we often think of multiple regression as including multiple different features (e.g. # of bedrooms, square feet, and # of bathrooms) but we can also consider transformations of existing variables e.g. the log of the square feet or even "interaction" variables such as the product of bedrooms and bathrooms. Add 4 new variables in both your train_data and test_data. 
•‘bedrooms_squared’ = ‘bedrooms’*‘bedrooms’
•‘bed_bath_rooms’ = ‘bedrooms’*‘bathrooms’
•‘log_sqft_living’ = log(‘sqft_living’)
•‘lat_plus_long’ = ‘lat’ + ‘long’
 

In [51]:
# Computing bedrooms_squared using .apply and lambda
# Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) 
# since 1^2 = 1 but 4^2 = 16. 
# Consequently this feature will mostly affect houses with many bedrooms.

In [52]:
df['bedrooms_squared']=df.bedrooms.apply(lambda x:x**2)
df['bedrooms_squared'].head()

0     9
1     9
2     4
3    16
4     9
Name: bedrooms_squared, dtype: int64

In [53]:
# Computing bed_bath_rooms 
# Defining a function that passes bedrooms and bathrooms as arguments
# Returns the product of bedrooms and bathrooms

# bedrooms times bathrooms gives what's called an "interaction" feature. It is large when both of them are large.

In [54]:
def multiply(bedbath):
    bedrooms,bathrooms=bedbath
    return bedrooms*bathrooms

In [55]:
df['bed_bath_rooms']=df[['bedrooms','bathrooms']].apply(multiply, axis=1)
df['bed_bath_rooms'].head()

0     3.00
1     6.75
2     2.00
3    12.00
4     6.00
Name: bed_bath_rooms, dtype: float64

In [56]:
# Computing log_sqft_living using .apply and lambda
# Taking the log of squarefeet has the effect of bringing large values closer together and spreading out small values.

In [57]:
df['log_sqft_living']=df['sqft_living'].apply(lambda x:log(x))
df['log_sqft_living'].head()

0    7.073270
1    7.851661
2    6.646391
3    7.580700
4    7.426549
Name: log_sqft_living, dtype: float64

In [58]:
# Computing lat_plus_long 
# Defining a function that passes lat and long as arguments
# Returns the sum of lat and long
# Adding latitude to longitude is totally non-sensical but we will do it anyway

In [59]:
def addlatlong(latlong):
    lat,long=latlong
    return lat+long

In [60]:
df['lat_plus_long']=df[['lat','long']].apply(addlatlong, axis=1)
df['lat_plus_long'].head()

0   -74.7458
1   -74.5980
2   -74.4951
3   -74.8722
4   -74.4282
Name: lat_plus_long, dtype: float64

###### 4. What are the mean (arithmetic average) values of your 4 new variables on TEST data? (round to 2 digits) 

In [61]:
#Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) 
    #since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.
#Bedrooms times bathrooms is what's called an "interaction" variable. It is large when both of them are large.
#Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.
#Adding latitude to longitude is non-sensical but we will do it anyway 


print("Mean of bedrooms squared: %.2f" %np.mean(df['bedrooms_squared']))
print("Mean of bed_bath_rooms: %.2f" %np.mean(df['bed_bath_rooms']))
print("Mean of log_sqft_living: %.2f" %np.mean(df['log_sqft_living']))
print("Mean of lat_plus_long: %.2f" %np.mean(df['lat_plus_long']))

Mean of bedrooms squared: 12.23
Mean of bed_bath_rooms: 7.50
Mean of log_sqft_living: 7.55
Mean of lat_plus_long: -74.65


###### 5. Use any regression library/function to estimate the regression coefficients/weights for predicting ‘price’ for the following three models:(In all 3 models include an intercept -- most software does this by default). 

Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’

Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’

Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’


In [62]:
# Redo train test split since we add variables to out dataset
train_data, test_data=train_test_split(df, test_size=0.2, random_state=0)

In [63]:
# Defining features for 3 models
model_1_features= ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features= model_1_features+['bed_bath_rooms']
model_3_features= model_2_features+['bedrooms_squared','log_sqft_living','lat_plus_long']

In [64]:
# Building a function to perform multiple linear regression
# Return the intercept and slope
# Input the X_features and Y_Observed in training set
def MultipleLinearReg(feature,output):
    feature=feature
    output=output.reshape(-1,1)
    model=linear_model.LinearRegression()
    model.fit(feature,output)
    intercept=model.intercept_
    coef=model.coef_
    return(intercept,coef) 

In [98]:
lreg1=MultipleLinearReg(train_data[model_1_features],train_data.price)
model_1_intercept, model_1_slope=lreg1

In [99]:
lreg2=MultipleLinearReg(train_data[model_2_features],train_data.price)
model_2_intercept, model_2_slope=lreg2

In [100]:
lreg3=MultipleLinearReg(train_data[model_3_features],train_data.price)
model_3_intercept, model_3_slope=lreg3

In [102]:
print("Model 1: Intercept %.2f" %model_1_intercept)
print("Model 1: Slope ", model_1_slope)
print("Model 2: Intercept %.2f" %model_2_intercept)
print("Model 2: Slope ", model_2_slope)
print("Model 3: Intercept %.2f" %model_3_intercept)
print("Model 3: Slope ", model_3_slope)

Model 1: Intercept -70870847.44
Model 1: Slope  [[  3.12942011e+02  -5.30962684e+04   1.47770422e+04   6.53983345e+05
   -3.25707345e+05]]
Model 2: Intercept -68606821.79
Model 2: Slope  [[  3.06819574e+02  -1.04604713e+05  -7.01815223e+04   6.50590954e+05
   -3.09965761e+05   2.49441476e+04]]
Model 3: Intercept -62628451.68
Model 3: Slope  [[  5.37808087e+02   2.78048472e+03   1.01363772e+05   5.30798411e+05
   -4.09655443e+05  -1.81822573e+04   7.24579880e+02  -5.71030023e+05
    1.21142968e+05]]


###### 6. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1? 

###### 7. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2? 

###### 8. Is the sign for the coefficient the same in both models? Think about why this might be the case. 

###### 9. Now using your three estimated models compute the RSS (Residual Sum of Squares) on the Training data 

###### 10. Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?

###### 11. Now using your three estimated models compute the RSS on the Testing data

###### 12. Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?

###### 13. Did you get the same answer for 9 and 11? Think about why this might be the case.


In [111]:
# Buliding functions to pred and calculate RSS
def get_residual_sum_of_squares(feature, output):
        feature=np.array(feature)
        output=output.reshape(-1,1)
        lreg=linear_model.LinearRegression()
        lreg.fit(feature,output)
        pred=lreg.predict(feature)
        RMSE=np.sqrt(mean_squared_error(pred,output))
        return(RMSE)

In [117]:
#RMSE of training dataset
Model_1_Train_RMSE=get_residual_sum_of_squares(train_data[model_1_features],train_data.price)
Model_2_Train_RMSE=get_residual_sum_of_squares(train_data[model_2_features],train_data.price)
Model_3_Train_RMSE=get_residual_sum_of_squares(train_data[model_3_features],train_data.price)

#RMSE of testing dataset
Model_1_Test_RMSE=get_residual_sum_of_squares(test_data[model_1_features],test_data.price)
Model_2_Test_RMSE=get_residual_sum_of_squares(test_data[model_2_features],test_data.price)
Model_3_Test_RMSE=get_residual_sum_of_squares(test_data[model_3_features],test_data.price)


In [118]:
print("Model 1 Training RMSE: %.2f" %Model_1_Train_RMSE)
print("Model 2 Training RMSE: %.2f" %Model_2_Train_RMSE)
print("Model 3 Training RMSE: %.2f" %Model_3_Train_RMSE)
print("Model 1 Test RMSE: %.2f" %Model_1_Test_RMSE)
print("Model 2 Test RMSE: %.2f" %Model_2_Test_RMSE)
print("Model 3 Test RMSE: %.2f" %Model_3_Test_RMSE)

Model 1 Training RMSE: 238056.96
Model 2 Training RMSE: 236955.73
Model 3 Training RMSE: 229875.82
Model 1 Test RMSE: 221066.48
Model 2 Test RMSE: 219737.17
Model 3 Test RMSE: 215667.99
