# Core Statistics Using Python
### Hana Choi, Simon Business School, University of Rochester


# Multiple Linear Regression in Python

## Topics covered

- House price example
- RFJ example

## Required packages

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# House Prices Example

## Load data

In [None]:
# Read the house-price data (hprices2.csv) into a DataFrame
hprices2_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/hprices2.csv"
hprices2 = pd.read_csv(hprices2_path)

# Preview the top of the table; as the cell's last expression it is displayed
hprices2.head()

## Histogram of bedrooms

In [None]:
# Plot a histogram of the bdrms column, add a title, and render the figure
hprices2['bdrms'].plot.hist()
plt.title('Histogram of Bedrooms')
plt.show()

## Regression analysis

In [None]:
# Baseline model: OLS of price on a single regressor (sqrft)
simple_spec = smf.ols(formula='price ~ sqrft', data=hprices2)
model_simple = simple_spec.fit()
print(model_simple.summary())

In [None]:
# Larger model: OLS of price on three regressors (sqrft, lotsize, bdrms)
multi_spec = smf.ols(formula='price ~ sqrft + lotsize + bdrms', data=hprices2)
model_multi = multi_spec.fit()
print(model_multi.summary())

In [None]:
# Print the estimated coefficients of both fits one after the other so the
# multiple-regression and simple-regression estimates can be compared
print("Coefficients for model with multiple X's:", model_multi.params, sep="\n")
print('----')
print("Coefficients for model with only sqrft:", model_simple.params, sep="\n")

## Heteroskedasticity-Robust Standard Error (HR SE)

In [None]:
# Two ways to obtain heteroskedasticity-robust (HC1) standard errors.
#
# Option 1 (left commented out): request the robust covariance when fitting.
# model_multi_HRse = smf.ols('price ~ sqrft + lotsize + bdrms', data=hprices2).fit(cov_type='HC1')

# Option 2 (used here): another way of getting HR SE is to reuse the
# already-fitted model_multi and ask it for results with an HC1 covariance,
# so the regression does not have to be specified again.
# NOTE(review): the two routes may report t- vs. z-based p-values and
# intervals in the summary — confirm against the statsmodels documentation
# if the distinction matters.
model_multi_HRse = model_multi.get_robustcov_results(cov_type='HC1') 
print(model_multi_HRse.summary())

## Constructing confidence and prediction intervals

- We will predict prices for two different house types

### House type 1: (sqrft=2000,lotsize=4000,bdrms=3)

In [None]:
# One-row DataFrame with the regressor values for house type 1; the column
# names are the regressors used in model_multi's formula.
new_data1 = pd.DataFrame({'sqrft': [2000], 'lotsize': [4000], 'bdrms': [3]})
# Predict the price of this house from the multiple regression fit
predictions1 = model_multi.get_prediction(new_data1)
# Last expression of the cell, so the notebook displays the resulting table
predictions1.summary_frame(alpha=0.05)  # 95% confidence and prediction intervals

### House type 2: (sqrft=3000,lotsize=5000,bdrms=4)

In [None]:
# One-row DataFrame with the regressor values for house type 2; the column
# names are the regressors used in model_multi's formula.
new_data2 = pd.DataFrame({'sqrft': [3000], 'lotsize': [5000], 'bdrms': [4]})
# Predict the price of this house from the multiple regression fit
predictions2 = model_multi.get_prediction(new_data2)
# Last expression of the cell, so the notebook displays the resulting table
predictions2.summary_frame(alpha=0.05)  # 95% confidence and prediction intervals

# Additional RFJ Example

- Same data as in the problem set, but I made the units easier to work with
- To make the interpretation easier, I have converted quantities to thousands of 64 oz containers sold (by dividing the original q1 by 1000 and dividing the result by 64).
- I also converted price to price per 64 oz container by multiplying the original prices by 64.
- This will not change any of our substantive conclusions, but it does make interpretation and discussion easier.


## Load data

In [None]:
# Load rfj_small.csv dataset
rfj_small = pd.read_csv("/Users/hanachoi/Dropbox/teaching/core_statistics/Data/rfj_small.csv")

# Display first few rows of the dataframe
rfj_small.head()

## Simple regression with one X

In [None]:
# First let's run the regression with only own price
model_rfj = smf.ols('q1 ~ p1', data=rfj_small).fit()
# Print only tables[1] of the summary (presumably the coefficient table)
# instead of the full summary output
print(model_rfj.summary().tables[1])
print('----')

# Get slope estimate and compute elasticity
# The slope on p1 is turned into an elasticity evaluated at the sample means:
#     elasticity = slope * mean(p1) / mean(q1)
slope_estimate = model_rfj.params['p1']
avg_price = rfj_small['p1'].mean()
avg_quantity = rfj_small['q1'].mean()
elasticity = slope_estimate * avg_price / avg_quantity
print("Price Elasticity:", elasticity)

## Multiple linear regression with many Xs

In [None]:
# Full regression: q1 on all three prices (p1, p2, p3)
model_rfj_multi = smf.ols('q1 ~ p1 + p2 + p3', data=rfj_small).fit()
coef_table = model_rfj_multi.summary().tables[1]
print(coef_table, '----', sep='\n')


# Pairwise correlations among the three price columns
price_columns = ['p1', 'p2', 'p3']
price_corr = rfj_small[price_columns].corr()
print("Correlation Matrix:", price_corr, sep='\n')