## Dummy Variable 測試

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import IPython as ip
mpl.style.use('ggplot')
mpl.rc('font', family='Noto Sans CJK TC')
ip.display.set_matplotlib_formats('svg')

In [2]:
df = pd.read_csv('data/APPENC07.csv')

In [3]:
df['Quality'].replace({1:3, 3:1}, inplace=True) #inplace=True才會寫入

In [4]:
df.head()

Unnamed: 0,id,sales price,Finished square feet,Number of bedrooms,Number of bathrooms,Air conditioning,Garage size,Pool,Year built,Quality,Style,Lot size,Adjacent to highway
0,1,360000,3032,4,4,1,2,0,1972,2,1,22221,0
1,2,340000,2058,4,2,1,2,0,1976,2,1,22912,0
2,3,250000,1780,4,3,1,2,0,1980,2,1,21345,0
3,4,205500,1638,4,2,1,2,0,1963,2,1,17342,0
4,5,275500,2196,4,3,1,2,0,1968,2,7,21786,0


In [5]:
#將三個應該是類別變數的轉為類別
df["Quality"] = df["Quality"].astype("category")
df["Style"] = df["Style"].astype("category")

df["id"] = df["id"].astype("category")

In [6]:
#複製一組資料集來做下列的運算，避免做錯被覆蓋
df2 = df.copy() #整份資料要copy成另一份時

In [7]:
df2['Quality'].value_counts() #確認一下Quality的次數分配

2    290
1    164
3     68
Name: Quality, dtype: int64

In [8]:
dummies = pd.get_dummies(df['Quality']).rename(columns=lambda x: 'Quality_' + str(x))
df2 = pd.concat([df2, dummies], axis=1)

# Quality_1 = low
# Quality_2 = medium
# Quality_3 = high

In [11]:
df2.head()

Unnamed: 0,id,sales price,Finished square feet,Number of bedrooms,Number of bathrooms,Air conditioning,Garage size,Pool,Year built,Quality,Style,Lot size,Adjacent to highway,Quality_1,Quality_2,Quality_3,Quality_1.1,Quality_2.1,Quality_3.1
0,1,360000,3032,4,4,1,2,0,1972,2,1,22221,0,0,1,0,0,1,0
1,2,340000,2058,4,2,1,2,0,1976,2,1,22912,0,0,1,0,0,1,0
2,3,250000,1780,4,3,1,2,0,1980,2,1,21345,0,0,1,0,0,1,0
3,4,205500,1638,4,2,1,2,0,1963,2,1,17342,0,0,1,0,0,1,0
4,5,275500,2196,4,3,1,2,0,1968,2,7,21786,0,0,1,0,0,1,0


In [9]:
dummies1 = pd.get_dummies(df['Style']).rename(columns=lambda x: 'Style_' + str(x))
df2 = pd.concat([df2, dummies1], axis=1)

In [10]:
df2.head()

Unnamed: 0,id,sales price,Finished square feet,Number of bedrooms,Number of bathrooms,Air conditioning,Garage size,Pool,Year built,Quality,...,Style_1,Style_2,Style_3,Style_4,Style_5,Style_6,Style_7,Style_9,Style_10,Style_11
0,1,360000,3032,4,4,1,2,0,1972,2,...,1,0,0,0,0,0,0,0,0,0
1,2,340000,2058,4,2,1,2,0,1976,2,...,1,0,0,0,0,0,0,0,0,0
2,3,250000,1780,4,3,1,2,0,1980,2,...,1,0,0,0,0,0,0,0,0,0
3,4,205500,1638,4,2,1,2,0,1963,2,...,1,0,0,0,0,0,0,0,0,0
4,5,275500,2196,4,3,1,2,0,1968,2,...,0,0,0,0,0,0,1,0,0,0


## Linear Regression Test

In [11]:
import statsmodels.api as sm
import numpy as np
import pandas as pd

In [12]:
X = df2[["Finished square feet", "Number of bedrooms", "Number of bathrooms","Air conditioning","Garage size"]]
y = df2["sales price"]
X = sm.add_constant(X) ## let's add an intercept (beta_0) to our model


# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()


0,1,2,3
Dep. Variable:,sales price,R-squared:,0.712
Model:,OLS,Adj. R-squared:,0.709
Method:,Least Squares,F-statistic:,255.5
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,4.88e-137
Time:,17:46:29,Log-Likelihood:,-6592.6
No. Observations:,522,AIC:,13200.0
Df Residuals:,516,BIC:,13220.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.01e+05,1.44e+04,-7.000,0.000,-1.29e+05,-7.26e+04
Finished square feet,129.3685,7.476,17.304,0.000,114.681,144.056
Number of bedrooms,-1.454e+04,4060.252,-3.581,0.000,-2.25e+04,-6563.912
Number of bathrooms,1.905e+04,5003.064,3.808,0.000,9225.044,2.89e+04
Air conditioning,1.172e+04,9384.611,1.249,0.212,-6715.761,3.02e+04
Garage size,3.659e+04,6078.642,6.019,0.000,2.46e+04,4.85e+04

0,1,2,3
Omnibus:,140.376,Durbin-Watson:,1.377
Prob(Omnibus):,0.0,Jarque-Bera (JB):,426.3
Skew:,1.267,Prob(JB):,2.69e-93
Kurtosis:,6.63,Cond. No.,10900.0


In [16]:
import numpy as np
import pandas as pd
import scipy

import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols