### Linear Regression

In [1]:
import pandas as pd
from pandas.io.formats.format import DataFrameFormatter
import numpy as np
from ISLP import load_data

import statsmodels.api as sm

from statsmodels.stats.anova import anova_lm
from ISLP.models import (ModelSpec as MS, summarize, poly, contrast)

In [2]:
Boston = load_data("Boston")
Boston.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'lstat', 'medv'],
      dtype='object')

In [15]:
X = pd.DataFrame({"intercept" : np.ones(Boston.shape[0]), 'lstat' : Boston['lstat']})
y = Boston.medv

model = sm.OLS(y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,601.6
Date:,"Mon, 28 Oct 2024",Prob (F-statistic):,5.08e-88
Time:,18:13:58,Log-Likelihood:,-1641.5
No. Observations:,506,AIC:,3287.0
Df Residuals:,504,BIC:,3295.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,34.5538,0.563,61.415,0.000,33.448,35.659
lstat,-0.9500,0.039,-24.528,0.000,-1.026,-0.874

0,1,2,3
Omnibus:,137.043,Durbin-Watson:,0.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,291.373
Skew:,1.453,Prob(JB):,5.36e-64
Kurtosis:,5.319,Cond. No.,29.7


In [16]:
summarize(results)

Unnamed: 0,coef,std err,t,P>|t|
intercept,34.5538,0.563,61.415,0.0
lstat,-0.95,0.039,-24.528,0.0


In [22]:
yhat = results.get_prediction(X)
yhat.predicted_mean
yhat.conf_int(alpha = 0.05)
yhat.conf_int(obs = True, alpha = 0.05)

array([[17.58460325, 42.06058695],
       [13.64341294, 38.09736664],
       [18.48348779, 42.96679618],
       ...,
       [16.95986422, 41.43126083],
       [16.16444421, 40.63059793],
       [14.83800291, 39.29690103]])

`-` 가변수 처리

In [27]:
bike = load_data('Bikeshare')
encod = contrast('hr', 'sum')
X2 = MS([encod, 'workingday','temp', 'weathersit']).fit_transform(bike) ## 마지막 계수는 (-1, -1, ..., -1)(C-1개)
y2 = bike.bikers

model2 = sm.OLS(y2, X2)
results2 = model2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,bikers,R-squared:,0.654
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,580.5
Date:,"Mon, 28 Oct 2024",Prob (F-statistic):,0.0
Time:,19:29:59,Log-Likelihood:,-50013.0
No. Observations:,8645,AIC:,100100.0
Df Residuals:,8616,BIC:,100300.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,37.2651,2.678,13.917,0.000,32.016,42.514
hr[0],-93.7371,4.068,-23.044,0.000,-101.711,-85.763
hr[1],-107.8735,4.074,-26.479,0.000,-115.859,-99.888
hr[2],-113.9943,4.118,-27.680,0.000,-122.067,-105.922
hr[3],-122.5199,4.175,-29.345,0.000,-130.704,-114.336
hr[4],-127.3834,4.206,-30.288,0.000,-135.628,-119.139
hr[5],-114.9436,4.117,-27.921,0.000,-123.013,-106.874
hr[6],-70.9227,4.074,-17.410,0.000,-78.908,-62.937
hr[7],27.7307,4.061,6.829,0.000,19.771,35.691

0,1,2,3
Omnibus:,311.343,Durbin-Watson:,0.484
Prob(Omnibus):,0.0,Jarque-Bera (JB):,540.711
Skew:,0.304,Prob(JB):,3.86e-118
Kurtosis:,4.064,Cond. No.,127.0


### Classification

In [55]:
from ISLP import confusion_table
from sklearn.discriminant_analysis import  (LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [28]:
Smarket = load_data('Smarket')
allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
X = MS(allvars).fit_transform(Smarket)
y = Smarket.Direction == "Up"

In [33]:
glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-0.126,0.241,-0.523,0.601
Lag1,-0.0731,0.05,-1.457,0.145
Lag2,-0.0423,0.05,-0.845,0.398
Lag3,0.0111,0.05,0.222,0.824
Lag4,0.0094,0.05,0.187,0.851
Lag5,0.0103,0.05,0.208,0.835
Volume,0.1354,0.158,0.855,0.392


In [54]:
pred = np.array([True]*len(y))
pred[results.predict() <= 0.5] = False
confusion_table(pred, y)

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,145,141
True,457,507


In [9]:
lda = LDA(store_covariance = True)
lda.fit(X, y)
print(lda.means_) ## 범주 1에 대한 mean vector, 범주 2에 대한 mean vector
print(lda.covariance_) ## 공분산 행렬(공유)
print(lda.classes_) ## 분류 라벨
print(lda.priors_) ## y 실제 범주의 비율(pi_k의 추정치)

[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
[[ 1.50886781 -0.03340234]
 [-0.03340234  1.5095363 ]]
['Down' 'Up']
[0.49198397 0.50801603]


In [30]:
qda = QDA(store_covariance=True)
print(qda.fit(XX_train, L_train))

qda.means_, qda.priors_
print(qda.covariance_[0]) ## 공분산 행렬이 다름을 가정
print(qda.covariance_[1])

QuadraticDiscriminantAnalysis(store_covariance=True)
[[ 1.50662277 -0.03924806]
 [-0.03924806  1.53559498]]
[[ 1.51700576 -0.02787349]
 [-0.02787349  1.49026815]]


In [None]:
NB = GaussianNB()
rs = NB.fit(XX_train, L_train)
print(rs)
print(NB.class_prior_) ## 
print(NB.theta_) ## 각 종별 평균 벡터
print(NB.var_) ## 공분산 행렬이 아님. QDA와 같은 느낌인데 분산 term만 나타낸 것
NB.predict(X)

In [None]:
knn1 = KNeighborsClassifier(n_neighbors=1) ## K = 1인 KNN
knn1.fit(XX_train , L_train)
knn1_pred = knn1.predict(XX_test)
print(confusion_table(knn1_pred , L_test))

knn3 = KNeighborsClassifier(n_neighbors=3)
knn3_pred = knn3.fit(XX_train , L_train).predict(XX_test)
print(confusion_table(knn3_pred , L_test))