In [2]:
"""
회귀분석 모델에서 행렬곱 예
 - 회귀방정식(1차함수)
 y예측치 = (X * a) + b : X(입력), a(기울기), b(절편)
 y예측치 = np.dot(X, a) + b
"""

import numpy as np
import statsmodels.formula.api as sm
import pandas as pd

# Load the score/IQ dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact location exists. Consider a configurable data directory.
score = pd.read_csv("C:/Users/hyebin/Desktop/study/python_ML/data/score_iq.csv")

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
score.info()
score.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   sid      150 non-null    int64
 1   score    150 non-null    int64
 2   iq       150 non-null    int64
 3   academy  150 non-null    int64
 4   game     150 non-null    int64
 5   tv       150 non-null    int64
dtypes: int64(6)
memory usage: 7.2 KB
None


Unnamed: 0,sid,score,iq,academy,game,tv
0,10001,90,140,2,1,0
1,10002,75,125,1,3,3
2,10003,77,120,1,0,4
3,10004,83,135,2,3,2
4,10005,65,105,0,4,4


In [3]:
# Pairwise correlation matrix across every column of the dataset.
corr_matrix = score.corr()
print(corr_matrix)

              sid     score        iq   academy      game        tv
sid      1.000000 -0.014399 -0.007048 -0.004398  0.018806  0.024565
score   -0.014399  1.000000  0.882220  0.896265 -0.298193 -0.819752
iq      -0.007048  0.882220  1.000000  0.671783 -0.031516 -0.585033
academy -0.004398  0.896265  0.671783  1.000000 -0.351315 -0.948551
game     0.018806 -0.298193 -0.031516 -0.351315  1.000000  0.239217
tv       0.024565 -0.819752 -0.585033 -0.948551  0.239217  1.000000


In [4]:
# Build a subset holding only the variables used by the regression.
#   y = score         : dependent variable
#   x = iq + academy  : independent variables
# (Kept as comments: a bare string literal at the end of a cell is an
# expression statement, and Jupyter echoes a cell's last expression.)
score_df = score[['score', 'iq', 'academy']]
score_df.info()  # reports 3 columns in total

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   score    150 non-null    int64
 1   iq       150 non-null    int64
 2   academy  150 non-null    int64
dtypes: int64(3)
memory usage: 3.6 KB


In [5]:
# Statistical approach: fit an OLS regression using the formula 'y ~ x1 + x2'.
lr_model = sm.ols(formula = 'score ~ iq + academy', data = score_df).fit()

# Fitted coefficients (intercept and slopes). Values from the recorded run:
#   Intercept    25.229141 : intercept
#   iq            0.376966 : slope for x1
#   academy       2.992800 : slope for x2
# (Kept as comments: a trailing bare string literal is an expression that
# Jupyter would echo as the cell result, not a comment.)
print(lr_model.params)

Intercept    25.229141
iq            0.376966
academy       2.992800
dtype: float64


In [6]:
# Prepare the pieces needed to compute predictions: X data, slopes, intercept.
score_y = score_df['score']            # target (ground-truth) column
score_X = score_df[['iq', 'academy']]  # predictor columns
score_y.shape # (150,)

(150,)

In [7]:
# Predictor matrix (X data): one row per observation, one column per predictor.
score_X.shape # (150, 2)

(150, 2)

In [8]:
# Regression equation expressed as a matrix product:
#   y_pred = dot(X, slope) + intercept

# Slopes are read from the fitted model instead of retyping rounded values from
# the printed output, so they stay in sync with the data and keep full precision.
# Reshaped into a column vector so that (n, 2) . (2, 1) -> (n, 1).
slope = lr_model.params[['iq', 'academy']].to_numpy().reshape(-1, 1)
slope.shape # (2, 1)

(2, 1)

In [9]:
# Intercept, read from the fitted model rather than a hand-copied rounded literal.
Intercept = lr_model.params['Intercept']

# Predicted y. X is (150, 2) and slope is (2, 1); element-wise `*` cannot
# combine these shapes, so the matrix product np.dot is used instead.
y_pred = np.dot(score_X, slope) + Intercept

y_pred.shape # (150, 1)

(150, 1)

In [10]:
# Flatten the (n, 1) column of predictions to 1-D so it has the same
# dimensionality as score_y. -1 lets NumPy infer the length, avoiding a
# hard-coded row count that would fail if the dataset size changed.
y_pred_1d = y_pred.reshape(-1)
y_pred_1d.shape # (150,)

(150,)

In [11]:
# Side-by-side table of predicted values and actual scores.
df = pd.DataFrame(dict(predict=y_pred_1d, score=score_y))
df.head(5)

Unnamed: 0,predict,score
0,83.989981,90
1,75.342691,75
2,73.457861,77
3,82.105151,83
4,64.810571,65


In [12]:
# Last five rows of the predicted-vs-actual table.
df.tail(5)

Unnamed: 0,predict,score
145,82.105151,83
146,64.810571,65
147,80.574359,80
148,64.810571,65
149,82.105151,83


In [13]:
# Correlation between the ground truth and the predictions.
predicted = df.loc[:, 'predict']
actual = df.loc[:, 'score']
cor = predicted.corr(actual) # 0.972779 in the recorded run
print('corr = ', cor)

corr =  0.972779206959475
