# **1. 단순 선형 회귀 분석**
- 전복의 나이를 예측하는 선형회귀모델을 생성하세요.
- 전복의 ‘성별’, ‘키’, ‘지름’, ‘높이’, ‘전체무게’, ‘몸통무게’, ‘내장무게’, ‘껍질무게’를 이용해 ‘껍질의 고리 수’를 예측한 뒤, **예측된 ‘껍질의 고리 수’에 1.5를 더하면 전복의 나이**가 됩니다.

In [1]:
# 기본 모듈 불러오기
import numpy as np
import pandas as pd

**1) 데이터 load 및 변형**

In [3]:
# 데이터 로드
data = pd.read_csv("abalone.csv")
data.head()
print(data.shape)

# 성별 M은 Male, F는 Female, I는 Infant 이므로 따로 열 만들기
for label in "MFI":
    data[label] = data["Sex"] == label
data.drop('Sex', axis=1, inplace=True)

(4177, 9)


In [5]:
data

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,M,F,I
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,True,False,False
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,True,False,False
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,False,True,False
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,True,False,False
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,False,True,False
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,True,False,False
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,True,False,False
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,False,True,False


**2) X, y 선택**
: y는 Rings열, X는 Rings열을 제외한 나머지를 선택하되 전부 실수가 되도록 한다.

In [8]:
# X,y 데이터 선택
y =data["Rings"]
X =data.drop("Rings",axis=1)

 **3) train/test set 분리**

In [11]:
# 필요한 모듈 불러오기
from sklearn.model_selection import train_test_split

In [12]:
# train과 test set 분리 (train:test = 7:3 비율로)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

**4) 선형회귀모델 생성, 모델 예측치 구하기**

In [16]:
#필요한 모듈 불러오기
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [17]:
#선형회귀모델 생성 및 훈련
model =LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [58]:
# 모델 예측치 구하기
pred =model.predict(X_test)
print("모델 예측치: ",pred)
# 모델 예측치를 활용해 최종적으로 전복의 나이를 예측
print("\n전복의 나이:",pred+1.5)

모델 예측치:  [10.56802325  8.46426323 10.65687197 ... 11.33568577  8.24363502
 10.87302576]

전복의 나이: [12.06802325  9.96426323 12.15687197 ... 12.83568577  9.74363502
 12.37302576]


**5) 모델 평가: MSE, RMSE, R2 score, corr 구하기**

In [25]:
#필요한 모듈 불러오기
from sklearn.metrics import mean_squared_error, mean_squared_error, r2_score

- MSE, RMSE

In [29]:
#mse, rmse
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
print("mse:",mse)
print("rmse:",rmse)

mse: 4.810561358533467
rmse: 2.193299194942055


- R2 score

In [31]:
#R2 score 측정
r2 = r2_score(y_test, pred)
r2

0.526401377864711

- 회귀 절편값

In [32]:
#회귀 절편 값
model.intercept_

3.7258689145054324

- 회귀 계수 값

In [33]:
#회귀 계수 값
model.coef_

array([  0.39857834,  10.07799084,   9.2923264 ,   8.36118056,
       -19.14610382,  -9.91991043,   9.84566023,   0.37112738,
         0.22117052,  -0.5922979 ])

- 상관계수

Hint: corr 함수 이용.

In [39]:
# 상관계수 구하기
data.corr()["Rings"]

Length            0.556720
Diameter          0.574660
Height            0.557467
Whole weight      0.540390
Shucked weight    0.420884
Viscera weight    0.503819
Shell weight      0.627574
Rings             1.000000
M                 0.181831
F                 0.250279
I                -0.436063
Name: Rings, dtype: float64

In [59]:
correlation = np.corrcoef(y_test, pred)[0, 1]
correlation

0.7263005646047401

# **2. Polynomial features**

In [43]:
# PolynomialFeatures 라이브러리 호출
from sklearn.preprocessing import PolynomialFeatures

In [48]:
# 임의 데이터 생성
X = np.arange(6).reshape(3, 2)

df =  pd.DataFrame(X)
df.columns = ['x_1','x_2']
print(df)
print('일차 단항식 계수 피처:\n', X)

   x_1  x_2
0    0    1
1    2    3
2    4    5
일차 단항식 계수 피처:
 [[0 1]
 [2 3]
 [4 5]]


In [54]:
# 차원은 2로 설정
poly = PolynomialFeatures(degree=2)
# fit_transform 메소드를 통해 데이터 변환
poly=PolynomialFeatures(degree=2)
poly.fit(X)
poly_ftr = poly.transform(X)
print('변환된 2차 다항식 계수 피처: \n', poly_ftr)
# PolynomialFeatures로 변환 된 데이터를 데이터 프레임 형태로 변환
columns = poly.get_feature_names(['x_1', 'x_2'])
df_poly = pd.DataFrame(X_poly)
df_poly

변환된 2차 다항식 계수 피처: 
 [[ 1.  0.  1.  0.  0.  1.]
 [ 1.  2.  3.  4.  6.  9.]
 [ 1.  4.  5. 16. 20. 25.]]


Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,2.0,3.0,4.0,6.0,9.0
2,1.0,4.0,5.0,16.0,20.0,25.0


In [53]:
# df_poly의 컬럼을 1,x1,x2,x1^2,x1*x2,x2^2 로 변경
df_poly.columns = [1,"x1","x2","x1^2","x1*x2","x2^2"]
df

Unnamed: 0,1,x_1,x_2,x_1^2,x_1 x_2,x_2^2
0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,2.0,3.0,4.0,6.0,9.0
2,1.0,4.0,5.0,16.0,20.0,25.0
