## 회귀분석팀 3주차 클린업 파이썬 예제

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the already-preprocessed 2017-18 NBA salary table (path is relative to the
# notebook's working directory).
data = pd.read_csv('2017-18_NBA_salary2.csv')

기본 전처리를 마친 데이터를 불러온다. 파이썬이 주력 언어가 아니어서 전처리(결측치 제거, 문제 있는 관측치 제거, 컬럼 제거)는 미리 해 두었다.

In [4]:
# Target: the Salary column, kept as a one-column DataFrame.
# The previous positional selector `[-0]` is simply `[0]` (negative zero is zero), so it
# only picked Salary because that column happens to come first. Selecting by name
# states the intent and matches the `drop(columns=['Salary'])` used for the features.
data_y = data[['Salary']]
# Features: every column except the target.
data_x = data.drop(columns=['Salary'])

In [5]:
# Preview the first rows of the feature table.
data_x.head()

Unnamed: 0,NBA_DraftNumber,Age,MP,PER,TS%,DRB%,VORP,G,OBPM,DBPM,BPM,OWS,DWS,WS,AST%,STL%
0,43,22,87,0.6,0.303,16.8,-0.2,16,-10.6,0.5,-10.1,-0.4,0.1,-0.2,1.5,1.1
1,42,33,937,16.8,0.608,25.0,0.7,66,-0.6,1.3,0.8,1.7,1.4,3.1,15.4,1.9
2,19,36,1508,17.3,0.529,23.8,0.0,59,-0.6,-1.3,-1.9,0.3,1.1,1.4,14.9,1.4
3,13,22,656,14.6,0.499,14.4,-0.1,24,-0.7,-2.0,-2.6,-0.1,0.5,0.4,18.6,1.8
4,10,20,979,8.2,0.487,18.3,-0.2,62,-3.7,0.9,-2.9,-0.4,1.2,0.8,7.3,0.8


In [6]:
# Preview the first rows of the target (Salary) table.
data_y.head()

Unnamed: 0,Salary
0,19.976838
1,27.4841
2,36.294336
3,26.989769
4,26.716065


In [7]:
# Keep the feature names so they can be reattached once the scaled values are
# wrapped back into a DataFrame.
col_names = data_x.columns

In [8]:
# Standardize every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test rows contribute to the fitted statistics; consider fitting on the
# training rows only.
standardScaler = StandardScaler()
# Fit and transform in one call, then rebuild the DataFrame in a single step with the
# saved column names and the original row labels, so the features stay aligned with
# data_y even if the index is not the default 0..n-1 range.
data_x = pd.DataFrame(
    standardScaler.fit_transform(data_x),
    columns=col_names,
    index=data_x.index,
)

In [9]:
# Preview the feature table again, now holding the standardized values.
data_x.head()

Unnamed: 0,NBA_DraftNumber,Age,MP,PER,TS%,DRB%,VORP,G,OBPM,DBPM,BPM,OWS,DWS,WS,AST%,STL%
0,0.650289,-1.004976,-1.333247,-1.509411,-2.133965,0.258369,-0.644549,-1.404987,-1.940168,0.410942,-1.579593,-0.896228,-1.051963,-1.002106,-1.268535,-0.446609
1,0.602847,1.57125,-0.280471,0.396572,0.649736,1.470814,0.076633,0.628347,0.119331,0.753439,0.45536,0.219198,0.207211,0.233249,0.263713,0.365561
2,-0.488333,2.273857,0.426747,0.455399,-0.071288,1.293383,-0.484286,0.34368,0.119331,-0.359675,-0.048711,-0.524419,-0.083368,-0.403146,0.208596,-0.142045
3,-0.772989,-1.004976,-0.628506,0.137735,-0.345095,-0.096493,-0.564418,-1.079654,0.098736,-0.659359,-0.179396,-0.736881,-0.664525,-0.777496,0.616461,0.264039
4,-0.915317,-1.473381,-0.228451,-0.615246,-0.454618,0.480158,-0.644549,0.46568,-0.519114,0.582191,-0.235404,-0.896228,0.013492,-0.627756,-0.62918,-0.751172


In [10]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.2, random_state=2020)

### 모델 만들기

In [11]:
from sklearn.metrics import mean_squared_error #모델 평가지표
from sklearn.model_selection import cross_val_score # 평가
from sklearn.model_selection import validation_curve# 평가
from sklearn.model_selection import GridSearchCV # Cv

### Ridge

In [12]:
# Candidate values for the `alpha` hyper-parameter tried by the grid searches.
alphas = [0.1,0.5, 1, 5, 10, 20, 40, 50, 60, 70, 80, 100]

tuning parameter 정하기

scikit-learn에서는 규제 강도를 람다(λ)가 아니라 `alpha`로 지정한다. `alpha`는 λ와 같은 역할을 하므로 값이 클수록 규제가 강해진다. (역수로 들어가는 것은 `LogisticRegression` 등에서 쓰는 `C`이다.)

In [13]:
from sklearn.linear_model import Ridge

In [14]:
# 5-fold cross-validated grid search over the ridge `alpha` candidates,
# scored by negated mean squared error.
ridge = Ridge()
parameters = dict(alpha=alphas)
ridge_reg = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_reg.fit(train_x, train_y)
# Show the winning setting, then its mean cross-validated score.
print(ridge_reg.best_params_, ridge_reg.best_score_, sep='\n')

{'alpha': 60}
-35.103738519142055


`neg_mean_squared_error`는 MSE에 음수 부호를 붙인 값이라서 0에 가까울수록 좋다!

cv를 통해 최적의 파라미터 alpha = 60을 구했다.

In [15]:
# Read the winning alpha from the grid search instead of hand-copying it from the
# printed output, so this cell stays correct if the data, grid or split changes.
opt_ridge_alpha = ridge_reg.best_params_['alpha']
# Refit a ridge model on the whole training set with the selected penalty.
ridge = Ridge(alpha=opt_ridge_alpha).fit(train_x, train_y)

정해진 튜닝 파라미터로 전체 train set에 대해 모델을 적합한다. 예측은 좀 있다가 한 번에 하자!

### Lasso

In [16]:
from sklearn.linear_model import Lasso

In [17]:
# 5-fold cross-validated grid search over the lasso `alpha` candidates,
# scored by negated mean squared error.
lasso = Lasso()
parameters = dict(alpha=alphas)
lasso_reg = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_reg.fit(train_x, train_y)
# Show the winning setting, then its mean cross-validated score.
print(lasso_reg.best_params_, lasso_reg.best_score_, sep='\n')

{'alpha': 0.5}
-32.59798174238084


In [18]:
# Read the winning alpha from the grid search instead of hand-copying it from the
# printed output, so this cell stays correct if the data, grid or split changes.
opt_lasso_alpha = lasso_reg.best_params_['alpha']
# Refit a lasso model on the whole training set with the selected penalty.
lasso = Lasso(alpha=opt_lasso_alpha).fit(train_x, train_y)

동일한 과정이다

### elastic net

In [19]:
# Candidate `l1_ratio` values for the elastic net (the mix between the lasso and
# ridge penalties).
l1_ratio = [0.2, 0.4, 0.6, 0.8]

In [20]:
from sklearn.linear_model import ElasticNet

`l1_ratio`로 릿지(L2)와 라쏘(L1) 페널티의 혼합 비율을 설정한다! (1에 가까울수록 라쏘에 가깝다.)

In [21]:
# Raise the iteration cap above scikit-learn's default: with the default, coordinate
# descent stopped before converging for part of the grid and flooded the output with
# convergence warnings. A larger cap gives the solver room to finish.
elastic = ElasticNet(max_iter=10_000)

# Search alpha on a log scale (1e-5 ... 1e2) jointly with the mixing ratios held in
# `l1_ratio`, instead of repeating the same literal list here. The stale commented-out
# grid (whose key was misspelled 'l1_rati0') has been removed.
elasticnet_reg = GridSearchCV(
    elastic,
    param_grid={'alpha': np.logspace(-5, 2, 8), 'l1_ratio': l1_ratio},
    scoring='neg_mean_squared_error',
    cv=5,
)
elasticnet_reg.fit(train_x, train_y)
print(elasticnet_reg.best_params_)
print(elasticnet_reg.best_score_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'alpha': 0.1, 'l1_ratio': 0.8}
-34.4079270640098


잘 수렴하지 않았다고 하고 지금 duality gap이 꽤나 큰데....일단 그냥 가자 ㅎㅎㅎ

In [22]:
# Refit on the whole training set with the hyper-parameters chosen by the grid search,
# rather than hand-copying them from the printed output. max_iter is raised above the
# default only as an upper bound, so the solver is not cut short if a small alpha wins.
elastic = ElasticNet(max_iter=10_000, **elasticnet_reg.best_params_).fit(train_x, train_y)

### 모델 비교!

In [23]:
# Predict on the held-out test set with each fitted model (ridge, lasso, elastic net).
ridge_pred_y, lasso_pred_y, elastic_pred_y = [
    fitted.predict(test_x) for fitted in (ridge, lasso, elastic)
]

# Test-set mean squared error of each model, in the same order.
ridge_mse, lasso_mse, elastic_mse = [
    mean_squared_error(test_y, predicted)
    for predicted in (ridge_pred_y, lasso_pred_y, elastic_pred_y)
]

In [24]:
# Test MSE of ridge, lasso and elastic net, one value per line.
print(ridge_mse, lasso_mse, elastic_mse, sep='\n')

29.055221203060526
29.028773545020552
28.878785357816884


테스트 MSE 기준으로는 elastic net이 제일 좋았네?! 다만 세 모델의 차이는 아주 작다.