In [11]:
###

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

car = pd.read_csv('./data/CarPrice_Assignment.csv')
car.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [2]:
# 연속형 변수들을 설명변수 price를 타깃변수로 설정
car_num = car.select_dtypes(['number'])
features = list(car_num.columns.difference(['car_ID','symboling','price']))

X= car_num[features]
y= car_num['price']

# 학습 데이터와 테스트 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### (1) LinearRegresssion

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 모델 생성
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

# 테스트 데이터 예측
y_pred = model.predict(X_test)

# 평가
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= model.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	14628664.036
MAE	2701.749
RMSE	3824.744
ACC	0.815


### (2) 다중회귀

In [4]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# 모델 생성
ridge = Ridge()
lasso = Lasso()
elasticnet = ElasticNet()

# 모델학습
ridge.fit(X_train,y_train)
lasso.fit(X_train,y_train)
elasticnet.fit(X_train,y_train)

# 테스트 데이터 예측
y_pred_rid = ridge.predict(X_test)
y_pred_las = lasso.predict(X_test)
y_pred_ela = elasticnet.predict(X_test)


# 평가
mse_rid = mean_squared_error(y_test, y_pred_rid)
mae_rid = mean_absolute_error(y_test, y_pred_rid)
rmse_rid = np.sqrt(mse_rid)
acc_rid = ridge.score(X_test, y_test)

print('MSE_ridge\t{}'.format(round(mse_rid,3)))
print('MAE_ridge\t{}'.format(round(mae_rid,3)))
print('RMSE_ridge\t{}'.format(round(rmse_rid,3)))
print('ACC_ridge\t{}'.format(round(acc_rid,3)))

mse_las = mean_squared_error(y_test, y_pred_las)
mae_las = mean_absolute_error(y_test, y_pred_las)
rmse_las = np.sqrt(mse_las)
acc_las= lasso.score(X_test, y_test)

print('MSE_las\t{}'.format(round(mse_las,3)))
print('MAE_las\t{}'.format(round(mae_las,3)))
print('RMSE_las\t{}'.format(round(rmse_las,3)))
print('ACC_las\t{}'.format(round(acc_las,3)))

mse_ela = mean_squared_error(y_test, y_pred_ela)
mae_ela = mean_absolute_error(y_test, y_pred_ela)
rmse_ela = np.sqrt(mse_ela)
acc_ela= elasticnet.score(X_test, y_test)

print('MSE_ela\t{}'.format(round(mse_ela,3)))
print('MAE_ela\t{}'.format(round(mae_ela,3)))
print('RMSE_ela\t{}'.format(round(rmse_ela,3)))
print('ACC_ela\t{}'.format(round(acc_ela,3)))

MSE_ridge	14589795.699
MAE_ridge	2703.757
RMSE_ridge	3819.659
ACC_ridge	0.815
MSE_las	14628811.309
MAE_las	2703.271
RMSE_las	3824.763
ACC_las	0.815
MSE_ela	14125856.131
MAE_ela	2667.43
RMSE_ela	3758.438
ACC_ela	0.821


### (4) SVR

In [5]:
df= pd.read_csv('./data/student_data.csv')

from sklearn.impute import KNNImputer

# 결측치가 있는 수치형 데이터 만을 추출
KNN_data = df.drop(columns=['school','sex','paid','activities'])

# 모델링
imputer = KNNImputer()
df_filled = imputer.fit_transform(KNN_data)
df_filled = pd.DataFrame(df_filled, columns=KNN_data.columns)
df[KNN_data.columns] = df_filled

df = pd.get_dummies(data = df, columns=['school','sex','paid','activities'],drop_first=True)

from sklearn.model_selection import train_test_split
X= df.drop(columns = 'grade')
y= df['grade']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1)

In [6]:
from sklearn.svm import SVR

# 모델 생성
svr_rbf = SVR(kernel = 'rbf', C=100, gamma=0.1, epsilon=0.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
#svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=0.1, coef0=1)

# 모델학습
svr_rbf.fit(X_train, y_train)
svr_lin.fit(X_train, y_train)
#svr_poly.fit(X_train, y_train)

# 테스트 데이터 예측
rbf_pred = svr_rbf.predict(X_test)
lin_pred = svr_lin.predict(X_test)
#poly_pred = svr_poly.predict(X_test)

preds = [rbf_pred, lin_pred]
kernel = ['rbf','linear']
evls= ['mse','rmse', 'mae']

results = pd.DataFrame(index=kernel, columns=evls)

for pred, nm in zip(preds, kernel):
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mse)
    
    results.loc[nm]['mse']=round(mse,2)
    results.loc[nm]['rmse']=round(rmse,2)
    results.loc[nm]['mae']=round(mae,2)

results

Unnamed: 0,mse,rmse,mae
rbf,2.02,1.42,0.94
linear,0.52,0.72,0.45


### (5) KNetighborsRegressor

In [7]:
from sklearn.neighbors import KNeighborsRegressor

# 모델 생성
knn_uni = KNeighborsRegressor(n_neighbors=20, weights='uniform')
knn_dis = KNeighborsRegressor(n_neighbors=20, weights='distance')

# 모델 학습
knn_uni.fit(X_train, y_train)
knn_dis.fit(X_train, y_train)

# 테스트 데이터 예측
uni_pred = knn_uni.predict(X_test)
dis_pred = knn_dis.predict(X_test)

preds = [uni_pred, dis_pred]
weights = ['uniform','distance']
evls = ['mse', 'rmse','mae']

results = pd.DataFrame(index=weights, columns=evls)

for pred, nm in zip(preds,weights):
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mse)
    
    results.loc[nm]['mse']=round(mse,2)
    results.loc[nm]['rmse']=round(rmse,2)
    results.loc[nm]['mae']=round(mae,2)

results

Unnamed: 0,mse,rmse,mae
uniform,1.13,1.06,0.77
distance,1.02,1.01,0.74


### (6) Bagging

In [8]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# 모델 생성
reg = BaggingRegressor(base_estimator = DecisionTreeRegressor(), oob_score =True)

# 모델 학습
reg.fit(X_train,y_train)

# 테스트 데이터 예측
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.467
MAE	0.48
RMSE	0.683
ACC	0.958


### (7) Boosting

In [9]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
# 모델 생성
reg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor())

# 모델 학습
reg.fit(X_train,y_train)

# 테스트 데이터 예측
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.571
MAE	0.403
RMSE	0.756
ACC	0.948


### (8) RandomForestRegressor

In [10]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor()
pred = reg.fit(X_train, y_train).predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.406
MAE	0.453
RMSE	0.637
ACC	0.963


### (9) MLP

In [11]:
from sklearn.neural_network import MLPRegressor

reg = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', learning_rate_init=0.001, alpha=0.0001)

reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.443
MAE	0.469
RMSE	0.666
ACC	0.96


### (10) XGBRegressor

In [12]:
from xgboost import XGBRegressor

# 모델 생성
reg = XGBRegressor(n_estimators=100, learning_rate=0.5)
reg.fit(X_train,y_train)
y_pred= reg.predict(X_test)

# 모델 평가
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.73
MAE	0.633
RMSE	0.855
ACC	0.934


### (11) LGBMRegressor

In [13]:
# !pip install lightgbm
from lightgbm import LGBMRegressor

# 모델 생성
reg = LGBMRegressor(n_estimators=100, learning_rate=0.5)
reg.fit(X_train,y_train)
y_pred= reg.predict(X_test)

# 모델 평가
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.691
MAE	0.668
RMSE	0.831
ACC	0.937


### (12) 앙상블 

In [14]:
# voting

from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# 회귀 모델 생성
regressor1 = LinearRegression()
regressor2 = DecisionTreeRegressor()
regressor3 = RandomForestRegressor()

# VotingRegressor 모델 생성
voting_regressor = VotingRegressor(estimators=[('lr', regressor1), ('dt', regressor2), ('rf', regressor3)])

# VotingRegressor 모델 학습
voting_regressor.fit(X_train, y_train)

# 테스트 데이터로 예측
y_pred = voting_regressor.predict(X_test)

# 모델 평가
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.37
MAE	0.43
RMSE	0.608
ACC	0.937


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor


# 기본 모델들 생성
estimators = [
    ('rf', RandomForestRegressor()),
    ('lr', LinearRegression()),
    ('svr', SVR())
]

# 메타 모델 생성
meta_model = LinearRegression()

# 스태킹 모델 생성
stacking_model = StackingRegressor(estimators=estimators, final_estimator=meta_model)


# 스태킹 모델 학습
stacking_model.fit(X_train, y_train)

# 테스트 데이터 예측
y_pred = stacking_model.predict(X_test)

# 모델 평가
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

MSE	0.388
MAE	0.431
RMSE	0.623
ACC	0.937


### (13) GridSearch

In [17]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor(n_estimators=1000)

param_grid = [{'max_depth':[2,4,6,8,10]}]

grid_xgb = GridSearchCV(xgb, param_grid=param_grid,
                                     cv=3, n_jobs=-1)
grid_xgb.fit(X_train,y_train)
print('final params', grid_xgb.best_params_)   # 최적의 파라미터 값 출력
print('Train data best score', grid_xgb.best_score_)   # 최고의 점수

# 테스트 데이터 평가
y_pred = grid_xgb.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

final params {'max_depth': 10}
Train data best score 0.9594572842760766
MSE	0.486
MAE	0.5
RMSE	0.697
ACC	0.937


### (14) 랜덤그리드 서치

In [23]:
from sklearn.model_selection import RandomizedSearchCV

xgb = XGBRegressor(n_estimators=1000)

param_grid = [{'max_depth':[2,4,6,8,10]}]

grid_xgb = RandomizedSearchCV(xgb, param_distributions=param_grid,
                                     cv=3,n_iter=3, n_jobs=-1, random_state=5)
grid_xgb.fit(X_train,y_train)
print('final params', grid_xgb.best_params_)   # 최적의 파라미터 값 출력
print('Train data best score', grid_xgb.best_score_)   # 최고의 점수

# 테스트 데이터 평가
y_pred = grid_xgb.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
acc= reg.score(X_test, y_test)

print('MSE\t{}'.format(round(mse,3)))
print('MAE\t{}'.format(round(mae,3)))
print('RMSE\t{}'.format(round(rmse,3)))
print('ACC\t{}'.format(round(acc,3)))

final params {'max_depth': 10}
Train data best score 0.9594572842760766
MSE	0.486
MAE	0.5
RMSE	0.697
ACC	0.937
