In [21]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from statsmodels.api import qqplot, add_constant
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
# import graphviz

from sklearn.ensemble import RandomForestRegressor as RFR

from sklearn.ensemble import GradientBoostingRegressor as GBR

#평가함수
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [22]:
# RMSE: root mean squared error
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        The RMSE, expressed in the same units as the target.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# MAPE: mean absolute percentage error
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Values are compared position by position. Inputs are converted to NumPy
    arrays first because pandas arithmetic aligns operands on their index
    labels: passing a shuffled test-set Series together with predictions that
    carry a different index would otherwise pair up the wrong rows (or produce
    NaN) without raising an error.

    Args:
        y_true: Observed target values (list, array or Series).
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.

    Note:
        The metric is undefined where ``y_true`` is 0; such entries make the
        result ``inf`` or ``nan`` under NumPy division rules.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100


In [23]:
# Multiple linear regression analysis

# Read the raw dataset from CSV, then show its first five rows as the cell output.
df_raw = pd.read_csv(filepath_or_buffer="D:/data/2/체질검사.csv")
df_raw.head(n=5)

Unnamed: 0,FAT,AGE,WEIGHT,HEIGHT,NECK,CHEST,ABDOMEN,HIP,THIGH,KNEE,ANKLE,BICEPS,FOREARM,WRIST
0,35.2,46,363.15,72.25,51.2,136.2,148.1,147.7,87.3,49.1,29.6,45.0,29.0,21.4
1,11.8,27,168.0,71.25,38.1,93.0,79.1,94.5,57.3,36.2,24.5,29.0,30.0,18.8
2,22.2,69,177.75,68.5,38.7,102.0,95.0,98.3,55.0,38.3,21.8,30.8,25.7,18.8
3,10.6,57,147.75,65.75,35.2,99.6,86.4,90.1,53.0,35.0,21.3,31.7,27.3,16.9
4,47.5,51,219.0,64.0,41.2,119.8,122.1,112.8,62.5,36.9,23.6,34.7,29.1,18.4


In [24]:
# Features are every column except FAT; FAT itself is the regression target.
df_raw_x = df_raw.drop(columns='FAT')
df_raw_y = df_raw['FAT']


In [27]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
df_train, df_test = train_test_split(df_raw, test_size=0.3, random_state=1234)
print('train data size: {}'.format(df_train.shape))
# This line reports df_test, so it is labelled as the test set.
print('test data size: {}'.format(df_test.shape))

train data size: (176, 14)
train data size: (76, 14)


In [28]:
# Split features and target with the same test_size and random_state that the
# df_train/df_test split uses (presumably so both splits pick the same rows;
# confirm if either call is ever changed).
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
    df_raw_x,
    df_raw_y,
    test_size=0.3,
    random_state=1234,
)

# Report the shape of each of the four pieces.
print(f'train data X size:{df_train_x.shape}')
print(f'train data Y size:{df_train_y.shape}')
print(f'test data X size:{df_test_x.shape}')
print(f'test data Y size:{df_test_y.shape}')

train data X size:(176, 13)
train data Y size:(176,)
test data X size:(76, 13)
test data Y size:(76,)


In [29]:
# Regression model: ordinary least squares for FAT on five predictors,
# fitted on the training frame.
lr_model = smf.ols(
    "FAT ~ HEIGHT + NECK + ABDOMEN + HIP + FOREARM",
    data=df_train,
)
lr_result = lr_model.fit()

# Print the fitted model's summary table.
print(lr_result.summary())


                            OLS Regression Results                            
Dep. Variable:                    FAT   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.743
Method:                 Least Squares   F-statistic:                     102.3
Date:                Wed, 10 Aug 2022   Prob (F-statistic):           2.18e-49
Time:                        13:31:13   Log-Likelihood:                -501.89
No. Observations:                 176   AIC:                             1016.
Df Residuals:                     170   BIC:                             1035.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7817      7.302      0.381      0.7

In [30]:
# Decision tree model
dt_model = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=20,
    min_samples_leaf=8,
    random_state=1234,
)
dt_model.fit(df_train_x, df_train_y)

# Explanatory power on the training data (score() of a scikit-learn regressor is R^2)
print(f"Score on training set:{dt_model.score(df_train_x, df_train_y):.3f}")

# Explanatory power on the test data
print(f"Score on test set:{dt_model.score(df_test_x, df_test_y):.3f}")


Score on training set:0.783
Score on test set:0.570


In [31]:
# Random forest model
# RFR is the alias under which the imports cell binds RandomForestRegressor;
# the un-aliased name is never imported there, so calling it raises NameError
# on a fresh kernel.
rf_model = RFR(random_state=1234, n_estimators=100,
               min_samples_leaf=6, min_samples_split=14,
               max_depth=4)
rf_model.fit(df_train_x, df_train_y)

# Explanatory power on the training data (score() of a scikit-learn regressor is R^2)
print("Score on training set:{:.3f}".format(rf_model.score(df_train_x, df_train_y)))

# Explanatory power on the test data
print("Score on test set:{:.3f}".format(rf_model.score(df_test_x, df_test_y)))


Score on training set:0.825
Score on test set:0.627


In [34]:
# Gradient boosting model
# GBR is the alias under which the imports cell binds GradientBoostingRegressor;
# the un-aliased name is never imported there, so calling it raises NameError
# on a fresh kernel.
gb_model = GBR(random_state=1234, n_estimators=30,
               min_samples_leaf=11,
               min_samples_split=22,
               max_depth=4, learning_rate=0.1)
gb_model.fit(df_train_x, df_train_y)

# Explanatory power on the training data (score() of a scikit-learn regressor is R^2)
print("Score on training set:{:.3f}".format(gb_model.score(df_train_x, df_train_y)))

# Explanatory power on the test data
print("Score on test set:{:.3f}".format(gb_model.score(df_test_x, df_test_y)))


Score on training set:0.900
Score on test set:0.629


In [35]:
# Display labels for the four fitted models, in the order they were trained:
# regression analysis, decision tree, random forest, gradient boosting.
# The first label was misspelled ('회귀순석'); '회귀분석' is "regression analysis".
models = ['회귀분석', '의사결정나무', '랜덤 포레스트', '그래디언트 부스팅']

# One list per metric; each model's evaluation appends one value to each list.
# NOTE: `rmse` rebinds the name imported from statsmodels.tools.eval_measures
# in the imports cell, so that function is no longer reachable as `rmse`.
mse, rmse, mae, mape = [], [], [], []

In [37]:
# Prediction: apply the fitted OLS model to the held-out test frame
lr_y_pred = lr_result.predict(df_test)

# Evaluation: append this model's error metrics to the per-metric lists
mse.append(mean_squared_error(df_test['FAT'], lr_y_pred))
rmse.append(root_mean_squared_error(df_test['FAT'], lr_y_pred))
# The `mae` list must hold the mean absolute error; it previously received the
# MAPE value, which mislabelled the metric.
# NOTE(review): confirm that MAPE for this model is appended to `mape` in the
# remainder of this cell.
mae.append(mean_absolute_error(df_test['FAT'], lr_y_pred))