In [1]:
import pandas as pd
import numpy as np
import heejin
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder, RobustScaler, \
    PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR,SVC
from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error,mean_squared_log_error,classification_report

In [2]:
dataDF = pd.read_csv('../data/Hamburger.csv', encoding='latin1')
dataDF.info()

dataDF['restaurant'].replace('macdonald','McDonalds',inplace=True)
dataDF['restaurant'].replace('Mcdonald','McDonalds',inplace=True)
dataDF['restaurant'].replace('Mcdonalds','McDonalds',inplace=True)
dataDF['restaurant'].replace('burgerking','Burger King',inplace = True)

# 중복값 제거 - 2개 
dataDF.duplicated().sum()
dataDF.drop_duplicates(inplace=True)

# 결측치 제거 - protein - 94개
dataDF.isna().sum()
dataDF.dropna(subset = 'protein',inplace = True)

# 전처리한 데이터 파일로 
dataDF.to_csv('../data/Hambuger_2.csv')
# 타겟 : calories
# 피쳐 : sodium, sugar, total_fat, portein
targetSR = dataDF['calories']
featureDF = dataDF[dataDF.columns[3:]]
targetSR.shape , featureDF.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   restaurant  1068 non-null   object 
 1   item        1068 non-null   object 
 2   calories    1068 non-null   float64
 3   sodium      1068 non-null   float64
 4   sugar       1068 non-null   float64
 5   total_fat   1068 non-null   float64
 6   protein     974 non-null    float64
dtypes: float64(5), object(2)
memory usage: 58.5+ KB


((972,), (972, 4))

In [16]:
# 학습용 데이터셋, 검증용 데이터셋 분리
xtrain, xtest, ytrain, ytest = train_test_split(featureDF, targetSR, test_size=0.2, random_state=48)

In [17]:
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [32]:
# 여러 모델 인스턴스 생성
from joblib import load
import joblib
import os

knn = KNeighborsRegressor(n_neighbors=2, weights='distance',p=1)
knn.fit(xtrain, ytrain)

In [19]:
li = LinearRegression(fit_intercept=False,copy_X=False, n_jobs=100)
li.fit(xtrain, ytrain)

In [20]:
ridge = load('../model/Ridge.pkl')
ridge.fit(xtrain, ytrain)

In [21]:
lasso = load('../model/Lasso.pkl')
lasso.fit(xtrain,ytrain)

In [22]:
dt = load('../model/decision_tree_total_02.pkl')
dt.fit(xtrain, ytrain)

In [23]:
boost = load('../model/boosting_total.pkl')
boost.fit(xtrain, ytrain)

In [24]:
rf = load('../model/RandomForest.pkl')
rf.fit(xtrain, ytrain)

In [15]:
r_max = heejin.find_random_state(featureDF, targetSR,VotingRegressor(estimators = [('LI', li), ('KNN', knn), ('Ridge', ridge), ('Lasso', lasso), ('DT',dt), ('Boost', boost),('RandomForest',rf)]))

radom_state = 48
score : 0.9129782575715675


In [None]:
# estimators = [('LI', li), ('KNN', knn), ('Ridge', ridge), ('Lasso', lasso), ('DT',dt), ('Boost', boost)]
# 
# voting_models = {'models': [], 'train_score': [], 'test_score': []}
# 
# for n, model1 in enumerate(estimators[:-1]):
#     for model2 in estimators[n + 1:]:  
#         vt_models = VotingRegressor(estimators=[model1, model2])
#         vt_models.fit(xtrain, ytrain) 
#         voting_models['models'].append([model1[1], model2[1]])
#         voting_models['train_score'].append(vt_models.score(xtrain, ytrain))
#         voting_models['test_score'].append(vt_models.score(xtest, ytest)) 
# 
# # Create the DataFrame
# df = pd.DataFrame(voting_models)
# print(df)

In [None]:
# for n, model1 in enumerate(estimators[:-1]):
#     for model2 in estimators[n + 1:]:  # Iterate from the next model onwards
#         vt_models = VotingRegressor(estimators=[model1, model2])
#         vt_models.fit(xtrain, ytrain) 
#         voting_models['models'].append([model1[1], model2[1]])
#         voting_models['train_score'].append(vt_models.score(xtrain, ytrain))
#         voting_models['test_score'].append(vt_models.score(xtest, ytest)) 
# 
# df = pd.DataFrame(voting_models)
# print(df)

In [25]:
vt_models = VotingRegressor(estimators = [('LI', li), ('KNN', knn), ('Ridge', ridge), ('Lasso', lasso), ('DT',dt), ('Boost', boost),('RandomForest',rf)])

In [26]:
vt_models.fit(xtrain,ytrain)

In [27]:
vt_models.score(xtrain,ytrain),vt_models.score(xtest,ytest)

(0.9124816417318052, 0.9120334802958219)

In [28]:
vt_models.estimators_

[LinearRegression(copy_X=False, fit_intercept=False, n_jobs=100),
 KNeighborsRegressor(n_neighbors=2, p=1, weights='distance'),
 Ridge(alpha=0.9, max_iter=1000, random_state=28, solver='lsqr'),
 Lasso(alpha=1.5, max_iter=10000, positive=True, selection='random', tol=0.01),
 DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, max_features=3,
                       min_impurity_decrease=3, min_samples_leaf=3,
                       min_samples_split=4),
 AdaBoostRegressor(estimator=DecisionTreeRegressor(criterion='friedman_mse',
                                                   max_depth=7, max_features=3,
                                                   min_impurity_decrease=3,
                                                   min_samples_leaf=3,
                                                   min_samples_split=4)),
 RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_split=5,
                       n_estimators=50, random_state=18)]

In [29]:
ypre = vt_models.predict(xtest)
r2 = r2_score(ytest, ypre)
mse = mean_squared_error(ytest, ypre)
mae = mean_absolute_error(ytest, ypre)
rmse = mean_squared_error(ytest, ypre, squared=False)
print(f'''
[모델 설명도]\nR2 : {r2}\n[에러]\nMAE : {mae}\nMSE : {mse}\nRMSE : {rmse}\n--------------------------------------
''')


[모델 설명도]
R2 : 0.9120334802958219
[에러]
MAE : 71.659755748153
MSE : 8129.04214048125
RMSE : 90.16120085980027
--------------------------------------


In [30]:
import joblib
import os

In [31]:
# 모델 저장
model_dir = '../model/'
model_filename = model_dir + 'Voting.pkl'

# 저장 폴더 존재 여부 확인 후 저장
if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        
joblib.dump(vt_models,model_filename)

['../model/Voting.pkl']