## 목표 : 성분에 따른 칼로리 예측
- 데이터 : Hamberger.csv
- 피 쳐 : sodium, sugar, total_fat, protein
- 타 겟 : calories
- 학습 방법 : 지도학습 + 예측 => Linear Regressor 기반 회귀
- 학습/테스트 데이터 => 85% : 15%

(1) 모듈 로딩 및 데이터 준비 <hr>

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [34]:
# 데이터 파일
data_file = './Hamburger.csv'

In [35]:
# 데이터 준비
bergerDF =pd.read_csv(data_file)

(2) 데이터 전처리 <hr>
- 결측치, 이상치, 중복값 처리
- 데이터 분포, 컬럼 분포, 최빈값, 고유값

In [36]:
featureDF = bergerDF.iloc[:,3:]

In [37]:
targetSR = bergerDF.iloc[:,2:3]

In [38]:
from sklearn.model_selection import train_test_split

In [57]:
# 모델 인스턴스 생성 후 학습
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    random_state=10,
                                                    test_size=0.15)
LRmodel=LinearRegression()
# 모델 학습
LRmodel.fit(X_train, y_train)

train_score = LRmodel.score(X_train, y_train)
test_score = LRmodel.score(X_test, y_test)
print(f'[TRAIN_SCORE] : {train_score}\n[TEST SCORE] : {test_score}')

[TRAIN_SCORE] : 0.860759957789226
[TEST SCORE] : 0.8768822759157684


In [76]:
top_train_score = 0
top_test_score = 0
top_random_state = 0

for num in range(1,1001):
    X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=num,
                                                        test_size=0.15)
    # 모델 인스턴스 생성
    LRmodel=LinearRegression()
    # 튜닝 -> 파라미터 값 조절
    # 모델 학습
    LRmodel.fit(X_train, y_train)
    
    train_score = LRmodel.score(X_train, y_train)
    test_score = LRmodel.score(X_test, y_test)
    
    if (test_score >= top_test_score):# and (train_score >= top_train_score):
        top_train_score = train_score
        top_test_score = test_score
        top_random_state = num
        
print(f'[TOP_RANDOM_STATE_NUM] : {top_random_state} \n[TOP_TRAIN_SCORE] : {top_train_score} \n[TOP_TEST_SCORE] : {top_test_score}')

[TOP_RANDOM_STATE_NUM] : 287 
[TOP_TRAIN_SCORE] : 0.8475865785527302 
[TOP_TEST_SCORE] : 0.928787126167401


In [40]:
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=287,
                                                        test_size=0.15)
# 모델 인스턴스 생성
LRmodel=LinearRegression()
# 튜닝 -> 파라미터 값 조절
# 모델 학습
LRmodel.fit(X_train, y_train)

train_score = LRmodel.score(X_train, y_train)
test_score = LRmodel.score(X_test, y_test)
print(f'[TRAIN_SCORE] : {train_score} \n[TEST_SCORE] : {test_score}')

[TRAIN_SCORE] : 0.8475865785527302 
[TEST_SCORE] : 0.928787126167401


In [41]:
top_train_score = 0
top_test_score = 0
top_random_state = 0
for num in range(1,1001):
    X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=num,
                                                        test_size=0.15)
    # 모델 인스턴스 생성
    LRmodel=LinearRegression(fit_intercept=False)
    # 튜닝 -> 파라미터 값 조절
    # 모델 학습
    LRmodel.fit(X_train, y_train)
    
    train_score = LRmodel.score(X_train, y_train)
    test_score = LRmodel.score(X_test, y_test)
    if (test_score >= top_test_score):# and (train_score >= top_train_score):
        top_train_score = train_score
        top_test_score = test_score
        top_random_state = num
        
print(f'[TOP_RANDOM_STATE_NUM] : {top_random_state} \n[TOP_TRAIN_SCORE] : {top_train_score} \n[TOP_TEST_SCORE] : {top_test_score}')

[TOP_RANDOM_STATE_NUM] : 287 
[TOP_TRAIN_SCORE] : 0.8465368865209739 
[TOP_TEST_SCORE] : 0.9293046643036056


In [42]:
# fit_intercept=False일 때 test 점수 up, train 점수 down

In [43]:
top_train_score = 0
top_test_score = 0
top_random_state = 0
for num in range(1,1001):
    X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=num,
                                                        test_size=0.15)
    # 모델 인스턴스 생성
    LRmodel=LinearRegression(fit_intercept=False, copy_X=False)
    # 튜닝 -> 파라미터 값 조절
    # 모델 학습
    LRmodel.fit(X_train, y_train)
    
    train_score = LRmodel.score(X_train, y_train)
    test_score = LRmodel.score(X_test, y_test)
    if (test_score >= top_test_score):# and (train_score >= top_train_score):
        top_train_score = train_score
        top_test_score = test_score
        top_random_state = num
        
print(f'[TOP_RANDOM_STATE_NUM] : {top_random_state} \n[TOP_TRAIN_SCORE] : {top_train_score} \n[TOP_TEST_SCORE] : {top_test_score}')

[TOP_RANDOM_STATE_NUM] : 287 
[TOP_TRAIN_SCORE] : 0.8465368865209739 
[TOP_TEST_SCORE] : 0.9293046643036056


In [44]:
# copy_X는 False로 바꿔도 똑같음

In [45]:
top_train_score = 0
top_test_score = 0
top_random_state = 0
for num in range(1,1001):
    X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=num,
                                                        test_size=0.15)
    # 모델 인스턴스 생성
    LRmodel=LinearRegression(fit_intercept=False, copy_X=False, n_jobs=100)
    # 튜닝 -> 파라미터 값 조절
    # 모델 학습
    LRmodel.fit(X_train, y_train)
    
    train_score = LRmodel.score(X_train, y_train)
    test_score = LRmodel.score(X_test, y_test)
    if (test_score >= top_test_score):# and (train_score >= top_train_score):
        top_train_score = train_score
        top_test_score = test_score
        top_random_state = num
        
print(f'[TOP_RANDOM_STATE_NUM] : {top_random_state} \n[TOP_TRAIN_SCORE] : {top_train_score} \n[TOP_TEST_SCORE] : {top_test_score}')

[TOP_RANDOM_STATE_NUM] : 287 
[TOP_TRAIN_SCORE] : 0.8465368865209739 
[TOP_TEST_SCORE] : 0.9293046643036056


In [46]:
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                        targetSR,
                                                        random_state=287,
                                                        test_size=0.15)
# 모델 인스턴스 생성
LRmodel=LinearRegression(fit_intercept=False, copy_X=False, n_jobs=100)
# 튜닝 -> 파라미터 값 조절
# 모델 학습
LRmodel.fit(X_train, y_train)

train_score = LRmodel.score(X_train, y_train)
test_score = LRmodel.score(X_test, y_test)
print(f'[TRAIN_SCORE] : {train_score} \n[TEST_SCORE] : {test_score}')

[TRAIN_SCORE] : 0.8465368865209739 
[TEST_SCORE] : 0.9293046643036056


In [47]:
# n_jobs도 변화 없음

In [48]:
# 예측

In [49]:
y_pre = LRmodel.predict(X_test)

In [50]:
y_pre

array([[ 406.38485369],
       [ 241.25397717],
       [ 127.95826192],
       [ 645.1724444 ],
       [ 857.13405374],
       [ 481.80905667],
       [ 387.27483578],
       [ 310.87387291],
       [ 241.53515392],
       [ 957.02019165],
       [ 583.82846681],
       [ 427.10036443],
       [ 445.89598491],
       [ 250.35166375],
       [ 661.25669684],
       [ 523.70493984],
       [ 880.22958027],
       [ 540.38803012],
       [  68.99247115],
       [  44.88654429],
       [ 166.96510255],
       [ 703.89857538],
       [ 489.91967367],
       [ 281.49084361],
       [ 441.59484014],
       [ 137.63442911],
       [ 638.18749842],
       [ 161.74700159],
       [  12.63347255],
       [ 521.94497022],
       [ 183.33603642],
       [ 380.52004675],
       [ 240.59424792],
       [ 835.79821766],
       [ 349.77508444],
       [ 643.98181953],
       [ 202.26430262],
       [ 441.63379792],
       [ 978.55252971],
       [  73.08767583],
       [ 185.68644992],
       [ 582.228

In [51]:
# 성능 평가

In [52]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [53]:
# 성능평가 => 결정계수값(r2) -> 1에 가까울수록 좋음
r2_score(y_pre, y_test)

0.9228634738263799

In [54]:
# 평균절대오차 => 오차가 적을수록 좋음
mean_absolute_error(y_pre, y_test)

73.081755354463

In [55]:
# 평균제곱근오차 => 오차가 적을수록 좋음
mean_squared_error(y_pre, y_test, squared=False)

102.47857196463563

In [56]:
def save_model(model,filename):
    import joblib
    import os
    model_dir='./model/'
    model_filename=model_dir+f'{filename}.pkl'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    joblib.dump(model, model_filename)
save_model(LRmodel,"LR.pkl")