In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *



## Load Files

In [2]:
file_path = "/Users/hyunsubong/Library/CloudStorage/OneDrive-명지대학교/Univ/2022-1/테청캠/Code/MainData_전처리_결과_데이터_v2"
folder_list = [f for f in os.listdir(file_path) if not f.startswith('.')]

data_list = []
for i in folder_list:
    data = pd.read_csv(file_path + '/' + i, encoding="cp949")
    data_list.append(data)

for idx, file in enumerate(folder_list):
    file = file.rstrip(".csv")
    folder_list[idx] = file
    
print("총 {}개의 파일을 불러왔습니다.\n".format(len(folder_list)))

총 24개의 파일을 불러왔습니다.



## 전처리

In [3]:
def preprocess_df(df, onehot_col=None, scaling_col=None, drop_col = None, labeling_col = None, scaling_func=MinMaxScaler) :
    """
    :param df: 사용할 데이터프레임 
    :param onehot_col: default = None, 원핫인코딩할 컬럼 리스트 
    :param scaling_col: default = None, 스케일링할 컬럼 리스트 
    :param drop_col: default = None, 사용하지 않을 컬럼 리스트 
    :param labeling_col: default = None, 라벨링이 필요한 컬럼 리스트 
    :param scaling_func: 스케일링시 사용할 컬럼, MinMaxScaler or StandardScaler
    :return: 전처리 된 데이터 프레임
    """
    if drop_col is not None :
        df = df.drop(drop_col, axis=1)
    if onehot_col is not None:
        df = pd.get_dummies(df, columns= onehot_col)
    if scaling_col is not None :
        for c in scaling_col :
            scaler = scaling_func()
            scaler.fit(df[[c]])
            df[c] = scaler.transform(df[[c]])
    if labeling_col is not None :
        for c in labeling_col :
            encoder = LabelEncoder()
            encoder.fit(df[c])
            df[c] = encoder.transform(df[c])
    return df

In [4]:
onehot_col = ['지형기호(2.3km)', '최우점식물군락']
scaling_col = ['폭(m)', '연장(m)', '주변 로드킬 빈도', '산책로까지의 최단 거리(km)', '농가까지의 거리(km)', '도로 최고제한속도(km/h)',
               '하천거리(km)', '주변 동물종 개수', '등산로까지 최단거리(km)', '유도울타리_연장_m', '유도울타리_높이_m', '교통량', 
               '건물까지거리(km)', '주변동물 출현빈도']
drop_col = ['번호', '위도', '경도', '생태통로_유형', '이용확인종_수', '유도울타리_위도_2.3km', '유도울타리_경도_2.3km', '식물군락명']
scaling_col_tunnel = scaling_col + ['높이(m)', '개방도']

## 모델 비교 (특정)

In [13]:
while(True):
    print('데이터셋 목록 :\n', '\n'.join([str(elem) for elem in folder_list]))
    target_name = input("찾고자하는 데이터의 이름을 입력하십시오. 종료는 quit를 입력하십시오.")
    if target_name == "quit":
        break
    target_idx = folder_list.index(target_name)

    train_set = preprocess_df(data_list[13], drop_col=drop_col, labeling_col=onehot_col, scaling_col=scaling_col, scaling_func=StandardScaler)
    train_set = train_set.rename(columns={name: str(idx) for idx, name in enumerate(train_set.columns)}, inplace = False)
    setup_rgs = setup(data = train_set, target = '2',session_id = 10)
    best_model_top5 = compare_models(sort='MSE', n_select=5)
    print("분석 데이터 => " + folder_list[target_idx] + "\n")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.6972,0.8746,0.9182,-0.0037,0.4677,1.4264,0.002
llar,Lasso Least Angle Regression,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
dummy,Dummy Regressor,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
lasso,Lasso Regression,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
en,Elastic Net,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
rf,Random Forest Regressor,0.6953,0.9161,0.9414,-0.1022,0.3942,1.7062,0.02
knn,K Neighbors Regressor,0.7125,0.9598,0.9565,-0.102,0.4047,1.8157,0.002
ada,AdaBoost Regressor,0.764,1.0079,0.9809,-0.1521,0.4238,1.9568,0.008
omp,Orthogonal Matching Pursuit,0.7458,1.0141,0.9792,-0.1617,0.4106,2.1682,0.002
et,Extra Trees Regressor,0.7286,1.0307,0.9997,-0.2499,0.4182,2.1773,0.018


분석 데이터 => tunnel_knn3_median

데이터셋 목록 :
 overpass_knn2_median
overpass_mean_drop
overpass_knn1_drop
tunnel_knn1_drop
tunnel_mean_drop
tunnel_knn2_median
tunnel_knn5_median
overpass_knn5_median
overpass_mean_median
tunnel_knn3_median
tunnel_knn2_drop
tunnel_knn3_drop
tunnel_mean_median
overpass_knn3_median
overpass_knn2_drop
overpass_knn3_drop
overpass_knn5_drop
overpass_knn4_drop
overpass_knn4_median
tunnel_knn1_median
tunnel_knn4_median
overpass_knn1_median
tunnel_knn5_drop
tunnel_knn4_drop
찾고자하는 데이터의 이름을 입력하십시오. 종료는 quit를 입력하십시오.quit


## 모델 비교 (전체)

In [12]:
"""
compare_models()
fold: cross_validation의 fold를 지정 (default = 10)
sort: 정렬기준 지표 설정
n_select: 상위 n개의 모델 결과만 출력
"""

for idx, data in enumerate(data_list, start=0):
    train_set = preprocess_df(data, drop_col=drop_col, labeling_col=onehot_col, scaling_col=scaling_col, scaling_func=StandardScaler)
    train_set = train_set.rename(columns={name: str(idx) for idx, name in enumerate(train_set.columns)}, inplace = False)
    setup_rgs = setup(data = train_set, target = '2',session_id = 10)
    best_model_top5 = compare_models(sort='MSE', n_select=5)
    print("분석 데이터 => " + folder_list[idx] + "\n")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.6972,0.8746,0.9182,-0.0037,0.4677,1.4264,0.002
llar,Lasso Least Angle Regression,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
dummy,Dummy Regressor,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
lasso,Lasso Regression,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
en,Elastic Net,0.7058,0.9124,0.9391,-0.0491,0.5497,1.026,0.002
rf,Random Forest Regressor,0.6953,0.9161,0.9414,-0.1022,0.3942,1.7062,0.02
knn,K Neighbors Regressor,0.7125,0.9598,0.9565,-0.102,0.4047,1.8157,0.003
ada,AdaBoost Regressor,0.764,1.0079,0.9809,-0.1521,0.4238,1.9568,0.007
omp,Orthogonal Matching Pursuit,0.7458,1.0141,0.9792,-0.1617,0.4106,2.1682,0.002
et,Extra Trees Regressor,0.7286,1.0307,0.9997,-0.2499,0.4182,2.1773,0.018


분석 데이터 => tunnel_knn3_median



IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
0,Numeric
1,Categorical
3,Numeric
4,Categorical
5,Numeric
6,Numeric
7,Numeric
8,Numeric
9,Numeric
10,Numeric


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

KeyboardInterrupt: Interrupted by user

ersion 0.22 and will be removed in 0.24.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


## Model

In [8]:
from sklearn.model_selection import train_test_split

data_set = preprocess_df(data_list[11], drop_col=drop_col, labeling_col=onehot_col, scaling_col=scaling_col, scaling_func=StandardScaler)
# data_set.rename(columns={name: str(idx) for idx, name in enumerate(data_set.columns)}, inplace = True)
train_set, test_set = train_test_split(data_set, test_size=0.2)

# setup_rgs = setup(data = train_set, target = '2', session_id = 10)
setup_rgs = setup(data = train_set, target = '생태통로_효율성', session_id = 10)

lasso = create_model('lasso')
lasso_least_angle = create_model('llar')
dummy = create_model('dummy')
elastic = create_model('en')
bayesian = create_model('br')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8714,0.9211,0.9597,-0.0746,0.5044,0.5247
1,0.4558,0.4114,0.6414,-0.008,0.3893,0.1598
2,0.9033,0.9267,0.9627,-0.195,0.4822,0.4824
3,0.9576,1.087,1.0426,-0.4016,0.5301,0.4878
4,0.8372,0.7915,0.8897,-0.4917,0.5643,0.4207
5,0.5921,0.5617,0.7495,-0.1468,0.4723,0.1802
6,0.9893,0.9832,0.9916,-0.0037,0.5748,0.5331
7,0.9122,0.9731,0.9865,-0.6682,0.6325,0.2757
8,0.9954,0.9929,0.9964,-0.117,0.6075,0.5178
9,0.9189,0.9785,0.9892,-0.6774,0.4629,0.4779


## Tunning

In [9]:
# tuned_lasso = tune_model(lasso)
# tuned_lasso_least_angle = tune_model(lasso_least_angle)
# tuned_dummy = tune_model(dummy)
# tuned_elastic = tune_model(elastic)
# tuned_bayesian = tune_model(bayesian)

## Blending

In [10]:
# tuned = [tuned_lasso,
#          tuned_lasso_least_angle,
#          tuned_dummy,
#          tuned_elastic,
#          tuned_bayesian
#         ]
tuned = [lasso,
         lasso_least_angle,
         dummy,
         elastic,
         bayesian
        ]
blender_model = blend_models(tuned)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8695,0.8675,0.9314,-0.0121,0.5129,0.4466
1,0.4569,0.4135,0.643,-0.013,0.3902,0.16
2,0.9064,0.9347,0.9668,-0.2053,0.4873,0.4725
3,0.9143,0.9626,0.9811,-0.2413,0.4954,0.4694
4,0.7198,0.6956,0.834,-0.311,0.5395,0.2041
5,0.5762,0.5479,0.7402,-0.1186,0.4678,0.1622
6,0.9902,0.985,0.9924,-0.0055,0.5753,0.5333
7,0.8404,0.8283,0.9101,-0.42,0.5889,0.2748
8,0.986,0.974,0.9869,-0.0958,0.6011,0.5232
9,0.9204,0.981,0.9905,-0.6818,0.4638,0.4785


In [11]:
"""
finalize_model(): 최종 모델로 설정 후 마지막 학습 진행
predict_model(): 예측 결과를 'Label' 변수에 저장
"""
final_model = finalize_model(blender_model)
prediction = predict_model(final_model, data = test_set)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.7875,0.7472,0.8644,0.0584,0.4933,0.3788


In [12]:
prediction

Unnamed: 0,차선_수,등산객_이용빈도,생태통로_효율성,폭(m),연장(m),높이(m),개방도,경사도,주변 로드킬 빈도,산책로까지의 최단 거리(km),...,지형기호(2.3km),하천거리(km),등산로까지 최단거리(km),유도울타리_연장_m,유도울타리_높이_m,교통량,환경영향평가점수,건물까지거리(km),최우점식물군락,Label
61,0.0,1.0,1,0.049547,-0.019034,3.1,0.9,5,0.0,-0.023675,...,0,0.785413,-0.008601,-0.676812,-0.769591,-0.550482,3,-0.436623,10,0.885386
45,0.0,1.0,1,2.385014,-2.193474,5.0,2.5,6,0.0,-1.672001,...,0,0.218028,-1.195433,-0.676812,-0.769591,-0.801866,1,0.184356,3,0.96606
39,0.0,1.0,2,0.049547,-0.019034,3.1,0.9,6,0.0,0.289059,...,0,1.670121,-0.864293,-0.676812,-0.769591,0.167734,1,2.273185,13,0.942714
63,1.0,2.0,2,0.049547,-0.019034,3.1,0.9,5,0.0,-1.301622,...,6,-0.699187,1.55085,1.290033,1.415801,-0.662349,2,0.754188,11,0.860866
111,0.0,0.0,2,0.049547,-0.019034,3.1,0.9,4,0.0,0.85009,...,3,1.571622,-1.091511,-0.676812,-0.769591,-0.740914,1,2.273185,17,0.983151
96,0.0,0.0,2,0.049547,-0.019034,3.1,0.9,4,0.0,1.559157,...,0,1.244646,-1.04281,-0.676812,-0.769591,-0.881029,1,0.138992,13,0.927409
52,1.0,0.0,0,0.049547,-0.019034,3.1,0.9,5,0.0,-0.604943,...,0,-0.958908,-0.172133,1.433475,1.355095,-0.081603,1,0.931855,11,0.991449
104,0.0,2.5,0,0.049547,-0.019034,3.1,0.9,1,0.0,-1.373584,...,0,-0.994166,0.562003,-0.676812,-0.769591,-0.321281,2,-0.471238,3,0.806378
82,1.0,0.0,0,0.049547,-0.019034,3.1,0.9,3,0.0,-0.431851,...,0,-0.446743,-0.224261,1.815174,1.25392,-0.130085,2,0.252785,6,0.991193
38,1.0,1.0,0,0.049547,-0.019034,3.1,0.9,3,0.0,0.698322,...,0,0.18868,-0.036943,-0.550389,1.658622,2.193253,2,0.281178,6,0.809346


In [13]:
test_set

Unnamed: 0,차선_수,등산객_이용빈도,생태통로_효율성,폭(m),연장(m),높이(m),개방도,경사도,주변 로드킬 빈도,산책로까지의 최단 거리(km),...,주변동물 출현빈도,지형기호(2.3km),하천거리(km),등산로까지 최단거리(km),유도울타리_연장_m,유도울타리_높이_m,교통량,환경영향평가점수,건물까지거리(km),최우점식물군락
61,0.0,1.0,1,0.049547,-0.019034,3.1,0.9,5,0.0,-0.023675,...,-0.758234,0,0.785413,-0.008601,-0.676812,-0.769591,-0.550482,3,-0.436623,10
45,0.0,1.0,1,2.385014,-2.193474,5.0,2.5,6,0.0,-1.672001,...,0.829827,0,0.218028,-1.195433,-0.676812,-0.769591,-0.801866,1,0.184356,3
39,0.0,1.0,2,0.049547,-0.019034,3.1,0.9,6,0.0,0.289059,...,-0.758234,0,1.670121,-0.864293,-0.676812,-0.769591,0.167734,1,2.273185,13
63,1.0,2.0,2,0.049547,-0.019034,3.1,0.9,5,0.0,-1.301622,...,0.234304,6,-0.699187,1.55085,1.290033,1.415801,-0.662349,2,0.754188,11
111,0.0,0.0,2,0.049547,-0.019034,3.1,0.9,4,0.0,0.85009,...,0.631319,3,1.571622,-1.091511,-0.676812,-0.769591,-0.740914,1,2.273185,17
96,0.0,0.0,2,0.049547,-0.019034,3.1,0.9,4,0.0,1.559157,...,-0.559726,0,1.244646,-1.04281,-0.676812,-0.769591,-0.881029,1,0.138992,13
52,1.0,0.0,0,0.049547,-0.019034,3.1,0.9,5,0.0,-0.604943,...,-0.758234,0,-0.958908,-0.172133,1.433475,1.355095,-0.081603,1,0.931855,11
104,0.0,2.5,0,0.049547,-0.019034,3.1,0.9,1,0.0,-1.373584,...,-0.758234,0,-0.994166,0.562003,-0.676812,-0.769591,-0.321281,2,-0.471238,3
82,1.0,0.0,0,0.049547,-0.019034,3.1,0.9,3,0.0,-0.431851,...,-0.758234,0,-0.446743,-0.224261,1.815174,1.25392,-0.130085,2,0.252785,6
38,1.0,1.0,0,0.049547,-0.019034,3.1,0.9,3,0.0,0.698322,...,-0.361219,0,0.18868,-0.036943,-0.550389,1.658622,2.193253,2,0.281178,6
