In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pandas as pd
from glob import glob
import numpy as np
from sklearn.metrics import classification_report

## Load Data

In [2]:
files = glob('../../Maindata_predict/도람지/*.csv')
files = [x.replace('\\', '/') for x in files]
df_overpass = pd.read_csv(files[0], encoding='cp949')
df_tunnel = pd.read_csv(files[1], encoding='cp949')
files

['../../Maindata_predict/도람지/df_tunnel_mean.csv',
 '../../Maindata_predict/도람지/df_overpass_mean.csv',
 '../../Maindata_predict/도람지/df_overpass_knn1.csv',
 '../../Maindata_predict/도람지/df_overpass_knn3.csv',
 '../../Maindata_predict/도람지/df_overpass_knn5.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn3.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn1.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn5.csv']

## Preprocessing

In [3]:
def preprocess_df(df, onehot_col=None, scaling_col=None, drop_col = None, labeling_col = None, scaling_func=MinMaxScaler) :
    """
    :param df: 사용할 데이터프레임 
    :param onehot_col: default = None, 원핫인코딩할 컬럼 리스트 
    :param scaling_col: default = None, 스케일링할 컬럼 리스트 
    :param drop_col: default = None, 사용하지 않을 컬럼 리스트 
    :param labeling_col: default = None, 라벨링이 필요한 컬럼 리스트 
    :param scaling_func: 스케일링시 사용할 컬럼, MinMaxScaler or StandardScaler
    :return: 전처리 된 데이터 프레임
    """
    if drop_col is not None :
        df = df.drop(drop_col, axis=1)
    if onehot_col is not None:
        df = pd.get_dummies(df, columns= onehot_col)
    if scaling_col is not None :
        for c in scaling_col :
            scaler = scaling_func()
            scaler.fit(df[[c]])
            df[c] = scaler.transform(df[[c]])
    if labeling_col is not None :
        for c in labeling_col :
            encoder = LabelEncoder()
            encoder.fit(df[c])
            df[c] = encoder.transform(df[c])
    return df

## Blending

In [4]:
def blending(df, file_name, target_idx, test_set, voting = False):
    
    clf = setup(data=df, target=target_idx, train_size=0.8)
    print(file_name)
    
    best_model_5 = compare_models(sort = 'Accuracy', n_select = 5)

    # Blending
    blended_model = blend_models(estimator_list = best_model_5, fold = 10)
    
#     plot_model(blended_model, plot='learning')
    final_model = finalize_model(blended_model)
    prediction = predict_model(final_model, data = test_set)
    
    print(file_name)
    print(check_metric(prediction[target_idx], prediction['Label'], metric = 'Accuracy'))

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from pycaret.classification import *
from pycaret.utils import check_metric

onehot_col = ['지형기호(2.3km)', '최우점식물군락']
scaling_col = ['폭(m)', '연장(m)', '산책로까지의 최단 거리(km)', '농가까지의 거리(km)', '도로 최고제한속도(km/h)',
               '하천거리(km)', '주변 동물종 개수', '등산로까지 최단거리(km)', '유도울타리_연장_m', '유도울타리_높이_m', '교통량', 
               '건물까지거리(km)', '주변동물 출현빈도']
drop_col = ['이용확인종_수']
scaling_col_tunnel = scaling_col + ['높이(m)', '개방도']





In [None]:
for file in files:
    data_set = pd.read_csv(file, encoding='cp949')
    data_set = preprocess_df(df, scaling_col=scaling_col, scaling_func=StandardScaler)
    col_ls = list(data_set.columns)
    target_idx = col_ls.index('일평균이용빈도')

    data_set = data_set.rename(columns={name: str(idx) for idx, name in enumerate(data_set.columns)}, inplace = False)
    train_set, test_set = train_test_split(data_set, test_size=0.2)

    name = file.split('/')[-1].split('.')[0]

    blending(train_set, name, str(target_idx), test_set)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,0.0696,0.0124,0.1051,0.8834,0.0686,0.1814,0.002
br,Bayesian Ridge,0.0499,0.0124,0.0958,0.8717,0.0542,0.1537,0.002
lr,Linear Regression,0.0495,0.0127,0.0967,0.8683,0.0542,0.1544,0.14
gbr,Gradient Boosting Regressor,0.0545,0.0148,0.116,0.8943,0.0718,0.1591,0.008
et,Extra Trees Regressor,0.0573,0.0178,0.127,0.8603,0.0745,0.1681,0.015
ridge,Ridge Regression,0.0749,0.0193,0.1297,0.8546,0.0632,0.1826,0.002
xgboost,Extreme Gradient Boosting,0.059,0.0195,0.1252,0.8739,0.0765,0.1614,0.011
rf,Random Forest Regressor,0.0691,0.0204,0.1305,0.8687,0.0761,0.1792,0.019
dt,Decision Tree Regressor,0.0865,0.034,0.1778,0.685,0.1071,0.2147,0.002
ada,AdaBoost Regressor,0.1559,0.0434,0.2044,0.6778,0.1159,0.3985,0.005


df_tunnel_mean


IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
0,Numeric
1,Numeric
2,Numeric
3,Numeric
4,Numeric
5,Numeric
6,Numeric
7,Categorical
8,Numeric
9,Numeric


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("