In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pandas as pd
from glob import glob
import numpy as np
from sklearn.metrics import classification_report

## Load Data

In [2]:
files = glob('../../Maindata_predict/도람지/*.csv')
files = [x.replace('\\', '/') for x in files]
df_overpass = pd.read_csv(files[0], encoding='cp949')
df_tunnel = pd.read_csv(files[1], encoding='cp949')
files

['../../Maindata_predict/도람지/df_tunnel_mean.csv',
 '../../Maindata_predict/도람지/df_overpass_mean.csv',
 '../../Maindata_predict/도람지/df_overpass_knn1.csv',
 '../../Maindata_predict/도람지/df_overpass_knn3.csv',
 '../../Maindata_predict/도람지/df_overpass_knn5.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn3.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn1.csv',
 '../../Maindata_predict/도람지/df_tunnel_knn5.csv']

## Preprocessing

In [3]:
def preprocess_df(df, onehot_col=None, scaling_col=None, drop_col = None, labeling_col = None, scaling_func=MinMaxScaler) :
    """
    :param df: 사용할 데이터프레임 
    :param onehot_col: default = None, 원핫인코딩할 컬럼 리스트 
    :param scaling_col: default = None, 스케일링할 컬럼 리스트 
    :param drop_col: default = None, 사용하지 않을 컬럼 리스트 
    :param labeling_col: default = None, 라벨링이 필요한 컬럼 리스트 
    :param scaling_func: 스케일링시 사용할 컬럼, MinMaxScaler or StandardScaler
    :return: 전처리 된 데이터 프레임
    """
    if drop_col is not None :
        df = df.drop(drop_col, axis=1)
    if onehot_col is not None:
        df = pd.get_dummies(df, columns= onehot_col)
    if scaling_col is not None :
        for c in scaling_col :
            scaler = scaling_func()
            scaler.fit(df[[c]])
            df[c] = scaler.transform(df[[c]])
    if labeling_col is not None :
        for c in labeling_col :
            encoder = LabelEncoder()
            encoder.fit(df[c])
            df[c] = encoder.transform(df[c])
    return df

## Blending

In [4]:
def blending(df, file_name, target_idx, test_set, voting = False):
    
    clf = setup(data=df, target=target_idx, train_size=0.8)
    best_model = compare_models(sort = 'MSE', n_select = 1)
    print(file_name)

    target_model = create_model('lr')
    final_model = finalize_model(target_model)
    pred = predict_model(final_model, data = df)
    print(pred)
    
    print(file_name)
    
    #실제값 - 예측값 의 절댓값 구하기
    pred['result'] = np.nan
    for i in range(0,len(pred)):
        if abs(pred.loc[i,'0']-pred.loc[i,'Label']) < 1:
            pred.loc[i,'result'] = 1
        else:
            pred.loc[i,'result'] = 0
    print(pred['result'].value_counts())
    
#     print(check_metric(prediction[target_idx], prediction['Label'], metric = 'MSE'))

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# from pycaret.classification import *
from pycaret.regression import *
from pycaret.utils import check_metric

onehot_col = ['지형기호(2.3km)', '최우점식물군락']
scaling_col = ['폭(m)', '연장(m)', '산책로까지의 최단 거리(km)', '농가까지의 거리(km)', '도로 최고제한속도(km/h)',
               '하천거리(km)', '주변 동물종 개수', '등산로까지 최단거리(km)', '유도울타리_연장_m', '유도울타리_높이_m', '교통량', 
               '건물까지거리(km)', '주변동물 출현빈도']
drop_col = ['이용확인종_수']
scaling_col_tunnel = scaling_col + ['높이(m)', '개방도']





In [None]:
for file in files:
    data_set = pd.read_csv(file, encoding='cp949')
    # data_set = preprocess_df(df, scaling_col=scaling_col, scaling_func=StandardScaler)
    col_ls = list(data_set.columns)
    target_idx = col_ls.index('일평균이용빈도')

    data_set = data_set.rename(columns={name: str(idx) for idx, name in enumerate(data_set.columns)}, inplace = False)
    train_set, test_set = train_test_split(data_set, test_size=0.2)

    name = file.split('/')[-1].split('.')[0]

    blending(data_set, name, str(target_idx), test_set)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0294,0.0015,0.0381,0.9952,0.0234,0.061
1,0.0476,0.0164,0.1279,0.8095,0.0954,0.3469
2,0.0474,0.004,0.0629,0.9917,0.0386,0.0875
3,0.0555,0.0129,0.1137,0.8498,0.0646,0.0934
4,0.0552,0.0056,0.0747,0.984,0.0395,0.0887
5,0.1155,0.0466,0.216,0.5034,0.1137,0.3154
6,0.063,0.0278,0.1667,0.8049,0.1182,0.4417
7,0.0821,0.03,0.1732,0.6457,0.0994,0.108
8,0.0211,0.0007,0.026,0.9916,0.0179,0.0505
9,0.0281,0.0013,0.0364,0.9631,0.0203,0.0379


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,0.0318,0.006,0.0777,0.972,0.0542,0.091


       0         1         2         3         4         5         6    7  \
0    0.8  0.360465  0.088000  3.000000   1.20000  0.018466  0.987849  0.0   
1    0.4  1.000000  0.248000  2.893023  20.95007  0.777251  0.307112  0.0   
2    1.0  0.953488  0.152000  5.300000   3.40000  0.439247  0.987849  0.0   
3    0.6  0.244186  0.312000  3.000000   0.40000  0.235679  0.251869  0.4   
4    0.6  0.244186  0.312000  3.000000   0.40000  0.236202  0.209296  0.4   
..   ...       ...       ...       ...       ...       ...       ...  ...   
192  0.4  0.356589  0.420089  2.893023  20.95007  0.579863  0.614061  0.4   
193  0.2  0.302326  0.456000  3.300000  35.00000  0.580725  0.697157  0.4   
194  0.0  0.356589  0.420089  2.893023  20.95007  0.473805  0.000000  0.4   
195  0.0  0.356589  0.420089  2.893023  20.95007  0.473151  0.028650  0.4   
196  0.4  0.356589  0.420089  2.893023  20.95007  0.372030  0.055456  0.4   

            8  9  ...  33  34  35  36  37  38  39  40  41     Label  
0    

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
0,Numeric
1,Numeric
2,Numeric
3,Numeric
4,Numeric
5,Numeric
6,Numeric
7,Categorical
8,Numeric
9,Numeric
