In [1]:
from sqlalchemy import create_engine
import pymysql
import pandas as pd
from haversine import haversine
from DB import *

### 유도울타리현황_2021

In [29]:
file_name = "../data/variables/유도울타리현황_2021.csv"
df_etc_fence = pd.read_csv(file_name, sep=",", header=0, encoding="cp949")
df_etc_fence

Unnamed: 0,본부,지사,노선,설치시점,설치종점,방향,연장,높이,설치년도
0,수도권본부,경기광주,제2중부선,353,353,부산,30,2,2011
1,수도권본부,경기광주,제2중부선,353,353,부산,150,2,2011
2,수도권본부,경기광주,제2중부선,353,354,부산,130,2,2011
3,수도권본부,경기광주,제2중부선,354,354,부산,90,2,2011
4,수도권본부,경기광주,제2중부선,354,354,부산,70,2,2011
...,...,...,...,...,...,...,...,...,...
15419,충북본부,충주,중부내륙선,208,208,서울,175,1,2020
15420,충북본부,충주,중부내륙선,208,208,서울,10,2,2020
15421,충북본부,충주,중부내륙선,209,209,서울,135,2,2020
15422,충북본부,충주,중부내륙선,209,209,서울,20,1,2020


In [30]:
# 노선과 설치시점을 기준으로 groupby하되, 설치종점은 최대값, 연장은 합계값, 높이는 평균값으로 설정
df_etc_fence_preprocessing = df_etc_fence.groupby(['노선', '설치시점']).agg({'설치종점':'max', '연장':'sum', '높이':'mean'})
# 인덱싱 재설정
df_etc_fence_preprocessing.reset_index(inplace=True)
# 시종점길이는 설치종점에서 설치시점을 뺀 값으로 설정
df_etc_fence_preprocessing['시종점길이'] = (df_etc_fence_preprocessing['설치종점'] - df_etc_fence_preprocessing['설치시점'])
# 설치시점의 경우 type을 float로 설정
df_etc_fence_preprocessing['설치시점'] = df_etc_fence_preprocessing['설치시점'].astype('float')
df_etc_fence_preprocessing

Unnamed: 0,노선,설치시점,설치종점,연장,높이,시종점길이
0,경부선,3.0,3,270,2.000000,0
1,경부선,4.0,5,470,2.000000,1
2,경부선,5.0,6,492,2.000000,1
3,경부선,6.0,7,1506,1.636364,1
4,경부선,7.0,8,738,1.666667,1
...,...,...,...,...,...,...
2874,호남선의 지선,48.0,49,720,2.000000,1
2875,호남선의 지선,49.0,49,360,2.000000,0
2876,호남선의 지선,51.0,52,120,1.000000,1
2877,호남선의 지선,52.0,53,1095,1.800000,1


### 한국도로공사_도로중심선

In [31]:
file_name = "../data/variables/한국도로공사_도로중심선.csv"
df_etc_road = pd.read_csv(file_name, sep=",", header=0, encoding="cp949")
# index 계산을 위해 이정 데이터를 문자열로 변경
df_etc_road['이정'] = df_etc_road['이정'].astype('string')
df_etc_road.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48241 entries, 0 to 48240
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   노선번호       48241 non-null  int64  
 1   도로명        48241 non-null  object 
 2   이정         48241 non-null  string 
 3   X좌표값       48241 non-null  float64
 4   Y좌표값       48241 non-null  float64
 5   GRS80X좌표값  48241 non-null  float64
 6   GRS80Y좌표값  48241 non-null  float64
dtypes: float64(4), int64(1), object(1), string(1)
memory usage: 2.6+ MB


In [32]:
# 이정의 마지막 자리가 0이 아닌 것만을 추출 
# -> 유도울타리의 설치시점과 종점이 자세하지 않기 때문에 도로중심선의 이정과 맞춰주기 위함
target_idx = []
for i in range(len(df_etc_road)):
    if df_etc_road.loc[i, '이정'][-1] != '0':
        target_idx.append(i)
        
df_etc_road.drop(target_idx, axis=0, inplace=True)

In [33]:
df_etc_road.reset_index(inplace=True, drop=True)
# 이정 데이터의 type을 다시 float로 변경
df_etc_road['이정'] = df_etc_road['이정'].astype('float')
df_etc_road

Unnamed: 0,노선번호,도로명,이정,X좌표값,Y좌표값,GRS80X좌표값,GRS80Y좌표값
0,10,경부선,0.0,35.246578,129.093207,390524.290167,296459.942628
1,10,경부선,1.0,35.253629,129.100005,391126.574702,297255.473039
2,10,경부선,2.0,35.261977,129.104050,391475.199578,298189.576852
3,10,경부선,3.0,35.270386,129.107884,391804.314426,299130.211426
4,10,경부선,4.0,35.278963,129.106610,391668.093399,300079.508662
...,...,...,...,...,...,...,...
4837,6000,부산외곽순환선,44.0,35.280043,129.165551,397029.027406,300314.946152
4838,6000,부산외곽순환선,45.0,35.278286,129.176331,398014.261102,300141.393525
4839,6000,부산외곽순환선,46.0,35.279118,129.187196,399000.981241,300255.474773
4840,6000,부산외곽순환선,47.0,35.280547,129.198049,399985.062982,300435.904279


### 도로명, 기점 거리를 기준으로 Join

In [34]:
li_latitude = [None] * len(df_etc_fence_preprocessing)
li_longitude = [None] * len(df_etc_fence_preprocessing)
li_idx = []
for idx in range(len(df_etc_fence_preprocessing)):
    li_idx.append(idx)
    for i in range(len(df_etc_road)):
        # 도로명과 노선명이 일치하는지 확인
        if df_etc_road.iloc[i,1] == df_etc_fence_preprocessing.iloc[idx,0]:
            # 이정과 설치시점이 같은지 확인
            if df_etc_road.iloc[i,2] == df_etc_fence_preprocessing.iloc[idx,1]:
                # 좌표값 저장
                li_latitude[idx] = df_etc_road.iloc[i,3]
                li_longitude[idx] = df_etc_road.iloc[i,4]
                break
# 제대로 진행되었는지 확인
print(len(li_latitude))
print(len(li_longitude))

2879
2879


In [35]:
# 진행 확인
if len(li_latitude) == len(li_longitude):
    df_etc_fence_preprocessing['위도'] = li_latitude
    df_etc_fence_preprocessing['경도'] = li_longitude
    print(True)
else:
    print(False)
df_etc_fence_preprocessing

True


Unnamed: 0,노선,설치시점,설치종점,연장,높이,시종점길이,위도,경도
0,경부선,3.0,3,270,2.000000,0,35.270386,129.107884
1,경부선,4.0,5,470,2.000000,1,35.278963,129.106610
2,경부선,5.0,6,492,2.000000,1,35.286647,129.101121
3,경부선,6.0,7,1506,1.636364,1,35.291514,129.091887
4,경부선,7.0,8,738,1.666667,1,35.297901,129.084194
...,...,...,...,...,...,...,...,...
2874,호남선의 지선,48.0,49,720,2.000000,1,36.400681,127.359373
2875,호남선의 지선,49.0,49,360,2.000000,0,36.406366,127.367995
2876,호남선의 지선,51.0,52,120,1.000000,1,36.412060,127.388434
2877,호남선의 지선,52.0,53,1095,1.800000,1,36.409214,127.399012


In [36]:
df_etc_fence_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   노선      2879 non-null   object 
 1   설치시점    2879 non-null   float64
 2   설치종점    2879 non-null   int64  
 3   연장      2879 non-null   int64  
 4   높이      2879 non-null   float64
 5   시종점길이   2879 non-null   int64  
 6   위도      2854 non-null   float64
 7   경도      2854 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 180.1+ KB


### 결측치 처리

In [37]:
df_etc_fence_preprocessing[df_etc_fence_preprocessing['위도'].isnull()]

Unnamed: 0,노선,설치시점,설치종점,연장,높이,시종점길이,위도,경도
404,광주대구선,178.0,179,640,2.0,1,,
405,광주대구선,181.0,181,270,2.0,0,,
628,남해제2지선,146.0,147,1436,2.0,1,,
629,남해제2지선,147.0,147,276,2.0,0,,
851,당진대전선,92.0,92,960,1.0,0,,
994,동해선,123.0,123,948,1.277778,0,,
996,동해선,125.0,125,855,1.25,0,,
997,동해선,151.0,151,794,1.555556,0,,
1005,밀양울산선,99.0,100,185,1.0,1,,
1032,밀양울산선,145.0,146,2381,1.482759,1,,


In [38]:
# 필요한 위도, 경도, 노선, 연장, 높이 데이터만 추출
df_etc = df_etc_fence_preprocessing[['위도','경도','노선','연장','높이']]
df_etc

Unnamed: 0,위도,경도,노선,연장,높이
0,35.270386,129.107884,경부선,270,2.000000
1,35.278963,129.106610,경부선,470,2.000000
2,35.286647,129.101121,경부선,492,2.000000
3,35.291514,129.091887,경부선,1506,1.636364
4,35.297901,129.084194,경부선,738,1.666667
...,...,...,...,...,...
2874,36.400681,127.359373,호남선의 지선,720,2.000000
2875,36.406366,127.367995,호남선의 지선,360,2.000000
2876,36.412060,127.388434,호남선의 지선,120,1.000000
2877,36.409214,127.399012,호남선의 지선,1095,1.800000


### 좌표간 거리 계산

In [39]:
# RDS에서 생태통로 데이터 불러오기
db_connection = connect_db()
df_maindata_inner_origin = load_data_from_rds('생태통로', db_connection) 
df_maindata_inner_origin

Unnamed: 0,번호,위도,경도,폭,연장,높이,개방도,차선_수,생태통로타입,등산객이용빈도,생태통로효율성
0,1495,35.36019,128.06375,5.00,45.0,,,4,0,1,3
1,1277,37.03408,128.40661,,,,,2,1,0,3
2,1278,36.89832,128.30563,,,,,2,1,0,1
3,1279,36.89687,128.30581,,,,,2,1,0,1
4,1090,36.71051,127.42222,8.23,38.0,,,6,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...
492,1089,36.71278,127.42056,13.30,50.7,,,6,0,0,0
493,1087,36.71722,127.41778,13.60,38.7,,,4,0,5,0
494,1088,36.71222,127.42528,13.30,50.7,,,4,0,5,0
495,1091,36.76750,127.47333,7.00,20.0,,,2,0,0,0


In [40]:
# 각각의 데이터에 대해 리스트로 변환

df_etc_latitude = df_etc["위도"]
df_etc_longitude = df_etc["경도"]
df_etc_length = df_etc["연장"]
df_etc_height = df_etc["높이"]
df_etc_longitude = df_etc_longitude.values.tolist()
df_etc_latitude = df_etc_latitude.values.tolist()
df_etc_length = df_etc_length.values.tolist()
df_etc_height = df_etc_height.values.tolist()

df_latitude = df_maindata_inner_origin["위도"]
df_longitude = df_maindata_inner_origin["경도"]
df_main_longitude = df_longitude.values.tolist()
df_main_latitude = df_latitude.values.tolist()

In [41]:
print(len(df_etc_latitude))
print(len(df_etc_longitude))
print(len(df_latitude))
print(len(df_longitude))

2879
2879
497
497


In [42]:
def is_fence(n, fence_info_return = False):
    # 반경 nkm 이내의 가장 가까운 유도울타리

    fence_lat = [0] * len(df_latitude)
    fence_lng = [0] * len(df_longitude)
    fence_length = [0] * len(df_latitude)
    fence_height = [0] * len(df_latitude)

    cnt = 0
    li_idx = []
    li_distance = []
    
    # 모든 생태통로 데이터에 대하여 유도울타리의 위,경도간 거리를 모두 계산해 반경 nkm 이내의 가장 가까운 유도울타리 계산
    for idx in range(len(df_latitude)):
        distance_min = 500
        distance_min_idx = 0
        start = (df_latitude[idx], df_longitude[idx])

        for i, (val1, val2) in enumerate(zip(df_etc_latitude, df_etc_longitude)):
            goal = (val1, val2)
            # 유도울타리와의 거리 계산
            distance = haversine(start, goal, unit = "km")
            # 설정한 nkm보다 작은 경우
            if distance <= n:
                # 임시 저장된 최소 거리보다 작은 경우 데이터 대치
                if distance < distance_min:
                    distance_min = distance
                    distance_min_idx = i
                    fence_lat[idx] = df_etc_latitude[i]
                    fence_lng[idx] = df_etc_longitude[i]
                    fence_length[idx] = df_etc_length[i]
                    fence_height[idx] = df_etc_height[i]
        li_distance.append(distance_min)
        li_idx.append(distance_min_idx)
    
    # 결과 리스트 반환
    if fence_info_return:
        return (fence_lat, fence_lng, fence_length, fence_height)
    return (fence_lat, fence_lng)

In [43]:
# 2.3km 반경만 처리하기
fence_lat_2_3, fence_lng_2_3, fence_length, fence_height = is_fence(2.3, True)

In [44]:
# 결측치 처리
for idx, (val1, val2) in enumerate(zip(fence_lat_2_3, fence_lng_2_3)):
    if val1 == None or val2 == None:
        fence_length[idx] = None
        fence_height[idx] = None

In [45]:
# 생태통로 데이터에 유도울타리 관련 데이터 추가
df_maindata_inner_origin["유도울타리_위도_2.3km"] = fence_lat_2_3
df_maindata_inner_origin["유도울타리_경도_2.3km"] = fence_lng_2_3
df_maindata_inner_origin["유도울타리_연장_m"] = fence_length
df_maindata_inner_origin["유도울타리_높이_m"] = fence_height

In [46]:
df_maindata_inner_origin

Unnamed: 0,번호,위도,경도,폭,연장,높이,개방도,차선_수,생태통로타입,등산객이용빈도,생태통로효율성,유도울타리_위도_2.3km,유도울타리_경도_2.3km,유도울타리_연장_m,유도울타리_높이_m
0,1495,35.36019,128.06375,5.00,45.0,,,4,0,1,3,0.000000,0.000000,0,0.000000
1,1277,37.03408,128.40661,,,,,2,1,0,3,0.000000,0.000000,0,0.000000
2,1278,36.89832,128.30563,,,,,2,1,0,1,0.000000,0.000000,0,0.000000
3,1279,36.89687,128.30581,,,,,2,1,0,1,0.000000,0.000000,0,0.000000
4,1090,36.71051,127.42222,8.23,38.0,,,6,0,5,1,36.701443,127.431078,442,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,1089,36.71278,127.42056,13.30,50.7,,,6,0,0,0,36.701443,127.431078,442,1.000000
493,1087,36.71722,127.41778,13.60,38.7,,,4,0,5,0,36.708802,127.437539,886,1.571429
494,1088,36.71222,127.42528,13.30,50.7,,,4,0,5,0,36.708802,127.437539,886,1.571429
495,1091,36.76750,127.47333,7.00,20.0,,,2,0,0,0,36.759335,127.483618,884,1.500000


In [50]:
# 변수관계설정 데이터와 join하기 위해 필요한 데이터만을 생태통로 데이터에서 추출
df_main = df_maindata_inner_origin.iloc[:,[0,13,14]]
df_main

Unnamed: 0,번호,유도울타리_연장_m,유도울타리_높이_m
0,1495,0,0.000000
1,1277,0,0.000000
2,1278,0,0.000000
3,1279,0,0.000000
4,1090,442,1.000000
...,...,...,...
492,1089,442,1.000000
493,1087,886,1.571429
494,1088,886,1.571429
495,1091,884,1.500000


In [52]:
# RDS에서 변수관계설정 데이터 불러오기
df_rds = load_data_from_rds('변수관계설정', db_connection) 
df_rds

Unnamed: 0,번호,등산로까지 최단거리(km),식생,경사도,지형기호(2.3km),주변 동물종 개수,주변동물 출현빈도,주변 로드킬 빈도,산책로까지의 최단 거리(km),최고제한속도(km/h),농가까지의 거리(km),교통량,환경영향평가점수,건물까지거리(km)
0,1001,1.903039,0,6,X,,,0,237.040073,80.0,0.125686,173030.400000,1,0.192281
1,1002,0.120319,1,5,X,4.0,4.0,0,122.055124,60.0,0.074627,55985.500000,1,0.575364
2,1004,0.047211,1,2,산지,1.0,1.0,0,125.516218,60.0,0.110617,46452.800000,2,0.077290
3,1005,0.233820,1,2,X,4.0,4.0,0,128.701343,80.0,0.009030,36727.714286,4,0.155399
4,1006,2.172428,0,4,하천,3.0,3.0,0,103.572133,80.0,0.098269,107833.500000,1,0.104287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,1561,0.000416,1,2,산지,3.0,5.0,4,219.634740,60.0,0.087448,107659.473684,1,0.271655
493,1562,2.835246,1,5,산지,6.0,27.0,0,38.432601,80.0,0.137822,13853.200000,2,0.252010
494,1564,0.886170,1,4,X,1.0,1.0,0,248.086515,80.0,0.054073,6478.333333,2,0.113175
495,1567,9.125829,1,5,하천,4.0,5.0,0,283.374921,60.0,0.102655,10937.272727,2,0.248477


In [53]:
# 변수관계설정 데이터에 유도울타리 연장, 높이 정보 추가하기
df_rds_new = df_rds.merge(df_main, on="번호")
df_rds_new

Unnamed: 0,번호,등산로까지 최단거리(km),식생,경사도,지형기호(2.3km),주변 동물종 개수,주변동물 출현빈도,주변 로드킬 빈도,산책로까지의 최단 거리(km),최고제한속도(km/h),농가까지의 거리(km),교통량,환경영향평가점수,건물까지거리(km),유도울타리_연장_m,유도울타리_높이_m
0,1001,1.903039,0,6,X,,,0,237.040073,80.0,0.125686,173030.400000,1,0.192281,0,0.0
1,1002,0.120319,1,5,X,4.0,4.0,0,122.055124,60.0,0.074627,55985.500000,1,0.575364,0,0.0
2,1004,0.047211,1,2,산지,1.0,1.0,0,125.516218,60.0,0.110617,46452.800000,2,0.077290,0,0.0
3,1005,0.233820,1,2,X,4.0,4.0,0,128.701343,80.0,0.009030,36727.714286,4,0.155399,0,0.0
4,1006,2.172428,0,4,하천,3.0,3.0,0,103.572133,80.0,0.098269,107833.500000,1,0.104287,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,1561,0.000416,1,2,산지,3.0,5.0,4,219.634740,60.0,0.087448,107659.473684,1,0.271655,0,0.0
493,1562,2.835246,1,5,산지,6.0,27.0,0,38.432601,80.0,0.137822,13853.200000,2,0.252010,60,2.0
494,1564,0.886170,1,4,X,1.0,1.0,0,248.086515,80.0,0.054073,6478.333333,2,0.113175,0,0.0
495,1567,9.125829,1,5,하천,4.0,5.0,0,283.374921,60.0,0.102655,10937.272727,2,0.248477,0,0.0


### Push RDS

In [54]:
upload_data_to_rds(df_rds_new, '변수관계설정', db_connection)