# Colab 환경에서 거리매칭

!pip install haversine
!pip install pymysql



# Import
import pandas as pd
from sqlalchemy import create_engine
from haversine import haversine
from tqdm.notebook import tqdm
import numpy as np
import pymysql
import matplotlib.pyplot as plt



# DB Connect
endpoint = 'dao.c51deksujiip.ap-northeast-2.rds.amazonaws.com'
schema = 'storage'
db_connection_str = 'mysql+pymysql://admin:ekfkawnl@{}/{}'.format(endpoint, schema)
try :
    db_connection = create_engine(db_connection_str)
    conn = db_connection.connect()
except :
    print('fail to connect db')



# DataLoad
def load_data_from_rds(tabel_name):
  sql = "SELECT * FROM {}".format(tabel_name)
  df = pd.read_sql(sql, db_connection)
  return df



# Preprocessing
table = load_data_from_rds('생태통로')
display(table)

def calc_distance(bridge, factor, limit) :
  num_bridge = len(bridge)
  num_factor = len(factor)
  true_table = [[False for _ in range(num_factor)] for _ in range(num_bridge)]

  for b in tqdm(range(num_bridge)) :
    start = (bridge.iloc[b]['위도'], bridge.iloc[b]['경도'])
    for f in range(num_factor) :
      goal = (factor.iloc[f]['위도_십진법'], factor.iloc[f]['경도_십진법'])
      if haversine(start,goal,unit='km') <= limit :
        true_table[b][f] = True
  return np.array(true_table)

from google.colab import drive
drive.mount('/content/drive')



# 포유류 거리계산
ani = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/전국자연환경조사_4차_포유류_병합.csv', encoding='cp949')
display(ani.head())

ani['위도_십진법'] = ani['경도']
ani['경도_십진법'] = ani['위도']
ani.drop(['위도', '경도', 'path'], axis=1, inplace=True)
display(ani)

target_ani = ['멧돼지', '고라니', '노루', '너구리', '오소리', '족제비','멧토끼', '청설모', '다람쥐']

target_animals = ani[ani['국명'].isin(target_ani)]
display(target_animals)

################ time
true_table = calc_distance(table, target_animals, 2.3)
############## CPU times: user 1h 3min 10s, sys: 5.44 s, total: 1h 3min 15s
############## Wall time: 1h 3min 35s

print(true_table.shape)
############## (497, 29259)
print(true_table.sum())
############## 2271

df = pd.DataFrame(columns=['번호', '종명'])
## true table
for b in tqdm(range(true_table.shape[0])) :
  for f in range(true_table.shape[1]) :
    if true_table[b][f] :
      temp = pd.DataFrame([[table.iloc[b]['번호'],
                            target_animals.iloc[f]['국명']]],
                          columns=df.columns)
      df = pd.concat([df, temp])
display(df)



##### 수행결과 저장
df.to_csv('/content/drive/MyDrive/Colab Notebooks/포유류_join.csv', index=False)


# Local 환경에서 join

In [1]:
from DB import *
import pandas as pd
db_connection = connect_db()

In [15]:
df = pd.read_csv('.././data/variables/포유류_join.csv')

In [16]:
# 주변에서 동물종이 발견된 생태통로 개수
df.groupby('번호').count().__len__()

259

In [17]:
grouped_df = df.groupby('번호')

In [18]:
result = pd.DataFrame(columns=['번호'])
result

Unnamed: 0,번호


### 생태통로 주변에서 발견된 포유류 종의 개수

In [19]:
unique_ = pd.DataFrame(grouped_df['종명'].nunique())
unique_.rename(columns={'종명' : '주변 동물종 개수'}, inplace=True)
unique_

Unnamed: 0_level_0,주변 동물종 개수
번호,Unnamed: 1_level_1
1002.0,4
1004.0,1
1005.0,4
1006.0,3
1009.0,2
...,...
1561.0,3
1562.0,6
1564.0,1
1567.0,4


### 생태통로 주변에서 발견된 포유류 빈도수

In [20]:
cnt = pd.DataFrame(grouped_df['종명'].count())
cnt.rename(columns={'종명':'주변동물 출현빈도'}, inplace=True)
cnt

Unnamed: 0_level_0,주변동물 출현빈도
번호,Unnamed: 1_level_1
1002.0,4
1004.0,1
1005.0,4
1006.0,3
1009.0,4
...,...
1561.0,5
1562.0,27
1564.0,1
1567.0,5


### 빈도와 다양도 join

In [21]:
result = result.merge(unique_, how='outer', on='번호')
result = result.merge(cnt, how='outer', on='번호')
result

Unnamed: 0,번호,주변 동물종 개수,주변동물 출현빈도
0,1002.0,4,4
1,1004.0,1,1
2,1005.0,4,4
3,1006.0,3,3
4,1009.0,2,4
...,...,...,...
254,1561.0,3,5
255,1562.0,6,27
256,1564.0,1,1
257,1567.0,4,5


In [25]:
result['번호'] = result['번호'].astype('int64')

In [26]:
# 결과 확인
result

Unnamed: 0,번호,주변 동물종 개수,주변동물 출현빈도
0,1002,4,4
1,1004,1,1
2,1005,4,4
3,1006,3,3
4,1009,2,4
...,...,...,...
254,1561,3,5
255,1562,6,27
256,1564,1,1
257,1567,4,5


# join

In [36]:
main_table = load_data_from_rds('변수관계설정', db_connection)
main_table

Unnamed: 0,번호,등산로까지 최단거리(km),식생,경사도,지형기호(2.3km)
0,1001,1.903039,0,6,X
1,1002,0.120319,1,5,X
2,1004,0.047211,1,2,산지
3,1005,0.233820,1,2,X
4,1006,2.172428,0,4,하천
...,...,...,...,...,...
492,1561,0.000416,1,2,산지
493,1562,2.835246,1,5,산지
494,1564,0.886170,1,4,X
495,1567,9.125829,1,5,하천


In [37]:
backup = main_table.copy()

In [38]:
main_table =main_table.merge(result, on='번호', how='outer')

In [39]:
main_table

Unnamed: 0,번호,등산로까지 최단거리(km),식생,경사도,지형기호(2.3km),주변 동물종 개수,주변동물 출현빈도
0,1001,1.903039,0,6,X,,
1,1002,0.120319,1,5,X,4.0,4.0
2,1004,0.047211,1,2,산지,1.0,1.0
3,1005,0.233820,1,2,X,4.0,4.0
4,1006,2.172428,0,4,하천,3.0,3.0
...,...,...,...,...,...,...,...
492,1561,0.000416,1,2,산지,3.0,5.0
493,1562,2.835246,1,5,산지,6.0,27.0
494,1564,0.886170,1,4,X,1.0,1.0
495,1567,9.125829,1,5,하천,4.0,5.0


In [40]:
upload_data_to_rds(main_table, '변수관계설정', db_connection)