In [1]:
import pandas as pd
import numpy as np
import pickle
### 모델링--------------------------
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score, recall_score, accuracy_score, roc_auc_score
# 업샘플링
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
### x 만들기 -----------------------
# 지오코딩
!pip install pandas openpyxl
!pip install geokakao
import geokakao as gk
# 역지오코딩
import requests
import re
# 배수등급, 경사도
import geopandas as gpd
from shapely.geometry import Point
### 새로운 데이터 --------------------
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.preprocessing import LabelEncoder
import joblib

Collecting geokakao
  Downloading geokakao-0.1.1-py3-none-any.whl.metadata (2.5 kB)
Downloading geokakao-0.1.1-py3-none-any.whl (3.6 kB)
Installing collected packages: geokakao
Successfully installed geokakao-0.1.1


##### 함수 정의

In [3]:
### ----------------------------------------------------------------------------------

def geo_coding(df):
    ### 위도, 경도
    gk.add_coordinates_to_dataframe(df, '주소')
    df = df.rename(columns = {"decimalLatitude" : "위도", "decimalLongitude" : "경도"})

    ### 자치구, 행정동
    # 동 : ~동, ~가 부분 찾아오기
    def get_dong(address):
        find = re.search(r"\b(\w+[동가])\b", address)
        if find:
            dong = find.group(1)
            return(dong)
        else:
            return(np.nan)

    # 구 : ~구 부분 찾아오기
    def get_gu(address):
        find = re.search(r"\b(\w+구)\b", address)
        if find:
            gu = find.group(1)
            return(gu)
        else:
            return(np.nan)

    # DataFrame 추가
    def add_dong_gu_to_dataframe(df):
        dongs = []
        gus = []

        for i in range(len(df)):
            address = df.loc[i, '주소']

            if address:
                try:
                    dong = get_dong(address)
                except:
                    dong = np.nan
                try:
                    gu = get_gu(address)
                except:
                    gu = np.nan
            else:
                dong = np.nan
                gu = np.nan

            dongs.append(dong)
            gus.append(gu)

        df["행정동"] = dongs
        df["자치구"] = gus
        return df

    df = add_dong_gu_to_dataframe(df)
    return df

### ----------------------------------------------------------------------------------

def traffic(df):
    # 도로 데이터
    gdf_links = gpd.read_file('/content/drive/MyDrive/DATA_원본/[2024-02-23]NODELINKDATA/MOCT_LINK.shp', encoding='euc-kr')
    gdf_links['LINK_ID'] = gdf_links['LINK_ID'].astype(int)

    # 새로운 데이터 공간 데이터로 변환
    gdf_potholes = gpd.GeoDataFrame(
        df,
        geometry = gpd.points_from_xy(df['경도'], df['위도']),
        crs="EPSG:4326")  # 위/경도 WGS84 좌표계로 설정

    # 새로운 데이터 좌표계 통일
    if gdf_links.crs is not None:
        gdf_potholes = gdf_potholes.to_crs(gdf_links.crs)

    # 새로운 데이터의 위치와 가까운 도로 번호(+도로정보) 매칭
    gdf_nearest = gpd.sjoin_nearest(
        gdf_potholes, gdf_links,
        how='left',
        distance_col='distance')  # 계산한 거리(m)

    ### 교통량 데이터
    traffic_d = pd.read_excel('/content/drive/MyDrive/DATA_원본/TrafficVolume(LINK).xlsx', header=[0, 1])

    # 전처리
    traffic_d.columns = [
        f"{upper}" if 'Unnamed' in str(lower) else f"{upper}_{lower}"
        for upper, lower in traffic_d.columns]
    traffic_d.rename(columns={
        'ITS LINK ID': 'ITS_LINK_ID',
        '승용차-평일_전일': '승용차',
        '버스-평일_전일': '버스',
        '트럭-평일_전일': '트럭'}, inplace=True)

    traffic_d['ITS_LINK_ID'] = traffic_d['ITS_LINK_ID'].astype(str).str.split(',')
    traffic_d = traffic_d.explode('ITS_LINK_ID')
    traffic_d['LINK_ID'] = traffic_d['ITS_LINK_ID'].str.strip().astype(int)
    traffic_df = traffic_d[['LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()
    traffic_df = (traffic_df.dropna(subset=['LINK_ID']).drop_duplicates('LINK_ID').reset_index(drop=True).astype({'LINK_ID': 'int'}))

    # 교통량 join
    pothole_traffic = gdf_nearest.merge(traffic_df, on='LINK_ID', how='left')
    pothole_output = pothole_traffic[['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']]
    pothole_output['총교통량'] = pothole_output['승용차'] + pothole_output['버스'] + pothole_output['트럭']
    pothole_output['중대형차량 교통량'] = (pothole_output['버스'] + pothole_output['트럭']) / pothole_output['총교통량']


    return pothole_output

### ----------------------------------------------------------------------------------

def nature(new_pothole_traffic):

    ### 건물별 평균 연령
    old = pd.read_csv('/content/drive/MyDrive/DATA_원본/건축물연령정보_행정동_평균.csv')
    bup = pd.read_csv('/content/drive/MyDrive/DATA_원본/국토교통부_법정동코드_20240805.csv', encoding = 'cp949')
    bup['법정동명'] = bup['법정동명'].astype(str)

    # 법정동 만들기
    new_pothole_traffic['법정동명'] = '서울특별시 ' + new_pothole_traffic['자치구'] + ' ' + new_pothole_traffic['행정동']
    new_pothole_traffic['법정동명'] = new_pothole_traffic['법정동명'].astype(str)

    # join
    new_pothole_building = new_pothole_traffic.merge(bup[['법정동코드', '법정동명']], on = '법정동명', how = 'left')
    new_pothole_building = new_pothole_building.merge(old[['법정동코드', '평균_건물연령']], on = '법정동코드', how = 'left')

    ### 배수등급
    # 공간 데이터로 변환
    gdf_pothole2 = gpd.GeoDataFrame(
        new_pothole_building,
        geometry=gpd.points_from_xy(new_pothole_building['경도'], new_pothole_building['위도']),
        crs="EPSG:4326")

    # 포트홀 좌표계 → EPSG:5174 로 변환 (shp에 맞추기)
    gdf_pothole2 = gdf_pothole2.to_crs("EPSG:5174")

    # 배수등급 로드
    soil_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILDRA_AREA/ASIT_SOILDRA_AREA.shp")

    # join
    new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')

    ### 토양 경사도
    # 데이터 로드
    slope_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILSLOPE_AREA/ASIT_SOILSLOPE_AREA.shp")
    new_pothole_soil.drop(['index_right'], axis=1, inplace=True)

    # join
    new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')

    # 열 이름 수정
    new_pothole_slope.rename(columns = {'SOILDRA' : '배수등급', 'SOILSLOPE' : '경사도'}, inplace = True)
    new_pothole_done = new_pothole_slope[['날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭', '총교통량', '중대형차량 교통량', '평균_건물연령', '배수등급' ,'경사도']]

    return new_pothole_done

### ----------------------------------------------------------------------------------

def join_gu(new_pothole_done):
    ### 자치구별 데이터 로드
    people = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별 인구 수.pickle')
    people['자치구'] = people['자치구'].str.replace("\u3000","",regex = False)

    new_pothole_done.rename(columns = {'날짜' : '발생일'}, inplace = True)
    new_pothole_done['발생일'] = pd.to_datetime(new_pothole_done['발생일'])

    ### 인구 수
    new_pothole = pd.merge(new_pothole_done, people, on = '자치구', how = 'left')

    return new_pothole

### ----------------------------------------------------------------------------------

def prediction(new_pothole, transformer_path, scaler_path, model_path):
    ## x 할당
    new_x = new_pothole[['차선수', '승용차', '버스', '트럭', '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수', '배수등급', '경사도']]

    # 저장된 변환기, 스케일러 불러오기
    transformer = joblib.load(transformer_path)
    scaler = joblib.load(scaler_path)
    # 새로운 데이터 변환
    cols = ['승용차', '버스', '트럭', '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수']
    arr = new_x[cols].values + 1e-6
    bc = transformer.transform(arr)
    bc_std = scaler.transform(bc)
    new_x[cols] = bc_std

    ### 배수등급, 경사도 변수 처리
    # 경사도
    slope_encoding = {
        '0-2%': 0,
        '2-7%': 1,
        '7-15%': 2,
        '15-30%': 3,
        '30-60%': 4,
        '60-100%': 5}
    # 배수등급
    drain_encoding = {
        '매우양호': 5,
        '양호': 4,
        '약간양호': 3,
        '약간불량': 2,
        '불량': 1,
        '매우불량': 0}
    # 매핑
    new_x['경사도'] = new_x['경사도'].map(slope_encoding)
    new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)

    ### 모델 로드
    xgb_model = joblib.load(model_path)
    y_pred = xgb_model.predict(new_x)
    y_pred_prob = xgb_model.predict_proba(new_x)[:, 1]

    return new_pothole, new_x, y_pred, y_pred_prob ## 원본 데이터, x를 살펴보기 위해 따로 받아옴 !

In [4]:
### 역변환!!!! - Counterfactual에서 사용
def inverse_scaling_boxcox(transformed_data, transformer_path, scaler_path):

    # 역변환 대상 변수
    cols = ['승용차', '버스', '트럭', '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수']
    cols_idx = [transformed_data.columns.get_loc(col) for col in cols]

    # scaler, transformer
    scaler = joblib.load(scaler_path)
    transformer = joblib.load(transformer_path)

    # 변환된 5개 변수만 추출
    transformed = transformed_data.iloc[:, cols_idx]

    # 역표준화
    un_scaled = scaler.inverse_transform(transformed)

    # 역변환
    un_transformed = np.array([inv_boxcox(un_scaled[:, i], transformer.lambdas_[i]) for i in range(un_scaled.shape[1])]).T

    # 전체 복원
    original = transformed_data.copy()
    for j, col in enumerate(cols):
        original[col] = un_transformed[:,j]


    ### 배수등급, 경사도 변수 처리
    # 경사도
    slope_encoding = {
        '0-2%': 0,
        '2-7%': 1,
        '7-15%': 2,
        '15-30%': 3,
        '30-60%': 4,
        '60-100%': 5}
    # 배수등급
    drain_encoding = {
        '매우양호': 5,
        '양호': 4,
        '약간양호': 3,
        '약간불량': 2,
        '불량': 1,
        '매우불량': 0}
    # 인코딩 딕셔너리 뒤집기
    slope_decoding = {v: k for k, v in slope_encoding.items()}
    drain_decoding = {v: k for k, v in drain_encoding.items()}

    # 역매핑
    original['경사도'] = original['경사도'].map(slope_decoding)
    original['배수등급'] = original['배수등급'].map(drain_decoding)

    return original

#### 예측 진행

In [5]:
### Input - 주의 : 행정동이 명시적으로 존재해야
input = ["서울특별시 강남구 대치동 507",
         "서울특별시 동대문구 제기동 137-418",
         "서울특별시 서초구 방배동 756-4",
         "서울특별시 용산구 동빙고동 90-1",
         "서울특별시 성북구 보문동5가 235",
         ]
date = ['2024-07-28', '2023-06-23', '2021-10-29', '2024-01-05', '2022-04-29']
new_data = pd.DataFrame({'날짜' : date, '주소' : input})

In [6]:
def prediction_without_env(new_data, transformer_path, scaler_path, model_path):
    ### 마지막에 출력할 output
    output_df = new_data.copy()
    ### x 만들기
    geo = geo_coding(new_data)
    traffic_df = traffic(geo)
    nature_df = nature(traffic_df)
    new_potholes = join_gu(nature_df)
    ### 예측
    new_data_org, new_x, y_pred, y_pred_prob = prediction(new_potholes, transformer_path, scaler_path, model_path)
    ### output
    output_df['예측'] = y_pred
    output_df['예측 확률'] = y_pred_prob
    return new_data_org, new_x, output_df

In [None]:
### Box-Cox 변환 - 기후 제외
# 변환 데이터
df = pd.read_csv('/content/drive/MyDrive/DATA_완성/dataset.csv')
# 변환 대상 컬럼
cols = ['승용차', '버스', '트럭',  '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수']
# 변환기, 스케일러
transform = PowerTransformer(method = 'box-cox', standardize = False)
scaler = StandardScaler()
# Box-Cox + 스케일링 진행
arr = df[cols].values + 1e-6
bc = transform.fit_transform(arr)
bc_std = scaler.fit_transform(bc)
df[cols] = bc_std
# 변환기, 스케일러 저장
joblib.dump(transform, '/content/drive/MyDrive/준희/Report/Files/BoxCoxTransformer_wo_env.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/준희/Report/Files/StandardScaler_wo_env.pkl')

['/content/drive/MyDrive/준희/Report/Files/StandardScaler_wo_env.pkl']

In [None]:
### Model, BoxCox Transformer, Scaler Path
model_path = '/content/drive/MyDrive/세은/model.pickle'
transformer_path = '/content/drive/MyDrive/준희/Report/Files/BoxCoxTransformer_wo_env.pkl'
scaler_path = '/content/drive/MyDrive/준희/Report/Files/StandardScaler_wo_env.pkl'

### Prediction - 기후 없이!
new_data_org, new_x, output_df = prediction_without_env(new_data, transformer_path, scaler_path, model_path)
# new_data_org : 변환 전 기존 x
# new_x : 변환 후 모델 입력 x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pothole_output['총교통량'] = pothole_output['승용차'] + pothole_output['버스'] + pothole_output['트럭']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pothole_output['중대형차량 교통량'] = (pothole_output['버스'] + pothole_output['트럭']) / pothole_output['총교통량']
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
Use `to_crs()` to reproject one of the i

In [None]:
### 저장하기
new_data_org.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_org.csv", encoding = "utf-8-sig")
new_x.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_x.csv", encoding = "utf-8-sig")
output_df.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_output.csv", encoding = "utf-8-sig")

### 최종 결과는

In [None]:
output_df

Unnamed: 0,날짜,주소,decimalLatitude,decimalLongitude,예측,예측 확률
0,2024-07-28,서울특별시 강남구 대치동 507,37.4933577446877,127.06190904547,1,0.628422
1,2023-06-23,서울특별시 동대문구 제기동 137-418,37.5890727535302,127.035747839869,1,0.797912
2,2021-10-29,서울특별시 서초구 방배동 756-4,37.4971720723398,126.986177463791,0,0.136294
3,2024-01-05,서울특별시 용산구 동빙고동 90-1,37.521620711868,126.997322170193,0,0.11455
4,2022-04-29,서울특별시 성북구 보문동5가 235,37.5808694213932,127.021007360417,0,0.397331


In [None]:
output_df = output_df.drop(['decimalLatitude', 'decimalLongitude'], axis = 1)
output_df

Unnamed: 0,날짜,주소,예측,예측 확률
0,2024-07-28,서울특별시 강남구 대치동 507,1,0.628422
1,2023-06-23,서울특별시 동대문구 제기동 137-418,1,0.797912
2,2021-10-29,서울특별시 서초구 방배동 756-4,0,0.136294
3,2024-01-05,서울특별시 용산구 동빙고동 90-1,0,0.11455
4,2022-04-29,서울특별시 성북구 보문동5가 235,0,0.397331


### 프레젠테이션

In [37]:
### Input - 주의 : 행정동이 명시적으로 존재해야
input = ["서초구 서초동 1377",
         "서울특별시 강남구 대치동 507",
         "서울특별시 서초구 방배동 756-4",
         "서울특별시 용산구 동빙고동 90-1",
         "서울특별시 동대문구 제기동 137-418",
         "서울특별시 성북구 보문동5가 235"]
date = ['2024-06-12', '2024-07-28', '2023-06-23', '2021-10-29', '2024-01-05', '2022-04-29']
new_data = pd.DataFrame({'날짜' : date, '주소' : input})

### Model, BoxCox Transformer, Scaler Path
model_path = '/content/drive/MyDrive/세은/model.pickle'
transformer_path = '/content/drive/MyDrive/준희/Report/Files/BoxCoxTransformer_wo_env.pkl'
scaler_path = '/content/drive/MyDrive/준희/Report/Files/StandardScaler_wo_env.pkl'

### Prediction - 기후 없이!
new_data_org, new_x, output_df = prediction_without_env(new_data, transformer_path, scaler_path, model_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pothole_output['총교통량'] = pothole_output['승용차'] + pothole_output['버스'] + pothole_output['트럭']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pothole_output['중대형차량 교통량'] = (pothole_output['버스'] + pothole_output['트럭']) / pothole_output['총교통량']
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
Use `to_crs()` to reproject one of the i

In [34]:
new_x

Unnamed: 0,차선수,승용차,버스,트럭,총교통량,중대형차량 교통량,평균_건물연령,인구 수,배수등급,경사도
0,6,0.312042,0.869574,-0.239991,0.282995,-0.446776,-0.503202,-0.419831,2,1
1,2,-2.073907,-1.78578,-1.521183,-2.082963,0.653158,-0.315915,-0.419831,2,1
2,2,-1.129121,-0.163307,0.03494,-0.973517,2.134739,0.896901,-1.559461,3,1
3,4,-0.567495,0.683146,-0.267261,-0.497331,1.11831,1.321718,-0.363322,5,3
4,4,-0.877968,-0.193483,0.13426,-0.755174,1.757725,0.143847,0.134013,2,1


In [29]:
new_data_org

Unnamed: 0,발생일,주소,위도,경도,자치구,행정동,도로명,차선수,승용차,버스,트럭,총교통량,중대형차량 교통량,평균_건물연령,배수등급,경사도,인구 수
0,2024-06-12,서초구 서초동 1377,37.4970270625909,127.024585952765,서초구,서초동,서초대로,6,24245,1329,1788,27362,0.113917,29.961279,약간불량,2-7%,169016.647059
1,2024-07-28,서울특별시 서초구 방배동 1001-9,37.4822019390629,127.004015877281,서초구,방배동,효령로,6,13433,1054,2679,17166,0.217465,31.110233,양호,2-7%,169016.647059
2,2023-06-23,서울특별시 동대문구 제기동 137-418,37.5890727535302,127.035747839869,동대문구,제기동,안암로,4,13747,1174,1755,16676,0.175642,44.211557,매우양호,15-30%,171931.647059
3,2021-10-29,서울특별시 서초구 방배동 756-4,37.4971720723398,126.986177463791,서초구,방배동,방배중앙로,2,4804,132,749,5685,0.154969,31.110233,약간불량,2-7%,169016.647059
4,2024-01-05,서울특별시 용산구 동빙고동 90-1,37.521620711868,126.997322170193,용산구,동빙고동,서빙고로,2,9406,632,2158,12196,0.228764,40.192727,약간양호,2-7%,108959.470588
5,2022-04-29,서울특별시 성북구 보문동5가 235,37.5808694213932,127.021007360417,성북구,보문동5가,보문로,4,11165,617,2310,14092,0.207707,34.192982,약간불량,2-7%,197381.705882


In [38]:
output_df

Unnamed: 0,날짜,주소,예측,예측 확률
0,2024-06-12,서초구 서초동 1377,1,0.874646
1,2024-07-28,서울특별시 강남구 대치동 507,1,0.628422
2,2023-06-23,서울특별시 서초구 방배동 756-4,0,0.136294
3,2021-10-29,서울특별시 용산구 동빙고동 90-1,0,0.11455
4,2024-01-05,서울특별시 동대문구 제기동 137-418,1,0.797912
5,2022-04-29,서울특별시 성북구 보문동5가 235,0,0.397331


In [39]:
### 저장하기
new_data_org.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_org_pres.csv", encoding = "utf-8-sig")
new_x.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_x_pres.csv", encoding = "utf-8-sig")
output_df.to_csv("/content/drive/MyDrive/준희/Report/Files/new_places_output_pres.csv", encoding = "utf-8-sig")