In [None]:
import pandas as pd
import numpy as np
import pickle
### 모델링--------------------------
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score, recall_score, accuracy_score, roc_auc_score
# 업샘플링
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
### x 만들기 -----------------------
# 지오코딩
!pip install pandas openpyxl
!pip install geokakao
import geokakao as gk
# 역지오코딩
import requests
import re
# 배수등급, 경사도
import geopandas as gpd
from shapely.geometry import Point
### 새로운 데이터 --------------------
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.preprocessing import LabelEncoder
import joblib

Collecting geokakao
  Downloading geokakao-0.1.1-py3-none-any.whl.metadata (2.5 kB)
Downloading geokakao-0.1.1-py3-none-any.whl (3.6 kB)
Installing collected packages: geokakao
Successfully installed geokakao-0.1.1


### Prediction

새로운 데이터
- 주소 (도로명?)
- 경도/위도 변환
- 자치구, 행정동 get
- 자치구로 get
  - 인구 수
  - 날짜까지 반영해서 강수량, 습도, 기온
- 행정동으로 get
  - 건물별 노후도
- 정확한 위치로
  - 배수 등급, 경사도, 도로별 교통량/하중량, 속도

In [None]:
x.columns

Index(['차선수', '승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도',
       '1년 기온차', '평균 일교차', '배수등급', '경사도'],
      dtype='object')

#### 경위도, 자치구, 행정동

In [None]:
# Read the Kakao API key from Colab's user-secret store instead of hard-coding it.
# NOTE(review): `api` is not referenced again in the visible cells — presumably
# geokakao obtains the key some other way; confirm, or pass it explicitly.
from google.colab import userdata
api = userdata.get('KAKAO_API_KEY')

In [None]:
# Example input: several addresses arrive at once, each paired with a date.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the kernel session; consider a name such as `addresses`.
input = ["서울특별시 강남구 대치동 507",
         "서울특별시 동대문구 제기동 137-418",
         "서울특별시 서대문구 홍은동 9-360",
         "서울특별시 영등포구 당산1동 당산로 31길 25",
         "서울특별시 서초구 방배동 756-4",
         "서울특별시 용산구 동빙고동 90-1",
         "서울특별시 성북구 보문동5가 235",
         "서울특별시 노원구 공릉2동 29-4"]
date = ['2024-07-28', '2023-06-23', '2022-05-21', '2023-12-05', '2021-10-29', '2024-01-05', '2022-04-29', '2021-02-03']
# One row per (date, address) pair; the two lists must have the same length.
new_data = pd.DataFrame({'날짜' : date, '주소' : input})

In [None]:
### Latitude / longitude
# Work on a copy: geokakao writes its result columns into the frame it is given
# (the call's return value is not used here).
new_data_coor = new_data.copy()
gk.add_coordinates_to_dataframe(new_data_coor, '주소')
# Rename geokakao's output columns to the Korean names used by the rest of the notebook.
new_data_coor = new_data_coor.rename(columns = {"decimalLatitude" : "위도", "decimalLongitude" : "경도"})

In [None]:
### 자치구, 행정동
# Neighbourhood: pick out the token ending in "동" or "가"
def get_dong(address):
    """Return the first whitespace-delimited token of ``address`` that ends in 동 or 가.

    Returns ``np.nan`` when the address contains no such token.
    Raises ``TypeError`` if ``address`` is not a string (from ``re.search``).
    """
    match = re.search(r"\b(\w+[동가])\b", address)
    return match.group(1) if match else np.nan

# District: pick out the token ending in "구"
def get_gu(address):
    """Return the first whitespace-delimited token of ``address`` that ends in 구.

    Returns ``np.nan`` when the address contains no such token.
    Raises ``TypeError`` if ``address`` is not a string (from ``re.search``).
    """
    match = re.search(r"\b(\w+구)\b", address)
    return match.group(1) if match else np.nan

# Append the 동 / 구 columns to the DataFrame
def add_dong_gu_to_dataframe(df):
    """Add '행정동' and '자치구' columns parsed from the '주소' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an address column named '주소'. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the two new columns.
        Rows whose address is empty, missing or not a string get NaN.
    """
    def _parse(extractor, address):
        # Only real, non-empty strings can be searched; everything else
        # (NaN, None, numbers, '') is reported as missing rather than raising.
        if not isinstance(address, str) or not address:
            return np.nan
        return extractor(address)

    # Iterate over the values themselves: label-based `df.loc[i, ...]` with
    # i in range(len(df)) fails on any frame whose index is not 0..n-1.
    addresses = df['주소'].tolist()
    df["행정동"] = [_parse(get_dong, address) for address in addresses]
    df["자치구"] = [_parse(get_gu, address) for address in addresses]
    return df

In [None]:
# Copy first: add_dong_gu_to_dataframe writes its columns into the frame it receives.
new_data_donggu = new_data_coor.copy()
new_data_donggu = add_dong_gu_to_dataframe(new_data_donggu)

In [None]:
new_data_donggu.head(2)

Unnamed: 0,날짜,주소,위도,경도,행정동,자치구
0,2024-07-28,서울특별시 강남구 대치동 507,37.4933577446877,127.06190904547,대치동,강남구
1,2023-06-23,서울특별시 동대문구 제기동 137-418,37.5890727535302,127.035747839869,제기동,동대문구


In [None]:
# Independent copy used as the input of the road-link matching below.
new_potholes = new_data_donggu.copy()

#### 도로 데이터

In [None]:
# Road-link layer (shapefile attributes are EUC-KR encoded)
gdf_links = gpd.read_file('/content/drive/MyDrive/DATA_원본/[2024-02-23]NODELINKDATA/MOCT_LINK.shp', encoding='euc-kr')
# Cast LINK_ID to int so it can be joined with the traffic-volume table later.
gdf_links['LINK_ID'] = gdf_links['LINK_ID'].astype(int)

# Turn the new records into point geometries
gdf_potholes = gpd.GeoDataFrame(
    new_potholes,
    geometry = gpd.points_from_xy(new_potholes['경도'], new_potholes['위도']),
    crs="EPSG:4326")  # longitude/latitude are declared as WGS84

# Reproject the points into the link layer's CRS (skipped when the layer has none)
if gdf_links.crs is not None:
    gdf_potholes = gdf_potholes.to_crs(gdf_links.crs)

# Match every point to its nearest road link (link attributes are carried along)
gdf_nearest = gpd.sjoin_nearest(
    gdf_potholes, gdf_links,
    how='left',
    distance_col='distance'  # distance in CRS units; metres only if the link CRS is metric — TODO confirm
)

gdf_nearest[['LINK_ID', 'distance']]

Unnamed: 0,LINK_ID,distance
0,1220018508,47.821631
1,1070005202,12.337265


In [None]:
### Traffic-volume data
traffic_d = pd.read_excel('/content/drive/MyDrive/DATA_원본/TrafficVolume(LINK).xlsx', header=[0, 1])

# Flatten the two-row header: keep only the upper label when the lower one is
# an auto-generated "Unnamed" placeholder, otherwise join them with "_".
traffic_d.columns = [
    f"{upper}" if 'Unnamed' in str(lower) else f"{upper}_{lower}"
    for upper, lower in traffic_d.columns]
traffic_d = traffic_d.rename(columns={
    'ITS LINK ID': 'ITS_LINK_ID',
    '승용차-평일_전일': '승용차',
    '버스-평일_전일': '버스',
    '트럭-평일_전일': '트럭'})

# One spreadsheet row can list several comma-separated link IDs: one row per ID.
traffic_d['ITS_LINK_ID'] = traffic_d['ITS_LINK_ID'].astype(str).str.split(',')
traffic_d = traffic_d.explode('ITS_LINK_ID')
# Parse leniently: blank / 'nan' / malformed tokens become NaN here and are
# dropped below. (Casting straight to int raised on them, so the dropna that
# followed could never take effect.)
traffic_d['LINK_ID'] = pd.to_numeric(traffic_d['ITS_LINK_ID'].str.strip(), errors='coerce')
traffic_df = traffic_d[['LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()
traffic_df = (traffic_df
              .dropna(subset=['LINK_ID'])
              .drop_duplicates('LINK_ID')   # keep the first record per link
              .reset_index(drop=True)
              .astype({'LINK_ID': 'int'}))

# Attach traffic volumes to the matched links (points without a record keep NaN)
pothole_traffic = gdf_nearest.merge(traffic_df, on='LINK_ID', how='left')

In [None]:
pothole_traffic.columns

Index(['날짜', '주소', '위도', '경도', '행정동', '자치구', 'geometry', 'index_right',
       'LINK_ID', 'F_NODE', 'T_NODE', 'LANES', 'ROAD_RANK', 'ROAD_TYPE',
       'ROAD_NO', 'ROAD_NAME', 'ROAD_USE', 'MULTI_LINK', 'CONNECT', 'MAX_SPD',
       'REST_VEH', 'REST_W', 'REST_H', 'C-ITS', 'LENGTH', 'UPDATEDATE',
       'REMARK', 'HIST_TYPE', 'HISTREMARK', 'distance', '도로명', '차선수', '승용차',
       '버스', '트럭'],
      dtype='object')

In [None]:
# Keep only the columns needed downstream. `.copy()` detaches the result from
# `pothole_traffic`, so adding columns later does not raise SettingWithCopyWarning.
new_pothole_traffic = pothole_traffic[['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()

#### 토양 데이터

In [None]:
### Average building age
old = pd.read_csv('/content/drive/MyDrive/DATA_원본/건축물연령정보_행정동_평균.csv')
bup = pd.read_csv('/content/drive/MyDrive/DATA_원본/국토교통부_법정동코드_20240805.csv', encoding = 'cp949')
bup['법정동명'] = bup['법정동명'].astype(str)

# Build the legal-dong name used as the join key: "서울특별시 <구> <동>".
# Rebind to a copy first so the new column is not written into a slice of the
# earlier join result (that is what triggered SettingWithCopyWarning).
# NOTE(review): the 동 token comes from the address text; administrative names
# such as '당산1동' may be absent from the legal-dong table, leaving the
# building age NaN — confirm.
new_pothole_traffic = new_pothole_traffic.copy()
new_pothole_traffic['법정동명'] = (
    '서울특별시 ' + new_pothole_traffic['자치구'] + ' ' + new_pothole_traffic['행정동']
).astype(str)

# name -> legal-dong code -> average building age (left joins keep every point)
new_pothole_building = new_pothole_traffic.merge(bup[['법정동코드', '법정동명']], on = '법정동명', how = 'left')
new_pothole_building = new_pothole_building.merge(old[['법정동코드', '평균_건물연령']], on = '법정동코드', how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pothole_traffic['법정동명'] = '서울특별시 ' + new_pothole_traffic['자치구'] + ' ' + new_pothole_traffic['행정동']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pothole_traffic['법정동명'] = new_pothole_traffic['법정동명'].astype(str)


In [None]:
### Drainage class
# Turn the records into point geometries (WGS84 longitude/latitude)
gdf_pothole2 = gpd.GeoDataFrame(
    new_pothole_building,
    geometry=gpd.points_from_xy(new_pothole_building['경도'], new_pothole_building['위도']),
    crs="EPSG:4326")

# Reproject the points to EPSG:5174, the CRS the soil shapefile is expected to use
gdf_pothole2 = gdf_pothole2.to_crs("EPSG:5174")

# Load the drainage-class polygons
soil_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILDRA_AREA/ASIT_SOILDRA_AREA.shp")
# The layer is read with no CRS (sjoin reported "Right CRS: None"). Declare the
# CRS the code already assumes; set_crs only labels the data, it does not
# move any coordinates.
if soil_df.crs is None:
    soil_df = soil_df.set_crs("EPSG:5174")

# Point-in-polygon join: each point receives the SOILDRA of the polygon containing it
new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')


In [None]:
### Soil slope
# Load the slope polygons
slope_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILSLOPE_AREA/ASIT_SOILSLOPE_AREA.shp")
# The layer is read with no CRS (sjoin reported "Right CRS: None"); declare the
# CRS the points were reprojected to. set_crs labels the data without moving it.
if slope_df.crs is None:
    slope_df = slope_df.set_crs("EPSG:5174")
# Drop the bookkeeping column left by the previous sjoin so the next sjoin can
# create its own. errors='ignore' keeps this cell re-runnable.
new_pothole_soil = new_pothole_soil.drop(columns=['index_right'], errors='ignore')

# Point-in-polygon join: each point receives the SOILSLOPE of its polygon
new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')

# Rename to the feature names the model expects
new_pothole_slope = new_pothole_slope.rename(columns = {'SOILDRA' : '배수등급', 'SOILSLOPE' : '경사도'})

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')


Index(['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차',
       '버스', '트럭', '법정동명', '법정동코드', '평균_건물연령', 'geometry', 'SOILDRA',
       'index_right', 'SOILSLOPE'],
      dtype='object')

In [None]:
# Keep only the columns needed downstream. `.copy()` detaches the result from
# `new_pothole_slope`, so later renames/assignments do not raise SettingWithCopyWarning.
new_pothole_done = new_pothole_slope[['날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭', '평균_건물연령', '배수등급' ,'경사도']].copy()

#### 자치구 데이터

In [None]:
# District-level (자치구) tables
# NOTE: read_pickle can execute arbitrary code while loading — only use it on trusted files.
rain_sum = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/누적 강수량.pickle')
humid_mean = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 습도.pickle')
temp_total_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/1년 기온차.pickle')
temp_monthly_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 일교차.pickle')
people = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별 인구 수.pickle')
# Strip ideographic spaces (U+3000) from district names so they match the join key.
people['자치구'] = people['자치구'].str.replace("\u3000","",regex = False)

In [None]:
# Rename the date column to the key used by the weather tables and parse it.
# rename() returns a new frame, so nothing is written into a slice (no
# SettingWithCopyWarning) and the cell can be re-run safely.
new_pothole_done = new_pothole_done.rename(columns = {'날짜' : '발생일'})
new_pothole_done['발생일'] = pd.to_datetime(new_pothole_done['발생일'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pothole_done.rename(columns = {'날짜' : '발생일'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pothole_done['발생일'] = pd.to_datetime(new_pothole_done['발생일'])


In [None]:
### Population: one value per district, joined on 자치구 only
people_merged = pd.merge(new_pothole_done, people, on = '자치구', how = 'left')
### Rainfall: joined on district and date
rain_merged = pd.merge(people_merged, rain_sum, on = ['자치구', '발생일'], how = 'left')
### Humidity
humid_merged = pd.merge(rain_merged, humid_mean, on = ['자치구', '발생일'], how = 'left')
### Temperature (two tables); left joins keep every record, unmatched dates give NaN
temp_merged1 = pd.merge(humid_merged, temp_total_diff, on = ['자치구', '발생일'], how = 'left')
new_pothole = pd.merge(temp_merged1, temp_monthly_diff, on = ['자치구', '발생일'], how = 'left')

### x 형식으로

In [None]:
new_pothole.columns

Index(['발생일', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭',
       '평균_건물연령', '배수등급', '경사도', '인구 수', '누적 강수량', '평균 습도', '1년 기온차',
       '평균 일교차'],
      dtype='object')

In [None]:
# Model features in the column order shown by `x.columns`. `.copy()` makes
# `new_x` an independent frame, so the in-place transforms below do not raise
# SettingWithCopyWarning.
new_x = new_pothole[['차선수', '승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차', '배수등급', '경사도']].copy()

In [None]:
### Box-Cox transform — weather features included
# Data the transformer is fitted on
df = pd.read_csv('/content/drive/MyDrive/DATA_완성/dataset.csv')
# Columns to transform
# NOTE(review): this list includes '총교통량' and '중대형차량 교통량', which the
# prediction cells do not pass to transform(); a transformer fitted on 11 columns
# cannot be applied to their 9-column array — confirm which pickle prediction uses.
cols = ['승용차', '버스', '트럭', '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차']
# Transformer and scaler (standardisation is done separately by StandardScaler)
transform = PowerTransformer(method = 'box-cox', standardize = False)
scaler = StandardScaler()
# Box-Cox followed by standardisation
# The 1e-6 offset is presumably there because Box-Cox needs strictly positive input.
arr = df[cols].values + 1e-6
bc = transform.fit_transform(arr)
bc_std = scaler.fit_transform(bc)
df[cols] = bc_std
# Persist the fitted transformer and scaler
joblib.dump(transform, '/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer_w_env.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler_w_env.pkl')

['/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler_w_env.pkl']

In [None]:
### Box-Cox transform — weather features excluded
# Data the transformer is fitted on
df = pd.read_csv('/content/drive/MyDrive/DATA_완성/dataset.csv')
# Columns to transform
# NOTE(review): this list includes '총교통량' and '중대형차량 교통량', but the
# weather-free prediction() transforms only 5 columns; a transformer fitted on
# these 7 cannot be applied to that array — confirm before using this pickle there.
cols = ['승용차', '버스', '트럭',  '총교통량', '중대형차량 교통량', '평균_건물연령', '인구 수']
# Transformer and scaler (standardisation is done separately by StandardScaler)
transform = PowerTransformer(method = 'box-cox', standardize = False)
scaler = StandardScaler()
# Box-Cox followed by standardisation
# The 1e-6 offset is presumably there because Box-Cox needs strictly positive input.
arr = df[cols].values + 1e-6
bc = transform.fit_transform(arr)
bc_std = scaler.fit_transform(bc)
df[cols] = bc_std
# Persist the fitted transformer and scaler
joblib.dump(transform, '/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer_wo_env.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler_wo_env.pkl')

['/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler_wo_env.pkl']

In [None]:
# Load the saved transformer and scaler
# NOTE(review): these file names carry no `_w_env` / `_wo_env` suffix, i.e. they are
# not the files written by the fitting cells in this notebook — confirm their origin.
transformer = joblib.load('/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer.pkl')
scaler = joblib.load('/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler.pkl')
# Transform the new data; column order must match the order used when fitting
cols = ['승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차']
# Same 1e-6 offset as applied when the transformer was fitted
arr = new_x[cols].values + 1e-6
bc = transformer.transform(arr)
bc_std = scaler.transform(bc)
new_x[cols] = bc_std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_x[cols] = bc_std


In [None]:
### Encode drainage class and slope as ordinal integers
# Slope: steeper range -> larger code
slope_encoding = {
    '0-2%': 0,
    '2-7%': 1,
    '7-15%': 2,
    '15-30%': 3,
    '30-60%': 4,
    '60-100%': 5}
# Drainage class: better drainage -> larger code
drain_encoding = {
    '매우양호': 5,
    '양호': 4,
    '약간양호': 3,
    '약간불량': 2,
    '불량': 1,
    '매우불량': 0}
# Apply the mappings; labels missing from a dict (and missing values) become NaN
new_x['경사도'] = new_x['경사도'].map(slope_encoding)
new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_x['경사도'] = new_x['경사도'].map(slope_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)


In [None]:
new_x

Unnamed: 0,차선수,승용차,버스,트럭,평균_건물연령,인구 수,누적 강수량,평균 습도,1년 기온차,평균 일교차,배수등급,경사도
0,4,0.473499,0.64883,0.218257,-0.703321,0.928563,0.508823,0.653368,-0.967038,-1.238875,4,0
1,4,-0.567495,0.683146,-0.267261,1.321718,-0.363322,0.750267,-1.451302,-0.010344,0.339432,5,3


In [None]:
### Load the model and predict
# NOTE(review): 'xgb_model.pkl' is not the file written by the save cell in this
# notebook ('xgb_model_wo_env.pkl'); `new_x` here includes the weather columns, so
# the loaded model must have been trained with them — confirm.
xgb_model = joblib.load('/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model.pkl')
y_pred = xgb_model.predict(new_x)
# Probability of the positive class (column 1 of predict_proba)
y_pred_prob = xgb_model.predict_proba(new_x)[:, 1]

In [None]:
y_pred

array([1, 1])

In [None]:
y_pred_prob

array([0.99994135, 0.9999671 ], dtype=float32)

### Modeling

기존 데이터로 모델을 학습하고 저장한다.

In [None]:
### Load the data
pothole = pd.read_pickle('/content/drive/MyDrive/DATA_완성/pothole_예측용.pickle')
# Binary target: 1 when at least one pothole was recorded, otherwise 0
pothole['발생여부'] = np.where(pothole['포트홀 갯수'] >=1, 1, 0)

### Assign x and y
# Drop identifiers, location columns, aggregate traffic columns and both target columns
x = pothole.drop(['자치구', '행정동', '경도', '위도', '등록번호', '발생일', '자치구_enc', '총교통량', '중대형차량 교통량', '포트홀 갯수', '발생여부'], axis=1)
x = x.drop(['누적 강수량', '평균 습도', '1년 기온차', '평균 일교차'], axis = 1) ########## weather features excluded
y = pothole['발생여부']
# Not the last expression of the cell, so this preview is not displayed
x.head(2)

### train-test split (70/30, stratified on the target)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y, random_state = 1384)

### SMOTE — oversample the training split only; the test split is left as is
smote = SMOTE(random_state = 1392)
x_train_up, y_train_up = smote.fit_resample(x_train, y_train)

In [None]:
### XGBoost: fit on the oversampled training split, evaluate on the test split
xgb_model = xgb.XGBClassifier(random_state = 1393)
xgb_model.fit(x_train_up, y_train_up)

# Hard labels and positive-class probabilities for the held-out data
y_pred = xgb_model.predict(x_test)
y_pred_prob = xgb_model.predict_proba(x_test)[:, 1]

# Confusion-matrix based metrics; FNR = missed positives / all actual positives
confusion = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = confusion.ravel()
fnr = fn / (fn + tp)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Performance report: one "<name> : <value>" line per metric
report = [
    ('accuracy : {0:.4f}', accuracy_score(y_test, y_pred)),
    ('roc_auc : {0:.4f}', roc_auc_score(y_test, y_pred_prob)),
    ('FNR : {0:.4F}', fnr),
    ('precision : {0:.4f}', precision),
    ('f1 : {0:.4f}', f1),
]
for template, score in report:
    print(template.format(score))

accuracy : 0.7664
roc_auc : 0.8421
FNR : 0.2366
precision : 0.7318
f1 : 0.7473


In [None]:
### Save the model
# This is the model trained above without the weather features (hence `_wo_env`).
joblib.dump(xgb_model, '/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model_wo_env.pkl')

['/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model_wo_env.pkl']

### 정리

In [None]:
def geo_coding(df):
    """Geocode addresses and derive the district (구) and neighbourhood (동) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an address column named '주소'. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame with '위도', '경도', '행정동' and '자치구' added. Addresses that
        are empty, missing, not strings, or contain no matching token get NaN
        in the corresponding column.

    Notes
    -----
    ``gk.add_coordinates_to_dataframe`` writes into the frame it is given, so
    the caller's ``df`` also gains 'decimalLatitude' / 'decimalLongitude'.
    """
    ### Latitude / longitude
    gk.add_coordinates_to_dataframe(df, '주소')
    df = df.rename(columns = {"decimalLatitude" : "위도", "decimalLongitude" : "경도"})

    ### District and neighbourhood
    def _first_token(pattern, address):
        # Only real, non-empty strings can be searched; anything else is missing.
        if not isinstance(address, str) or not address:
            return np.nan
        found = re.search(pattern, address)
        return found.group(1) if found else np.nan

    # Iterate over the values themselves: label-based `df.loc[i, ...]` with
    # i in range(len(df)) fails on any frame whose index is not 0..n-1.
    addresses = df['주소'].tolist()
    # Neighbourhood: first token ending in 동 or 가
    df["행정동"] = [_first_token(r"\b(\w+[동가])\b", address) for address in addresses]
    # District: first token ending in 구
    df["자치구"] = [_first_token(r"\b(\w+구)\b", address) for address in addresses]
    return df

In [None]:
def traffic(df):
    """Match each point to its nearest road link and attach that link's traffic volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '날짜', '주소', '위도', '경도', '행정동' and '자치구'.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the input columns listed above plus 'LINK_ID',
        '도로명', '차선수', '승용차', '버스' and '트럭'. Points whose link has no
        traffic record keep NaN in the traffic columns (left join).
    """
    # Road-link layer (shapefile attributes are EUC-KR encoded)
    gdf_links = gpd.read_file('/content/drive/MyDrive/DATA_원본/[2024-02-23]NODELINKDATA/MOCT_LINK.shp', encoding='euc-kr')
    gdf_links['LINK_ID'] = gdf_links['LINK_ID'].astype(int)

    # Turn the records into WGS84 point geometries
    gdf_potholes = gpd.GeoDataFrame(
        df,
        geometry = gpd.points_from_xy(df['경도'], df['위도']),
        crs="EPSG:4326")

    # Reproject the points into the link layer's CRS (skipped when the layer has none)
    if gdf_links.crs is not None:
        gdf_potholes = gdf_potholes.to_crs(gdf_links.crs)

    # Nearest road link per point; 'distance' is expressed in CRS units
    gdf_nearest = gpd.sjoin_nearest(
        gdf_potholes, gdf_links,
        how='left',
        distance_col='distance')

    ### Traffic-volume data
    traffic_d = pd.read_excel('/content/drive/MyDrive/DATA_원본/TrafficVolume(LINK).xlsx', header=[0, 1])

    # Flatten the two-row header: keep only the upper label when the lower one
    # is an auto-generated "Unnamed" placeholder, otherwise join them with "_".
    traffic_d.columns = [
        f"{upper}" if 'Unnamed' in str(lower) else f"{upper}_{lower}"
        for upper, lower in traffic_d.columns]
    traffic_d = traffic_d.rename(columns={
        'ITS LINK ID': 'ITS_LINK_ID',
        '승용차-평일_전일': '승용차',
        '버스-평일_전일': '버스',
        '트럭-평일_전일': '트럭'})

    # One spreadsheet row can list several comma-separated link IDs: one row per ID.
    traffic_d['ITS_LINK_ID'] = traffic_d['ITS_LINK_ID'].astype(str).str.split(',')
    traffic_d = traffic_d.explode('ITS_LINK_ID')
    # Parse leniently: blank / 'nan' / malformed tokens become NaN and are dropped
    # below. (Casting straight to int raised on them, making the dropna dead code.)
    traffic_d['LINK_ID'] = pd.to_numeric(traffic_d['ITS_LINK_ID'].str.strip(), errors='coerce')
    traffic_df = traffic_d[['LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()
    traffic_df = (traffic_df
                  .dropna(subset=['LINK_ID'])
                  .drop_duplicates('LINK_ID')   # keep the first record per link
                  .reset_index(drop=True)
                  .astype({'LINK_ID': 'int'}))

    # Attach traffic volumes to the matched links
    pothole_traffic = gdf_nearest.merge(traffic_df, on='LINK_ID', how='left')
    output_cols = ['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']

    # Return a copy so callers can add columns without writing into a slice.
    return pothole_traffic[output_cols].copy()

In [None]:
def nature(new_pothole_traffic):
    """Add average building age, drainage class and soil slope to each point.

    Parameters
    ----------
    new_pothole_traffic : pandas.DataFrame
        Output of ``traffic``: must contain '날짜', '주소', '위도', '경도', '자치구',
        '행정동', '도로명', '차선수', '승용차', '버스' and '트럭'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent frame with those columns plus '평균_건물연령', '배수등급'
        and '경사도'. Values stay NaN where a lookup or spatial join finds nothing.
    """
    ### Average building age
    old = pd.read_csv('/content/drive/MyDrive/DATA_원본/건축물연령정보_행정동_평균.csv')
    bup = pd.read_csv('/content/drive/MyDrive/DATA_원본/국토교통부_법정동코드_20240805.csv', encoding = 'cp949')
    bup['법정동명'] = bup['법정동명'].astype(str)

    # Build the legal-dong name used as the join key: "서울특별시 <구> <동>".
    # Work on a copy so the caller's frame is left untouched.
    # NOTE(review): the 동 token comes from the address text; administrative
    # names such as '당산1동' may be absent from the legal-dong table — confirm.
    points = new_pothole_traffic.copy()
    points['법정동명'] = ('서울특별시 ' + points['자치구'] + ' ' + points['행정동']).astype(str)

    # name -> legal-dong code -> average building age (left joins keep every point)
    new_pothole_building = points.merge(bup[['법정동코드', '법정동명']], on = '법정동명', how = 'left')
    new_pothole_building = new_pothole_building.merge(old[['법정동코드', '평균_건물연령']], on = '법정동코드', how = 'left')

    ### Drainage class
    # WGS84 points, reprojected to EPSG:5174 (the CRS the soil layers are expected to use)
    gdf_pothole2 = gpd.GeoDataFrame(
        new_pothole_building,
        geometry=gpd.points_from_xy(new_pothole_building['경도'], new_pothole_building['위도']),
        crs="EPSG:4326")
    gdf_pothole2 = gdf_pothole2.to_crs("EPSG:5174")

    soil_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILDRA_AREA/ASIT_SOILDRA_AREA.shp")
    # The layer is read with no CRS (sjoin reported "Right CRS: None"); declare
    # the CRS the code already assumes. set_crs labels the data without moving it.
    if soil_df.crs is None:
        soil_df = soil_df.set_crs("EPSG:5174")

    new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
    # Drop the bookkeeping column so the next sjoin can create its own.
    new_pothole_soil = new_pothole_soil.drop(columns=['index_right'])

    ### Soil slope
    slope_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILSLOPE_AREA/ASIT_SOILSLOPE_AREA.shp")
    if slope_df.crs is None:
        slope_df = slope_df.set_crs("EPSG:5174")

    new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')

    # Rename to the feature names the model expects
    new_pothole_slope = new_pothole_slope.rename(columns = {'SOILDRA' : '배수등급', 'SOILSLOPE' : '경사도'})
    output_cols = ['날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭', '평균_건물연령', '배수등급' ,'경사도']

    # Return a copy so callers can rename/assign without writing into a slice.
    return new_pothole_slope[output_cols].copy()

In [None]:
def join_gu(new_pothole_done):
    """Join district-level population and weather tables onto each record.

    Parameters
    ----------
    new_pothole_done : pandas.DataFrame
        Output of ``nature``: must contain '자치구' and the date column '날짜'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input with '날짜' renamed to '발생일' (parsed as datetime) plus the
        columns of the population table and the four weather tables. Left joins
        keep every record; unmatched keys give NaN.
    """
    ### Load the district-level tables
    # NOTE: read_pickle can execute arbitrary code on load — trusted files only.
    rain_sum = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/누적 강수량.pickle')
    humid_mean = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 습도.pickle')
    temp_total_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/1년 기온차.pickle')
    temp_monthly_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 일교차.pickle')
    people = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별 인구 수.pickle')
    # Strip ideographic spaces (U+3000) from district names so they match the join key.
    people['자치구'] = people['자치구'].str.replace("\u3000","",regex = False)

    # rename() returns a new frame, so the caller's data is left untouched.
    pothole = new_pothole_done.rename(columns = {'날짜' : '발생일'})
    pothole['발생일'] = pd.to_datetime(pothole['발생일'])

    ### Population: one value per district
    new_pothole = pd.merge(pothole, people, on = '자치구', how = 'left')
    ### Rainfall, humidity and the two temperature tables: keyed on district and date
    for weather in (rain_sum, humid_mean, temp_total_diff, temp_monthly_diff):
        new_pothole = pd.merge(new_pothole, weather, on = ['자치구', '발생일'], how = 'left')

    # Return the fully merged frame. Returning the population-only intermediate
    # dropped every weather column and made prediction() fail with a KeyError.
    return new_pothole

In [None]:
def prediction(new_pothole, model_path,
               transformer_path='/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer.pkl',
               scaler_path='/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler.pkl'):
    """Predict pothole occurrence with the weather-aware model.

    Parameters
    ----------
    new_pothole : pandas.DataFrame
        Must contain the 12 feature columns selected below. It is not modified.
    model_path : str
        Path of the saved classifier (loaded with joblib).
    transformer_path, scaler_path : str, optional
        Paths of the fitted Box-Cox transformer and standard scaler. They default
        to the locations this function previously had hard-coded.

    Returns
    -------
    (y_pred, y_pred_prob)
        Predicted labels and the probability of the positive class.

    Raises
    ------
    KeyError
        If a feature column is missing from ``new_pothole``.
    ValueError
        If the transformer was fitted on a different number of columns.
    """
    ## Select the features; .copy() so the in-place transforms below write into
    ## an independent frame rather than a slice of the input.
    feature_cols = ['차선수', '승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차', '배수등급', '경사도']
    new_x = new_pothole[feature_cols].copy()

    # Load the saved transformer and scaler
    transformer = joblib.load(transformer_path)
    scaler = joblib.load(scaler_path)

    # Columns to transform; order must match the order used when fitting
    cols = ['승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차']
    # Fail early, with a readable message, if the pickle was fitted on another column set.
    n_fitted = getattr(transformer, 'n_features_in_', None)
    if n_fitted is not None and n_fitted != len(cols):
        raise ValueError(
            f"Transformer at {transformer_path!r} was fitted on {n_fitted} columns, "
            f"but {len(cols)} are supplied here: {cols}")
    # Same 1e-6 offset as applied when the transformer was fitted
    arr = new_x[cols].values + 1e-6
    new_x[cols] = scaler.transform(transformer.transform(arr))

    ### Encode drainage class and slope as ordinal integers
    # Slope: steeper range -> larger code
    slope_encoding = {
        '0-2%': 0,
        '2-7%': 1,
        '7-15%': 2,
        '15-30%': 3,
        '30-60%': 4,
        '60-100%': 5}
    # Drainage class: better drainage -> larger code
    drain_encoding = {
        '매우양호': 5,
        '양호': 4,
        '약간양호': 3,
        '약간불량': 2,
        '불량': 1,
        '매우불량': 0}
    # Labels missing from a dict (and missing values) become NaN
    new_x['경사도'] = new_x['경사도'].map(slope_encoding)
    new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)

    ### Load the model and predict
    xgb_model = joblib.load(model_path)
    y_pred = xgb_model.predict(new_x)
    y_pred_prob = xgb_model.predict_proba(new_x)[:, 1]

    return y_pred, y_pred_prob

In [None]:
# Example input: several addresses arrive at once, each paired with a date.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the kernel session; consider a name such as `addresses`.
input = ["서울특별시 강남구 대치동 507",
         "서울특별시 동대문구 제기동 137-418",
         "서울특별시 서대문구 홍은동 9-360",
         "서울특별시 영등포구 당산1동 당산로 31길 25",
         "서울특별시 서초구 방배동 756-4",
         "서울특별시 용산구 동빙고동 90-1",
         "서울특별시 성북구 보문동5가 235",
         "서울특별시 노원구 공릉2동 29-4"]
date = ['2024-07-28', '2023-06-23', '2022-05-21', '2023-12-05', '2021-10-29', '2024-01-05', '2022-04-29', '2021-02-03']
# One row per (date, address) pair; the two lists must have the same length.
new_data = pd.DataFrame({'날짜' : date, '주소' : input})

In [None]:
# Step 1: add coordinates plus the 구 / 동 columns.
geo = geo_coding(new_data)

In [None]:
# Step 2: nearest road link and its traffic volumes.
# This rebinds the notebook-level name `traffic_df`, used earlier for the traffic-volume table.
traffic_df = traffic(geo)

In [None]:
# Step 3: building age, drainage class and soil slope.
nature_df = nature(traffic_df)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')


In [None]:
# Step 4: join the district-level tables.
new_potholes = join_gu(nature_df)

In [None]:
# Step 5: predict with the saved model.
# prediction() selects the four weather columns, so `new_potholes` must contain them;
# the saved output of this cell shows a KeyError raised when they were absent.
model_path = '/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model.pkl'
y_pred, y_pred_prob = prediction(new_potholes, model_path)

KeyError: "['누적 강수량', '평균 습도', '1년 기온차', '평균 일교차'] not in index"

In [None]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 0])

In [None]:
y_pred_prob

array([0.99994135, 0.9999671 , 0.99970335, 0.9999877 , 0.9990633 ,
       0.999943  , 0.9917114 , 0.01495602], dtype=float32)

### 최종최종

#### 환경 요인 제외

##### 함수들 정의

In [None]:
### ----------------------------------------------------------------------------------

def geo_coding(df):
    """Geocode addresses and derive the district (구) and neighbourhood (동) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an address column named '주소'. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame with '위도', '경도', '행정동' and '자치구' added. Addresses that
        are empty, missing, not strings, or contain no matching token get NaN
        in the corresponding column.

    Notes
    -----
    ``gk.add_coordinates_to_dataframe`` writes into the frame it is given, so
    the caller's ``df`` also gains 'decimalLatitude' / 'decimalLongitude'.
    """
    ### Latitude / longitude
    gk.add_coordinates_to_dataframe(df, '주소')
    df = df.rename(columns = {"decimalLatitude" : "위도", "decimalLongitude" : "경도"})

    ### District and neighbourhood
    def _first_token(pattern, address):
        # Only real, non-empty strings can be searched; anything else is missing.
        if not isinstance(address, str) or not address:
            return np.nan
        found = re.search(pattern, address)
        return found.group(1) if found else np.nan

    # Iterate over the values themselves: label-based `df.loc[i, ...]` with
    # i in range(len(df)) fails on any frame whose index is not 0..n-1.
    addresses = df['주소'].tolist()
    # Neighbourhood: first token ending in 동 or 가
    df["행정동"] = [_first_token(r"\b(\w+[동가])\b", address) for address in addresses]
    # District: first token ending in 구
    df["자치구"] = [_first_token(r"\b(\w+구)\b", address) for address in addresses]
    return df

### ----------------------------------------------------------------------------------

def traffic(df):
    """Match each point to its nearest road link and attach that link's traffic volumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '날짜', '주소', '위도', '경도', '행정동' and '자치구'.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the input columns listed above plus 'LINK_ID',
        '도로명', '차선수', '승용차', '버스' and '트럭'. Points whose link has no
        traffic record keep NaN in the traffic columns (left join).
    """
    # Road-link layer (shapefile attributes are EUC-KR encoded)
    gdf_links = gpd.read_file('/content/drive/MyDrive/DATA_원본/[2024-02-23]NODELINKDATA/MOCT_LINK.shp', encoding='euc-kr')
    gdf_links['LINK_ID'] = gdf_links['LINK_ID'].astype(int)

    # Turn the records into WGS84 point geometries
    gdf_potholes = gpd.GeoDataFrame(
        df,
        geometry = gpd.points_from_xy(df['경도'], df['위도']),
        crs="EPSG:4326")

    # Reproject the points into the link layer's CRS (skipped when the layer has none)
    if gdf_links.crs is not None:
        gdf_potholes = gdf_potholes.to_crs(gdf_links.crs)

    # Nearest road link per point; 'distance' is expressed in CRS units
    gdf_nearest = gpd.sjoin_nearest(
        gdf_potholes, gdf_links,
        how='left',
        distance_col='distance')

    ### Traffic-volume data
    traffic_d = pd.read_excel('/content/drive/MyDrive/DATA_원본/TrafficVolume(LINK).xlsx', header=[0, 1])

    # Flatten the two-row header: keep only the upper label when the lower one
    # is an auto-generated "Unnamed" placeholder, otherwise join them with "_".
    traffic_d.columns = [
        f"{upper}" if 'Unnamed' in str(lower) else f"{upper}_{lower}"
        for upper, lower in traffic_d.columns]
    traffic_d = traffic_d.rename(columns={
        'ITS LINK ID': 'ITS_LINK_ID',
        '승용차-평일_전일': '승용차',
        '버스-평일_전일': '버스',
        '트럭-평일_전일': '트럭'})

    # One spreadsheet row can list several comma-separated link IDs: one row per ID.
    traffic_d['ITS_LINK_ID'] = traffic_d['ITS_LINK_ID'].astype(str).str.split(',')
    traffic_d = traffic_d.explode('ITS_LINK_ID')
    # Parse leniently: blank / 'nan' / malformed tokens become NaN and are dropped
    # below. (Casting straight to int raised on them, making the dropna dead code.)
    traffic_d['LINK_ID'] = pd.to_numeric(traffic_d['ITS_LINK_ID'].str.strip(), errors='coerce')
    traffic_df = traffic_d[['LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()
    traffic_df = (traffic_df
                  .dropna(subset=['LINK_ID'])
                  .drop_duplicates('LINK_ID')   # keep the first record per link
                  .reset_index(drop=True)
                  .astype({'LINK_ID': 'int'}))

    # Attach traffic volumes to the matched links
    pothole_traffic = gdf_nearest.merge(traffic_df, on='LINK_ID', how='left')
    output_cols = ['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']

    # Return a copy so callers can add columns without writing into a slice.
    return pothole_traffic[output_cols].copy()

### ----------------------------------------------------------------------------------

def nature(new_pothole_traffic):
    """Add average building age, drainage class and soil slope to each point.

    Parameters
    ----------
    new_pothole_traffic : pandas.DataFrame
        Output of ``traffic``: must contain '날짜', '주소', '위도', '경도', '자치구',
        '행정동', '도로명', '차선수', '승용차', '버스' and '트럭'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent frame with those columns plus '평균_건물연령', '배수등급'
        and '경사도'. Values stay NaN where a lookup or spatial join finds nothing.
    """
    ### Average building age
    old = pd.read_csv('/content/drive/MyDrive/DATA_원본/건축물연령정보_행정동_평균.csv')
    bup = pd.read_csv('/content/drive/MyDrive/DATA_원본/국토교통부_법정동코드_20240805.csv', encoding = 'cp949')
    bup['법정동명'] = bup['법정동명'].astype(str)

    # Build the legal-dong name used as the join key: "서울특별시 <구> <동>".
    # Work on a copy so the caller's frame is left untouched.
    # NOTE(review): the 동 token comes from the address text; administrative
    # names such as '당산1동' may be absent from the legal-dong table — confirm.
    points = new_pothole_traffic.copy()
    points['법정동명'] = ('서울특별시 ' + points['자치구'] + ' ' + points['행정동']).astype(str)

    # name -> legal-dong code -> average building age (left joins keep every point)
    new_pothole_building = points.merge(bup[['법정동코드', '법정동명']], on = '법정동명', how = 'left')
    new_pothole_building = new_pothole_building.merge(old[['법정동코드', '평균_건물연령']], on = '법정동코드', how = 'left')

    ### Drainage class
    # WGS84 points, reprojected to EPSG:5174 (the CRS the soil layers are expected to use)
    gdf_pothole2 = gpd.GeoDataFrame(
        new_pothole_building,
        geometry=gpd.points_from_xy(new_pothole_building['경도'], new_pothole_building['위도']),
        crs="EPSG:4326")
    gdf_pothole2 = gdf_pothole2.to_crs("EPSG:5174")

    soil_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILDRA_AREA/ASIT_SOILDRA_AREA.shp")
    # The layer is read with no CRS (sjoin reported "Right CRS: None"); declare
    # the CRS the code already assumes. set_crs labels the data without moving it.
    if soil_df.crs is None:
        soil_df = soil_df.set_crs("EPSG:5174")

    new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
    # Drop the bookkeeping column so the next sjoin can create its own.
    new_pothole_soil = new_pothole_soil.drop(columns=['index_right'])

    ### Soil slope
    slope_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILSLOPE_AREA/ASIT_SOILSLOPE_AREA.shp")
    if slope_df.crs is None:
        slope_df = slope_df.set_crs("EPSG:5174")

    new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')

    # Rename to the feature names the model expects
    new_pothole_slope = new_pothole_slope.rename(columns = {'SOILDRA' : '배수등급', 'SOILSLOPE' : '경사도'})
    output_cols = ['날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭', '평균_건물연령', '배수등급' ,'경사도']

    # Return a copy so callers can rename/assign without writing into a slice.
    return new_pothole_slope[output_cols].copy()

### ----------------------------------------------------------------------------------

def join_gu(new_pothole_done):
    """Join the district-level population table onto each record (no weather data).

    Parameters
    ----------
    new_pothole_done : pandas.DataFrame
        Output of ``nature``: must contain '자치구' and the date column '날짜'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input with '날짜' renamed to '발생일' (parsed as datetime) plus the
        columns of the population table. Districts without a match give NaN.
    """
    ### Load the district-level table
    # NOTE: read_pickle can execute arbitrary code on load — trusted files only.
    people = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별 인구 수.pickle')
    # Strip ideographic spaces (U+3000) from district names so they match the join key.
    people['자치구'] = people['자치구'].str.replace("\u3000","",regex = False)

    # rename() returns a new frame: the caller's data is not renamed in place
    # and nothing is written into a slice.
    pothole = new_pothole_done.rename(columns = {'날짜' : '발생일'})
    pothole['발생일'] = pd.to_datetime(pothole['발생일'])

    ### Population
    new_pothole = pd.merge(pothole, people, on = '자치구', how = 'left')

    return new_pothole

### ----------------------------------------------------------------------------------

def prediction(new_pothole, transformer_path, scaler_path, model_path):
    """Predict pothole occurrence from the features that exclude climate data.

    Parameters
    ----------
    new_pothole : pandas.DataFrame
        Must contain '차선수', '승용차', '버스', '트럭', '평균_건물연령',
        '인구 수', '배수등급' and '경사도'. The frame is not modified.
    transformer_path, scaler_path, model_path : str
        Paths handed to ``joblib.load`` for the fitted transformer, the fitted
        scaler and the classifier.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_prob)``: the result of ``predict`` and column 1 of
        ``predict_proba`` for every input row.
    """
    ## Feature matrix
    # Explicit copy: the column assignments below then act on an independent
    # frame instead of a slice of `new_pothole` (no SettingWithCopyWarning).
    new_x = new_pothole[['차선수', '승용차', '버스', '트럭', '평균_건물연령', '인구 수', '배수등급', '경사도']].copy()

    # Load the saved transformer and scaler
    # NOTE(review): joblib.load unpickles - only load trusted files.
    transformer = joblib.load(transformer_path)
    scaler = joblib.load(scaler_path)

    # Transform the numeric columns
    cols = ['승용차', '버스', '트럭', '평균_건물연령', '인구 수']
    # presumably the 1e-6 offset keeps values strictly positive for the
    # transformer - TODO confirm it matches the training-time preprocessing
    arr = new_x[cols].values + 1e-6
    new_x[cols] = scaler.transform(transformer.transform(arr))

    ### Ordinal encoding of slope and drainage class
    # Slope
    slope_encoding = {
        '0-2%': 0,
        '2-7%': 1,
        '7-15%': 2,
        '15-30%': 3,
        '30-60%': 4,
        '60-100%': 5}
    # Drainage class
    drain_encoding = {
        '매우양호': 5,
        '양호': 4,
        '약간양호': 3,
        '약간불량': 2,
        '불량': 1,
        '매우불량': 0}
    # Labels missing from the dictionaries (including NaN) map to NaN.
    new_x['경사도'] = new_x['경사도'].map(slope_encoding)
    new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)

    ### Load the model and predict
    xgb_model = joblib.load(model_path)
    y_pred = xgb_model.predict(new_x)
    y_pred_prob = xgb_model.predict_proba(new_x)[:, 1]

    return y_pred, y_pred_prob

##### 예측

In [None]:
def prediction_without_env(new_data, transformer_path, scaler_path, model_path):
    """Run the whole no-climate pipeline: build features, then predict.

    Parameters
    ----------
    new_data : pandas.DataFrame
        One row per location with the '날짜' (date) and '주소' (address)
        columns. The frame is not modified.
    transformer_path, scaler_path, model_path : str
        Forwarded unchanged to ``prediction``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``new_data`` with the added columns '예측' and '예측 확률'.

    Raises
    ------
    ValueError
        If the feature pipeline returns a different number of rows than were
        passed in, so predictions cannot be aligned with the input rows.
    """
    ### Frame returned to the caller
    output_df = new_data.copy()

    ### Build the features
    # The pipeline receives its own copy because its steps write columns into
    # the frame they are given; the caller's `new_data` must stay as passed in.
    geo = geo_coding(new_data.copy())
    traffic_df = traffic(geo)
    nature_df = nature(traffic_df)
    new_potholes = join_gu(nature_df)

    ### Predict
    y_pred, y_pred_prob = prediction(new_potholes, transformer_path, scaler_path, model_path)

    # Predictions are attached by position, so the row count must be preserved.
    if len(y_pred) != len(output_df):
        raise ValueError(
            f"feature pipeline returned {len(y_pred)} rows for {len(output_df)} input rows; "
            "a join duplicated or dropped rows")

    ### Output
    output_df['예측'] = y_pred
    output_df['예측 확률'] = y_pred_prob
    return output_df

In [None]:
### Saved artefacts: model, Box-Cox transformer, scaler (no-climate variant)
model_path = '/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model_wo_env.pkl'
transformer_path = '/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer_wo_env.pkl'
scaler_path = '/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler_wo_env.pkl'

### Input - caution: every address must explicitly contain its dong (행정동)
input = [
    "서울특별시 강남구 대치동 507",
    "서울특별시 동대문구 제기동 137-418",
    "서울특별시 서대문구 홍은동 9-360",
    "서울특별시 영등포구 당산1동 당산로 31길 25",
    "서울특별시 서초구 방배동 756-4",
    "서울특별시 용산구 동빙고동 90-1",
    "서울특별시 성북구 보문동5가 235",
    "서울특별시 노원구 공릉2동 29-4",
]
date = [
    '2024-07-28', '2023-06-23', '2022-05-21', '2023-12-05',
    '2021-10-29', '2024-01-05', '2022-04-29', '2021-02-03',
]
new_data = pd.DataFrame({'날짜': date, '주소': input})

### Prediction - without the climate features
output_df = prediction_without_env(
    new_data,
    transformer_path=transformer_path,
    scaler_path=scaler_path,
    model_path=model_path,
)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:5174
Right CRS: None

  new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_x[cols] = bc_std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

In [None]:
# Display the result: the input rows plus '예측' (predicted label) and
# '예측 확률' (column 1 of the model's predict_proba output).
output_df

Unnamed: 0,날짜,주소,예측,예측 확률
0,2024-07-28,서울특별시 강남구 대치동 507,1,0.575764
1,2023-06-23,서울특별시 동대문구 제기동 137-418,1,0.898051
2,2022-05-21,서울특별시 서대문구 홍은동 9-360,1,0.841361
3,2023-12-05,서울특별시 영등포구 당산1동 당산로 31길 25,1,0.749974
4,2021-10-29,서울특별시 서초구 방배동 756-4,0,0.351927
5,2024-01-05,서울특별시 용산구 동빙고동 90-1,1,0.649658
6,2022-04-29,서울특별시 성북구 보문동5가 235,0,0.459055
7,2021-02-03,서울특별시 노원구 공릉2동 29-4,0,0.167979


#### 환경 요인 포함

##### 함수들 정의

In [None]:
### ----------------------------------------------------------------------------------

def geo_coding(df):
    """Geocode the addresses and extract the district (구) and dong (동/가) names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an address column '주소'. Any index is accepted and the
        caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the geokakao coordinate columns renamed to '위도'
        and '경도', plus '행정동' and '자치구' parsed from the address text
        (NaN when the address is missing, not a string, or has no match).
    """
    ### Latitude / longitude
    # geokakao writes its columns into the frame it receives, so hand it a copy.
    df = df.copy()
    gk.add_coordinates_to_dataframe(df, '주소')
    df = df.rename(columns = {"decimalLatitude" : "위도", "decimalLongitude" : "경도"})

    ### District and dong
    # dong: first whole word ending in 동 or 가
    dong_pattern = re.compile(r"\b(\w+[동가])\b")
    # district: first whole word ending in 구
    gu_pattern = re.compile(r"\b(\w+구)\b")

    def first_match(pattern, address):
        # Missing or non-text addresses yield NaN explicitly rather than
        # through a swallowed exception.
        if not isinstance(address, str):
            return np.nan
        found = pattern.search(address)
        return found.group(1) if found else np.nan

    # Iterate over the values (not index labels) so non-default indexes work.
    addresses = list(df['주소'])
    df["행정동"] = [first_match(dong_pattern, address) for address in addresses]
    df["자치구"] = [first_match(gu_pattern, address) for address in addresses]
    return df

### ----------------------------------------------------------------------------------

def traffic(df):
    """Attach the nearest road link and its traffic volume to every location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '날짜', '주소', '위도', '경도', '행정동' and '자치구'.

    Returns
    -------
    pandas.DataFrame
        Independent frame with the columns '날짜', '주소', '위도', '경도',
        '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭'.
        Traffic columns are NaN when the nearest link has no traffic record.
    """
    # Road link layer
    gdf_links = gpd.read_file('/content/drive/MyDrive/DATA_원본/[2024-02-23]NODELINKDATA/MOCT_LINK.shp', encoding='euc-kr')
    gdf_links['LINK_ID'] = gdf_links['LINK_ID'].astype(int)

    # Turn the input into point geometries (lon/lat, WGS84)
    gdf_potholes = gpd.GeoDataFrame(
        df,
        geometry = gpd.points_from_xy(df['경도'], df['위도']),
        crs="EPSG:4326")

    # Bring the points into the link layer's CRS.
    # NOTE(review): when the link layer has no CRS the points stay in EPSG:4326
    # and the nearest-link search below is not meaningful - confirm the .prj exists.
    if gdf_links.crs is not None:
        gdf_potholes = gdf_potholes.to_crs(gdf_links.crs)

    # Match every point with its nearest link (and the link attributes);
    # the 'distance' column is expressed in the units of the link layer's CRS.
    gdf_nearest = gpd.sjoin_nearest(
        gdf_potholes, gdf_links,
        how='left',
        distance_col='distance')

    # sjoin_nearest emits one row per equidistant neighbour. Keep the first match
    # so each input row appears once; later steps attach results by position.
    if df.index.is_unique:
        gdf_nearest = gdf_nearest[~gdf_nearest.index.duplicated(keep='first')]

    ### Traffic volume table
    traffic_d = pd.read_excel('/content/drive/MyDrive/DATA_원본/TrafficVolume(LINK).xlsx', header=[0, 1])

    # Flatten the two header rows into single column names
    traffic_d.columns = [
        f"{upper}" if 'Unnamed' in str(lower) else f"{upper}_{lower}"
        for upper, lower in traffic_d.columns]
    traffic_d = traffic_d.rename(columns={
        'ITS LINK ID': 'ITS_LINK_ID',
        '승용차-평일_전일': '승용차',
        '버스-평일_전일': '버스',
        '트럭-평일_전일': '트럭'})

    # One row per link id: a cell may hold several comma-separated ids.
    traffic_d['ITS_LINK_ID'] = traffic_d['ITS_LINK_ID'].astype(str).str.split(',')
    traffic_d = traffic_d.explode('ITS_LINK_ID')
    # Blank cells arrive here as the text 'nan'; coerce anything non-numeric to
    # NaN so it is dropped below instead of crashing the integer conversion.
    traffic_d['LINK_ID'] = pd.to_numeric(traffic_d['ITS_LINK_ID'].str.strip(), errors='coerce')
    traffic_df = traffic_d[['LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()
    traffic_df = (traffic_df.dropna(subset=['LINK_ID']).drop_duplicates('LINK_ID').reset_index(drop=True).astype({'LINK_ID': 'int'}))

    # Join the traffic volume onto the matched links
    pothole_traffic = gdf_nearest.merge(traffic_df, on='LINK_ID', how='left')
    # .copy() so callers can add columns without SettingWithCopyWarning
    pothole_output = pothole_traffic[['날짜', '주소', '위도', '경도', '행정동', '자치구', 'LINK_ID', '도로명', '차선수', '승용차', '버스', '트럭']].copy()

    return pothole_output

### ----------------------------------------------------------------------------------

def nature(new_pothole_traffic):
    """Add mean building age, soil drainage class and soil slope to each location.

    Parameters
    ----------
    new_pothole_traffic : pandas.DataFrame
        Must contain '날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명',
        '차선수', '승용차', '버스' and '트럭'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Independent frame with the columns '날짜', '주소', '위도', '경도',
        '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭',
        '평균_건물연령', '배수등급' and '경사도'.
    """
    ### Mean building age
    old = pd.read_csv('/content/drive/MyDrive/DATA_원본/건축물연령정보_행정동_평균.csv')
    bup = pd.read_csv('/content/drive/MyDrive/DATA_원본/국토교통부_법정동코드_20240805.csv', encoding = 'cp949')
    bup['법정동명'] = bup['법정동명'].astype(str)

    # Build the legal-dong name used as join key; assign() leaves the input untouched.
    legal_dong = ('서울특별시 ' + new_pothole_traffic['자치구'] + ' ' + new_pothole_traffic['행정동']).astype(str)
    pothole = new_pothole_traffic.assign(**{'법정동명': legal_dong})

    # name -> legal-dong code -> mean building age
    new_pothole_building = pothole.merge(bup[['법정동코드', '법정동명']], on = '법정동명', how = 'left')
    new_pothole_building = new_pothole_building.merge(old[['법정동코드', '평균_건물연령']], on = '법정동코드', how = 'left')

    ### Drainage class
    # Turn the locations into point geometries (lon/lat, WGS84)
    gdf_pothole2 = gpd.GeoDataFrame(
        new_pothole_building,
        geometry=gpd.points_from_xy(new_pothole_building['경도'], new_pothole_building['위도']),
        crs="EPSG:4326")

    # Reproject the points to EPSG:5174 to match the soil shapefiles
    gdf_pothole2 = gdf_pothole2.to_crs("EPSG:5174")

    # Load the drainage layer
    soil_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILDRA_AREA/ASIT_SOILDRA_AREA.shp")
    # The layer loads without CRS metadata, which makes sjoin warn about a CRS
    # mismatch. Declaring the CRS does not move any coordinates.
    # NOTE(review): EPSG:5174 is taken from the original author's comment - confirm.
    if soil_df.crs is None:
        soil_df = soil_df.set_crs("EPSG:5174")

    # Spatial join: polygon that contains each point
    new_pothole_soil = gpd.sjoin(gdf_pothole2, soil_df[['SOILDRA', 'geometry']], how='left', predicate='within')
    # 'index_right' must go before the next sjoin, which adds its own.
    new_pothole_soil = new_pothole_soil.drop(columns=['index_right'])

    ### Soil slope
    slope_df = gpd.read_file("/content/drive/MyDrive/DATA_원본/ASIT_SOILSLOPE_AREA/ASIT_SOILSLOPE_AREA.shp")
    if slope_df.crs is None:
        slope_df = slope_df.set_crs("EPSG:5174")

    new_pothole_slope = gpd.sjoin(new_pothole_soil, slope_df[['SOILSLOPE', 'geometry']], how='left', predicate='within')

    # Rename to the feature names used by the model
    new_pothole_slope = new_pothole_slope.rename(columns = {'SOILDRA' : '배수등급', 'SOILSLOPE' : '경사도'})
    # .copy() so callers can rename/add columns without SettingWithCopyWarning
    new_pothole_done = new_pothole_slope[['날짜', '주소', '위도', '경도', '자치구', '행정동', '도로명', '차선수', '승용차', '버스', '트럭', '평균_건물연령', '배수등급' ,'경사도']].copy()

    return new_pothole_done

### ----------------------------------------------------------------------------------

def join_gu(new_pothole_done):
    """Attach district-level population and climate tables to the feature frame.

    Parameters
    ----------
    new_pothole_done : pandas.DataFrame
        Feature table that holds at least the '날짜' and '자치구' columns.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which '날짜' is renamed to '발생일' (converted with
        ``pd.to_datetime``), the population table is left-joined on '자치구',
        and the four climate tables are left-joined on ('자치구', '발생일').
    """
    ### Load the per-district tables
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    rain_sum = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/누적 강수량.pickle')
    humid_mean = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 습도.pickle')
    temp_total_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/1년 기온차.pickle')
    temp_monthly_diff = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별/평균 일교차.pickle')
    people = pd.read_pickle('/content/drive/MyDrive/DATA_전처리/자치구별 인구 수.pickle')
    # Strip ideographic spaces (U+3000) so the district names can match the join key.
    people['자치구'] = people['자치구'].str.replace("\u3000","",regex = False)

    # rename() returns a new frame, so the argument (often a column slice of a
    # larger frame) is neither mutated nor a source of SettingWithCopyWarning.
    new_pothole = new_pothole_done.rename(columns = {'날짜' : '발생일'})
    new_pothole['발생일'] = pd.to_datetime(new_pothole['발생일'])

    ### Population (per district)
    new_pothole = pd.merge(new_pothole, people, on = '자치구', how = 'left')

    ### Rainfall, humidity, yearly temperature range, mean daily temperature range
    # (per district and date, joined in this order)
    for climate_table in (rain_sum, humid_mean, temp_total_diff, temp_monthly_diff):
        new_pothole = pd.merge(new_pothole, climate_table, on = ['자치구', '발생일'], how = 'left')

    return new_pothole

### ----------------------------------------------------------------------------------

def prediction(new_pothole, model_path,
               transformer_path='/content/drive/MyDrive/준희/Modeling/Prediction/BoxCoxTransformer.pkl',
               scaler_path='/content/drive/MyDrive/준희/Modeling/Prediction/StandardScaler.pkl'):
    """Predict pothole occurrence from the features that include climate data.

    NOTE(review): this notebook defines another ``prediction`` earlier with the
    positional order (new_pothole, transformer_path, scaler_path, model_path);
    whichever cell ran last wins, so the two pipelines cannot share a kernel
    state safely.

    Parameters
    ----------
    new_pothole : pandas.DataFrame
        Must contain '차선수', '승용차', '버스', '트럭', '평균_건물연령',
        '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차',
        '배수등급' and '경사도'. The frame is not modified.
    model_path : str
        Path handed to ``joblib.load`` for the classifier.
    transformer_path, scaler_path : str, optional
        Paths handed to ``joblib.load`` for the fitted transformer and scaler.
        They default to the locations that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_prob)``: the result of ``predict`` and column 1 of
        ``predict_proba`` for every input row.
    """
    ## Feature matrix
    # Explicit copy: the column assignments below then act on an independent
    # frame instead of a slice of `new_pothole` (no SettingWithCopyWarning).
    new_x = new_pothole[['차선수', '승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차', '배수등급', '경사도']].copy()

    # Load the saved transformer and scaler
    # NOTE(review): joblib.load unpickles - only load trusted files.
    transformer = joblib.load(transformer_path)
    scaler = joblib.load(scaler_path)

    # Transform the numeric columns
    cols = ['승용차', '버스', '트럭', '평균_건물연령', '인구 수', '누적 강수량', '평균 습도', '1년 기온차', '평균 일교차']
    # presumably the 1e-6 offset keeps values strictly positive for the
    # transformer - TODO confirm it matches the training-time preprocessing
    arr = new_x[cols].values + 1e-6
    new_x[cols] = scaler.transform(transformer.transform(arr))

    ### Ordinal encoding of slope and drainage class
    # Slope
    slope_encoding = {
        '0-2%': 0,
        '2-7%': 1,
        '7-15%': 2,
        '15-30%': 3,
        '30-60%': 4,
        '60-100%': 5}
    # Drainage class
    drain_encoding = {
        '매우양호': 5,
        '양호': 4,
        '약간양호': 3,
        '약간불량': 2,
        '불량': 1,
        '매우불량': 0}
    # Labels missing from the dictionaries (including NaN) map to NaN.
    new_x['경사도'] = new_x['경사도'].map(slope_encoding)
    new_x['배수등급'] = new_x['배수등급'].map(drain_encoding)

    ### Load the model and predict
    xgb_model = joblib.load(model_path)
    y_pred = xgb_model.predict(new_x)
    y_pred_prob = xgb_model.predict_proba(new_x)[:, 1]

    return y_pred, y_pred_prob

##### 예측

In [None]:
def prediction_with_env(new_data, model):
    """Run the whole climate-aware pipeline: build features, then predict.

    Parameters
    ----------
    new_data : pandas.DataFrame
        One row per location with the '날짜' (date) and '주소' (address)
        columns. The frame is not modified.
    model : str
        Path of the saved classifier; forwarded to ``prediction`` as its
        model path.

    Returns
    -------
    pandas.DataFrame
        Copy of ``new_data`` with the added columns '예측' and '예측 확률'.

    Raises
    ------
    ValueError
        If the feature pipeline returns a different number of rows than were
        passed in, so predictions cannot be aligned with the input rows.
    """
    ### Frame returned to the caller
    output_df = new_data.copy()

    ### Build the features
    # The pipeline receives its own copy because its steps write columns into
    # the frame they are given; the caller's `new_data` must stay as passed in.
    geo = geo_coding(new_data.copy())
    traffic_df = traffic(geo)
    nature_df = nature(traffic_df)
    new_potholes = join_gu(nature_df)

    ### Predict
    # The model path has to be forwarded: prediction() requires it.
    y_pred, y_pred_prob = prediction(new_potholes, model)

    # Predictions are attached by position, so the row count must be preserved.
    if len(y_pred) != len(output_df):
        raise ValueError(
            f"feature pipeline returned {len(y_pred)} rows for {len(output_df)} input rows; "
            "a join duplicated or dropped rows")

    ### Output
    output_df['예측'] = y_pred
    output_df['예측 확률'] = y_pred_prob
    return output_df

In [None]:
### Input
input = ["서울특별시 강남구 대치동 507",
         "서울특별시 동대문구 제기동 137-418",
         "서울특별시 서대문구 홍은동 9-360",
         "서울특별시 영등포구 당산1동 당산로 31길 25",
         "서울특별시 서초구 방배동 756-4",
         "서울특별시 용산구 동빙고동 90-1",
         "서울특별시 성북구 보문동5가 235",
         "서울특별시 노원구 공릉2동 29-4"]
date = ['2024-07-28', '2023-06-23', '2022-05-21', '2023-12-05', '2021-10-29', '2024-01-05', '2022-04-29', '2021-02-03']
new_data = pd.DataFrame({'날짜' : date, '주소' : input})
### Model path (climate-aware model)
model_path = '/content/drive/MyDrive/준희/Modeling/Prediction/xgb_model.pkl'
### Prediction - with the climate features
# This section must call the climate-aware entry point; prediction_without_env
# takes three artefact paths and would fail with a single model path.
output_df = prediction_with_env(new_data, model_path)