In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns

In [2]:
from utils.Parallelize_DataFrame import *

In [3]:
from datetime import datetime, timedelta

### 마이비 카드 데이터

In [4]:
%%time
mybicard = pd.read_parquet('/home/seho/Passenger_Demand/data/mybicard.parquet', engine='pyarrow')

CPU times: user 27.4 s, sys: 8.71 s, total: 36.1 s
Wall time: 19.3 s


In [5]:
mybicard.shape

(36261767, 15)

In [6]:
mybicard

Unnamed: 0,collectdate,seq,route_nm,transdate,sumamount,stop_nm,stop_id,normalcnt,studentcnt,studentamount,childcnt,childamount,transflag,mybicardnumber,base_ymd
0,20201003,26934,482-2,20201001 22:20:05,0,공업탑,3101347,1,0,0,0,0,환승,D900562094954,20201001
1,20201003,77969,482-2,20201001 22:15:04,1250,옥동행정복지센터.울산대공원정문,3102806,1,0,0,0,0,비환승,D900545114415,20201001
2,20201003,100264,104,20201001 22:44:42,1250,울산대후문,3100479,1,0,0,0,0,비환승,D100520894142,20201001
3,20201004,100508,701,20201001 18:20:02,0,병영사거리,3101274,1,0,0,0,0,환승,D900554620077,20201001
4,20201004,59746,104,20201001 11:11:21,820,남목1동,3101482,0,1,820,0,0,비환승,D900525646981,20201001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169550,20201120,38071,1127,20201121 00:14:38,2080,?????,3101460,1,0,0,0,0,???,D900644987520,20201121
169551,20201120,4216,1137,20201121 00:18:48,1700,??,3101456,0,1,1700,0,0,???,D900590544747,20201121
169552,20201120,181900,106,20201121 00:06:19,1250,??????,3101242,1,0,0,0,0,???,D900613753769,20201121
169553,20201120,240930,1127,20201121 00:12:30,2080,?????,3101461,1,0,0,0,0,???,D900614952385,20201121


In [7]:
# %%time
# mybicard = mybicard.drop_duplicates()

In [8]:
# 수집일자 데이트 포맷으로 변환
mybicard["collectdate"] = pd.to_datetime(mybicard["collectdate"], format = "%Y%m%d")

In [9]:
# 전송일자 데이트 포맷으로 변환
mybicard["transdate"] = pd.to_datetime(mybicard["transdate"], format = "%Y%m%d %H:%M:%S")

In [10]:
# 전체 승객 수 변수 생성(일반 + 학생 + 아동)
mybicard["totalcnt"] = mybicard[["normalcnt", "studentcnt", "childcnt"]].sum(axis = 1)

In [11]:
# route_nm contains whitespace characters; strip them out.
# A raw string is used so that "\s" is passed to the regex engine verbatim
# (a non-raw "\s" is an invalid escape sequence and warns on recent Python versions).
mybicard["route_nm"] = mybicard["route_nm"].replace(r"\s", "", regex = True)

In [12]:
# ;mybicard = mybicard.sort_values(["transdate", "seq"]).reset_index(drop=True)

In [13]:
mybicard = mybicard.rename(columns = {"stop_id" : "mybi_stop_id"})

### 401번 버스

In [14]:
mybicard_401 = mybicard.loc[(mybicard["route_nm"] == "401")].reset_index()

In [16]:
mybicard_401.shape

(1964401, 17)

In [18]:
mybicard_401_agg = (mybicard_401.loc[mybicard_401["transflag"] != "하차"]
                                   .groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='60Min')])
                                   .agg(normalcnt = ("normalcnt",sum), 
                                        studentcnt = ("studentcnt", sum), 
                                        childcnt = ("childcnt", sum),
                                        totalcnt = ("totalcnt", sum))
                                   .reset_index())

In [19]:
mybicard_401_agg.shape

(314862, 6)

In [14]:
mybicard_401_agg.to_parquet("/home/seho/Passenger_Demand/data/mybicard_401_agg.parquet")


### 결측치 
하루의 수집 데이터의 수가 0인 날짜의 데이터

In [15]:
count_by_date = mybicard_401_agg.groupby([pd.Grouper(key="transdate", freq="1D")]).size().reset_index(name = "cnt")

In [16]:
missing_date = count_by_date.loc[count_by_date["cnt"] == 0, "transdate"]

#### n주 전 같은 요일 같은 시간대의 인원 수로 Impute

In [17]:
def impute_recent_data(data, missing_date, date_col = "transdate", max_weeks = 52):
    """Fill every fully-missing day with a copy of the most recent same-weekday data.

    For each timestamp in ``missing_date`` the function steps back one week at a
    time until it finds a calendar day that has rows in ``data``. Those rows are
    copied, their timestamps are moved forward by the same number of weeks, and
    the copies are appended to the result. Rows imputed for an earlier missing
    day can serve as the donor for a later one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime column named ``date_col``.
    missing_date : iterable of pandas.Timestamp
        Days for which no rows exist in ``data``.
    date_col : str, default "transdate"
        Name of the datetime column.
    max_weeks : int, default 52
        Maximum number of weeks to look back for a donor day. Bounds the search,
        which previously had no upper limit.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the imputed rows appended. Index labels of the
        appended rows are those of their donor rows (the index is not reset).

    Raises
    ------
    ValueError
        If no donor day is found within ``max_weeks`` weeks before a missing day.
    """
    data = data.copy()
    for x in missing_date:
        donor = None
        weeks_back = 0
        for w in range(1, max_weeks + 1):
            donor_day = (x - timedelta(weeks = w)).date()
            candidate = data.loc[data[date_col].dt.date == donor_day]
            if len(candidate) > 0:
                donor = candidate.copy()
                weeks_back = w
                break

        if donor is None:
            raise ValueError(f"no data found within {max_weeks} weeks before {x}")

        # Move the donor rows forward onto the missing day.
        donor[date_col] = donor[date_col] + timedelta(weeks = weeks_back)
        # `axis` must be passed by keyword: it is keyword-only since pandas 2.0.
        data = pd.concat([data, donor], axis = 0)

    return data

#### 최근 n주의 같은 요일 같은 시간대의 평균값으로 Impute

In [18]:
def impute_recent_mean_data(data, missing_date, date_col, max_extra_weeks = 52):
    """Fill every fully-missing day with recent same-weekday, same-hour averages.

    For each timestamp in ``missing_date`` the rows of the same weekday within
    the preceding 4 weeks are collected; if none exist the window is widened one
    week at a time. Per stop and hour the mean of each passenger-count column is
    computed, rounded to an integer, stamped with the missing day plus the hour,
    and appended to the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date_col`` (datetime), ``mybi_stop_id`` and the count
        columns ``totalcnt``, ``normalcnt``, ``studentcnt``, ``childcnt``.
    missing_date : iterable of pandas.Timestamp
        Days for which no rows exist in ``data``.
    date_col : str
        Name of the datetime column (previously ignored in favour of a
        hard-coded "transdate").
    max_extra_weeks : int, default 52
        How many times the 4-week window may be widened by one week before
        giving up. Bounds the search, which previously had no upper limit.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with helper columns ``dayofweek`` (Korean weekday
        label) and ``hour`` added and the imputed rows appended.

    Raises
    ------
    ValueError
        If no same-weekday data exists inside the widest window.
    """
    count_cols = ["totalcnt", "normalcnt", "studentcnt", "childcnt"]
    dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}

    data = data.copy()

    # Add weekday label and hour of day.
    data["dayofweek"] = data[date_col].dt.dayofweek.map(dow_dict)
    data["hour"] = data[date_col].dt.hour

    for x in missing_date:
        # Midnight of the missing day; hours are added to it below.
        base_date = x.normalize()

        # Recomputed per missing day because `data` grows inside the loop.
        day = data[date_col].dt.normalize()
        same_dow = data[date_col].dt.dayofweek == x.day_of_week

        # Search the previous 4 weeks first; widen by one week while nothing is found.
        temp = None
        for w in range(max_extra_weeks + 1):
            window_start = base_date - pd.Timedelta(weeks = 4 + w)
            candidate = data.loc[day.between(window_start, base_date) & same_dow]
            if len(candidate) > 0:
                temp = candidate
                break

        if temp is None:
            raise ValueError(f"no same-weekday data found within {4 + max_extra_weeks} weeks before {x}")

        # Mean per stop / weekday / hour over the selected window.
        temp2 = temp.groupby(["mybi_stop_id", "dayofweek", "hour"])[count_cols].mean().reset_index()

        # Float -> int with real rounding (a bare astype(int) would truncate).
        for col in count_cols:
            temp2[col] = temp2[col].round().astype(int)

        # Rebuild the timestamp from the missing day and the hour (vectorised).
        temp2[date_col] = base_date + pd.to_timedelta(temp2["hour"], unit = "h")

        # `axis` must be passed by keyword: it is keyword-only since pandas 2.0.
        data = pd.concat([data, temp2], axis = 0)

    return data

In [19]:
mybicard_401_agg_impute = impute_recent_mean_data(data = mybicard_401_agg, missing_date = missing_date, date_col = "transdate")

In [20]:
# mybicard_401_agg = impute_recent_data(data = mybicard_401_agg, missing_date = missing_date, date_col = "transdate")

#### 결측일을 제외한 결측치(특정 시간에 데이터가 없는 경우)는 승객이 0이므로 0으로 대체한다.

In [202]:
# 데이터의 시작과 끝 사이를 1시간 간격으로 구분하여 list 생성
dt_list = pd.date_range(start = mybicard_401_agg_impute["transdate"].min(), end = mybicard_401_agg_impute["transdate"].max(), freq = "1h")

transdate_df = pd.DataFrame({"transdate" : dt_list}).reset_index(drop = True)
mybi_stop_id_df = pd.DataFrame({"mybi_stop_id" : mybicard_401_agg_impute["mybi_stop_id"].drop_duplicates()}).reset_index(drop = True)

# 전체 일정(시간 단위)과 정류소 별 조합 DF 생성
all_date = pd.merge(transdate_df, mybi_stop_id_df, how = "cross")

# 결측일의 데이터를 채워넣은 전체 데이터를 left join
ml_data = pd.merge(all_date, mybicard_401_agg_impute, on = ["mybi_stop_id", "transdate"], how = "left")

ml_data["dayofweek"] = ml_data["transdate"].dt.dayofweek
dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}
ml_data["dayofweek"] = ml_data["dayofweek"].replace(dow_dict)
ml_data["hour"] = ml_data["transdate"].dt.hour
# 결측일이 아닌 결측값은 승객수가 없다고 판단하여 0으로 대체
ml_data = ml_data.fillna(0)
ml_data = ml_data.loc[ml_data["hour"].isin([1,2,3,4]) == False ]

In [203]:
ml_data.shape

(620940, 8)

### 시계열 변수 생성

In [204]:
def create_lag_feature(data, target_cols, date_cols, lags, groupby_cols = None):
    """Append lagged copies of ``target_cols`` as new columns.

    For every lag the target columns are shifted forward in time by ``lag``
    (via ``shift(freq=lag)`` on the date index) and left-joined back on the date
    (and group) keys, so each row receives the value observed ``lag`` earlier.
    New columns are named ``<target>_bf<lag>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    target_cols : str or list of str
        Columns to lag.
    date_cols : str or list of str
        Datetime column(s) used as the index for the frequency shift.
    lags : str or list of str
        Offsets accepted by ``shift(freq=...)``, e.g. ``"1d"``.
    groupby_cols : str, list of str or None, default None
        Optional grouping keys; the shift is done within each group.
        ``None`` means no grouping.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with one extra column per (target, lag) pair.
    """
    data = data.copy()
    if not isinstance(lags, list):
        lags = [lags]
    if not isinstance(date_cols, list):
        date_cols = [date_cols]
    if not isinstance(target_cols, list):
        target_cols = [target_cols]
    # None must become an empty key list; wrapping it as [None] made the
    # ungrouped branch unreachable and broke the default call.
    if groupby_cols is None:
        groupby_cols = []
    elif not isinstance(groupby_cols, list):
        groupby_cols = [groupby_cols]

    for lg in lags:
        indexed = data.set_index(date_cols)
        if len(groupby_cols) == 0:
            cnt_bf = indexed[target_cols].shift(freq = lg).reset_index()
        else:
            cnt_bf = indexed.groupby(groupby_cols)[target_cols].shift(freq = lg).reset_index()

        rename_dict = {col: f"{col}_bf{lg}" for col in target_cols}
        cnt_bf = cnt_bf.rename(columns = rename_dict)

        data = pd.merge(data, cnt_bf, on = date_cols + groupby_cols, how = "left")

    return data
    

In [205]:
%%time
ml_data = create_lag_feature(data = ml_data, target_cols = "totalcnt", date_cols = "transdate", lags = ["1d", "2d", "3d", "4d", "5d", "6d", "7d"], groupby_cols = "mybi_stop_id")

CPU times: user 1.26 s, sys: 9.14 ms, total: 1.27 s
Wall time: 1.27 s


In [206]:
ml_data.shape

(620940, 15)

### 날짜별 평균 Lag

In [207]:
mybicard_401_agg_daily = (ml_data.groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='1d')])
                                   .agg(normalcnt = ("normalcnt", np.mean), 
                                        studentcnt = ("studentcnt", np.mean), 
                                        childcnt = ("childcnt", np.mean),
                                        totalcnt = ("totalcnt", np.mean))
                                   .reset_index())

In [208]:
daily_lag = create_lag_feature(data = mybicard_401_agg_daily, target_cols = "totalcnt", date_cols = "transdate", lags = ["1d", "2d", "3d", "4d", "5d", "6d", "7d"], groupby_cols = "mybi_stop_id")

In [209]:
lags = ["1d", "2d", "3d", "4d", "5d", "6d", "7d"]
rename_dict = {f"{col}_bf{lg}": f"{col}_bf{lg}_total" for col in ["totalcnt"] for lg in lags}

In [210]:
daily_lag = daily_lag.rename(columns = rename_dict)
daily_lag["date"] = daily_lag["transdate"].dt.date

In [211]:
ml_data["date"] = ml_data["transdate"].dt.date

In [212]:
ml_data = pd.merge(ml_data, daily_lag[["date", "mybi_stop_id"] + list(rename_dict.values())], on = ["date", "mybi_stop_id"], how = "left")

In [213]:
ml_data.shape

(620940, 23)

### Moving Average

#### 1) 이전 n개일자들의 동일 시간대 평균

In [214]:
def calculate_moving_agg(data, target_cols, date_col, groupby_cols, col_nm = "", rollings = ("2d",), agg_func = ("mean", "std")):
    """Append shifted rolling aggregates of ``target_cols`` computed per group.

    For every window in ``rollings`` and every target column, a rolling
    aggregate is computed within each group over the ``date_col`` index, then
    shifted by one row inside the group so the current row's own value is
    excluded. The result is left-joined back on ``date_col`` and the group keys.
    New columns are named ``<target>_ma_<col_nm>_<agg>_<window>`` (the
    ``<col_nm>_`` part is omitted when ``col_nm`` is empty).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    target_cols : str or list of str
        Columns to aggregate.
    date_col : str
        Column used as the rolling index (datetime for offset windows such as
        ``"2d"``, or any sortable column for integer windows).
    groupby_cols : str or list of str
        Grouping keys.
    col_nm : str, default ""
        Tag inserted into the generated column names.
    rollings : sequence, default ("2d",)
        Window sizes passed to ``rolling``.
    agg_func : sequence, default ("mean", "std")
        Aggregations passed to ``rolling(...).agg``. String names are the
        default because pandas has deprecated aliasing ``np.mean``/``np.std``
        to its own methods; a single aggregator is also accepted.

    Returns
    -------
    pandas.DataFrame
        ``data`` sorted by ``date_col`` with the new columns appended
        (returned unchanged when there is nothing to compute).
    """
    if not isinstance(target_cols, list):
        target_cols = [target_cols]

    if not isinstance(groupby_cols, list):
        groupby_cols = [groupby_cols]

    if col_nm != "":
        col_nm = f"{col_nm}_"

    # Always hand a fresh list to pandas so a 2-D result (one column per aggregator) is returned.
    agg_list = list(agg_func) if isinstance(agg_func, (list, tuple)) else [agg_func]

    for rl in rollings:
        for tg in target_cols:
            indexed = data.set_index(date_col).sort_index(ascending = True)
            rolling_data = indexed.groupby(groupby_cols)[tg].rolling(rl).agg(agg_list)
            # Rename every aggregate column (not only mean/std) so that different
            # windows never collide in the merge below.
            rolling_data = rolling_data.rename(columns = {name: f"{tg}_ma_{col_nm}{name}_{rl}"
                                                          for name in rolling_data.columns})
            # Shift by one row within each group to exclude the current observation.
            rolling_data = rolling_data.groupby(groupby_cols).shift(1).reset_index()

            data = pd.merge(indexed.reset_index(), rolling_data, on = [date_col] + groupby_cols, how = "left")

    return data

In [215]:
%%time
ml_data = calculate_moving_agg(data = ml_data, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["mybi_stop_id", "hour"], col_nm = "hour", rollings = ["2d", "3d", "4d", "5d", "6d"])

CPU times: user 2.04 s, sys: 46.7 ms, total: 2.09 s
Wall time: 2.09 s


In [216]:
ml_data.shape

(620940, 33)

#### 2) n주전까지의 동일 요일의 동일 시간대 평균

In [217]:
%%time
ml_data = calculate_moving_agg(data = ml_data, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["mybi_stop_id", "hour", "dayofweek"], col_nm = "hour_week", rollings = ["14d", "21d", "28d"])

CPU times: user 3.12 s, sys: 28.4 ms, total: 3.15 s
Wall time: 3.15 s


In [218]:
ml_data.shape

(620940, 39)

#### 3) 이전 n개일자들의 전체 평균

In [219]:
mybicard_401_agg_1d = (ml_data.groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='1d')])
                                   .agg(normalcnt = ("normalcnt",sum), 
                                        studentcnt = ("studentcnt", sum), 
                                        childcnt = ("childcnt", sum),
                                        totalcnt = ("totalcnt", sum))
                                   .reset_index())

mybicard_401_agg_1d["dayofweek"] = mybicard_401_agg_1d["transdate"].dt.dayofweek

In [220]:
%%time
daily_agg = calculate_moving_agg(data = mybicard_401_agg_1d, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["mybi_stop_id"], col_nm = "daily", rollings = ["2d", "3d", "4d", "5d", "6d"])
daily_agg["date"] = daily_agg["transdate"].dt.date
# Keep only the join keys and the moving-average columns.
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
daily_agg = daily_agg.drop(columns = ["transdate", "normalcnt", "studentcnt", "childcnt", "totalcnt", "dayofweek"])

CPU times: user 110 ms, sys: 0 ns, total: 110 ms
Wall time: 110 ms


In [221]:
ml_data = pd.merge(ml_data, daily_agg, on = ["mybi_stop_id", "date"], how = "left")

In [222]:
ml_data.shape

(620940, 49)

#### 4) n주전까지의 동일 요일의 전체 평균

In [223]:
%%time
daily_week_agg = calculate_moving_agg(data = mybicard_401_agg_1d, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["mybi_stop_id", "dayofweek"], col_nm = "daily_week", rollings = ["14d", "21d", "28d"])
daily_week_agg["date"] = daily_week_agg["transdate"].dt.date
# Keep only the join keys and the moving-average columns.
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
daily_week_agg = daily_week_agg.drop(columns = ["transdate", "normalcnt", "studentcnt", "childcnt", "totalcnt", "dayofweek"])

CPU times: user 137 ms, sys: 814 µs, total: 137 ms
Wall time: 137 ms


In [224]:
ml_data = pd.merge(ml_data, daily_week_agg, on = ["mybi_stop_id", "date"], how = "left")

In [225]:
ml_data.shape

(620940, 55)

#### 5) n주전까지의 주 평균의 이동평균

In [226]:
mybicard_401_agg_1d = (ml_data.groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='1d')])
                                   .agg(normalcnt = ("normalcnt",sum), 
                                        studentcnt = ("studentcnt", sum), 
                                        childcnt = ("childcnt", sum),
                                        totalcnt = ("totalcnt", sum))
                                   .reset_index())
mybicard_401_agg_1d["hour"] = mybicard_401_agg_1d["transdate"].dt.hour
mybicard_401_agg_1d["weekofyear"] = mybicard_401_agg_1d["transdate"].dt.isocalendar().week

In [227]:
mybicard_401_agg_1w = mybicard_401_agg_1d.groupby(["mybi_stop_id", "weekofyear"])["totalcnt"].mean().reset_index()

In [228]:
weekly_agg = calculate_moving_agg(data = mybicard_401_agg_1w, target_cols = ["totalcnt"], date_col = "weekofyear", groupby_cols = ["mybi_stop_id"], col_nm = "weekly", rollings = [2,3,4])
# Drop the raw weekly mean; only the moving aggregates are joined back.
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
weekly_agg = weekly_agg.drop(columns = ["totalcnt"])

In [229]:
ml_data["weekofyear"] = ml_data["transdate"].dt.isocalendar().week

In [230]:
ml_data = pd.merge(ml_data, weekly_agg, on = ["mybi_stop_id", "weekofyear"], how = "left")

In [231]:
ml_data.shape

(620940, 62)

### 특일 데이터 추가

In [232]:
holiday_data = pd.read_parquet("/home/seho/Passenger_Demand/data/holiday_data.parquet")

In [233]:
# Parse the YYYYMMDD key into a python date so it can be joined on "date".
holiday_data["date"] = pd.to_datetime(holiday_data["locdate"], format = "%Y%m%d").dt.date
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
holiday_data = holiday_data.drop(columns = ["locdate"])

In [234]:
date_df = pd.DataFrame({"date" : pd.date_range("2020-01-01", "2020-12-31", freq = "1D")})
date_df["weekend"] = np.where(date_df["date"].dt.dayofweek.isin([5,6]), "Y", "N")
date_df["date"] = date_df["date"].dt.date

#### 공휴일 / 명절 여부 

In [235]:
# 공휴일(holiday), 명절(ntl_holiday) 구분
ntl_holiday = holiday_data.loc[holiday_data["dateName"].isin(["설날", "추석"])]
ntl_holiday = ntl_holiday.rename(columns = {"dateName" : "ntl_holi"})

holiday = holiday_data.loc[holiday_data["dateName"].isin(["설날", "추석"]) == False]
holiday = holiday.rename(columns = {"dateName" : "holi"})

In [236]:
date_df = pd.merge(date_df, ntl_holiday, on = "date", how = "left")
date_df = pd.merge(date_df, holiday, on = "date", how = "left")

In [237]:
date_df = date_df.assign(ntl_holi = np.where(date_df["ntl_holi"].isna(), "N", "Y"),
                         holi = np.where(date_df["holi"].isna(), "N", "Y"))

#### 3일 이상 연휴 여부

In [238]:
date_df["rest_yn"] = date_df[["weekend", "ntl_holi", "holi"]].apply(lambda x: any(x == "Y"), 1)
date_df["rest_yn"] = np.where(date_df["rest_yn"],"Y", "N")

In [239]:
def find_seq_y(data, criterion = 2):
    """Flag the members of long consecutive runs of "Y".

    Every element of ``data`` yields exactly one output element. Elements that
    belong to a run of consecutive "Y" values longer than ``criterion`` become
    "Y"; everything else (shorter runs and non-"Y" values) becomes "N".

    A run that reaches the end of the input is flushed as well. Previously such
    a trailing run was dropped, which made the result shorter than the input.

    Parameters
    ----------
    data : iterable of str
        Sequence of "Y"/"N" flags in chronological order.
    criterion : int, default 2
        A run must be strictly longer than this to be flagged.

    Returns
    -------
    list of str
        "Y"/"N" flags, same length as ``data``.
    """
    seq_list = []
    run_len = 0

    def _flush(length):
        # Label a finished run of "Y" according to its length.
        label = "Y" if length > criterion else "N"
        return [label] * length

    for x in data:
        if x == "Y":
            run_len += 1
        else:
            seq_list += _flush(run_len)
            seq_list.append("N")
            run_len = 0

    # Flush a run that extends to the end of the input.
    seq_list += _flush(run_len)

    return seq_list


In [240]:
date_df["seq_holi"] = find_seq_y(data = date_df["rest_yn"])

In [241]:
# The helper flags are no longer needed once seq_holi has been derived.
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
date_df = date_df.drop(columns = ["weekend", "rest_yn"])

In [242]:
ml_data = pd.merge(ml_data, date_df, on = "date", how = "left")

In [243]:
ml_data.shape

(620940, 65)

### 날씨 데이터 추가

In [244]:
weather_data = pd.read_parquet("/home/seho/Passenger_Demand/data/weather_2018.parquet")

In [245]:
weather_data = weather_data.loc[:, ["tm", "ta", "hm", "rn", "dc10Tca",  "dsnw", "wd", "ws"]]
weather_data = weather_data.rename(columns = {"tm" : "time",
                                              "ta" : "temperature",
                                              "hm" : "humidity",
                                              "rn" : "precipitation",
                                              "dc10Tca" : "전운량",
                                              "dsnw" : "snowfall",
                                              "wd" : "풍향",
                                              "ws" : "풍속"})
weather_data["time"] = pd.to_datetime(weather_data["time"], format = "%Y-%m-%d %H:%M")

In [246]:
for col in weather_data.columns:
    if col == "time":
        continue
    weather_data[col] = weather_data[col].replace("", "0.0").astype(float)
    weather_data[col] = weather_data[col].astype(float)

In [247]:
weather_data["time_hours"] = weather_data["time"].dt.strftime("%Y-%m-%d %H")

In [248]:
ml_data["transdate_hours"] = ml_data["transdate"].dt.strftime("%Y-%m-%d %H")

In [249]:
# Left join (like the other feature joins in this notebook) so that hours without a
# weather observation are kept with NaN weather instead of being dropped silently.
ml_data = pd.merge(ml_data, weather_data[["time_hours", "temperature", "humidity", "precipitation", "snowfall"]], left_on = "transdate_hours", right_on = "time_hours", how = "left")
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
ml_data = ml_data.drop(columns = ["transdate_hours", "time_hours"])

In [250]:
ml_data.shape

(620940, 69)

### 미세먼지 데이터 추가

In [251]:
pm_data = pd.read_csv("/home/seho/Passenger_Demand/data/pm_data.csv")
pm_data["issueDate"] = pd.to_datetime(pm_data["issueDate"], format = "%Y-%m-%d")

In [252]:
pm_data_agg = pm_data.loc[pm_data["districtName"] == "울산"].groupby(pd.Grouper(key="issueDate", freq="1D")).size().reset_index(name = "pm_alert_cnt")

In [253]:
ml_data["date"] = ml_data["transdate"].dt.strftime("%Y-%m-%d")
pm_data_agg["issueDate"] = pm_data_agg["issueDate"].dt.strftime("%Y-%m-%d")

In [254]:
ml_data = pd.merge(ml_data, pm_data_agg, how = "left", left_on = "date", right_on = "issueDate")
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
ml_data = ml_data.drop(columns = ["issueDate"])
# Days without an alert get a numeric 0; filling with the string "0" would turn
# the count column into a mixed-type object column.
ml_data["pm_alert_cnt"] = ml_data["pm_alert_cnt"].fillna(0).astype(int)

In [255]:
ml_data.shape

(620940, 70)

### 정류장 X,Y 좌표 추가

In [256]:
# 경주시, 양산시, 울산광역시, 부산광역시
bus_stop_info = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_버스 정류소 위치 정보_20200531.csv", encoding = "euc-kr")
bus_stop_info = bus_stop_info.loc[bus_stop_info["권역"] == "울산광역시"]
bus_stop_info.columns = ["stop_nm", "stop_id", "longitude", "latitude", "city"]

In [257]:
bus_stop_401_1 = pd.read_csv("/home/seho/Passenger_Demand/data/401_율리_꽃바위.csv", encoding = "euc_kr")
bus_stop_401_2 = pd.read_csv("/home/seho/Passenger_Demand/data/401_꽃바위_율리.csv", encoding = "euc_kr")
bus_stop_401 = pd.concat([bus_stop_401_1, bus_stop_401_2])
bus_stop_401.columns = ["mybi_stop_id", "stop_id"]

In [258]:
bus_stop_401_info = pd.merge(bus_stop_401, bus_stop_info, on = "stop_id")

### 상권정보

In [259]:
trading_area = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_상권정보_201231.csv")

In [260]:
category_list = trading_area["상권업종중분류명"].drop_duplicates().to_list()

In [261]:
from math import radians, cos, sin, asin, sqrt

def haversine(latlon1, latlon2):
    """
    Great-circle distance in kilometres between two points on the earth.

    Each argument is a (latitude, longitude) pair in decimal degrees.
    """
    earth_radius_km = 6371 # use 3956 for miles

    # degrees -> radians
    phi1, lam1 = (radians(deg) for deg in latlon1)
    phi2, lam2 = (radians(deg) for deg in latlon2)

    # haversine formula
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * asin(sqrt(hav))
    return central_angle * earth_radius_km

def count_store_nearby(data, trading_area = trading_area, dist = 0.1, category_list = None):
    """Count the stores of each category within ``dist`` km of a bus stop.

    Parameters
    ----------
    data : record with "latitude" and "longitude"
        Presumably a single bus-stop row: ``haversine`` works on scalars, so the
        coordinates must be scalar values — TODO confirm against
        ``parallelize_dataframe``.
    trading_area : pandas.DataFrame
        Store table with "위도" (latitude), "경도" (longitude) and
        "상권업종중분류명" (category) columns.
    dist : float, default 0.1
        Search radius in kilometres.
    category_list : list or None, default None
        Categories to count; defaults to the distinct categories found in
        ``trading_area``.

    Returns
    -------
    Copy of ``data`` with one ``store_category_<i>`` entry per category, where
    ``i`` is the position of the category in ``category_list``.
    """
    data_copy = data.copy()
    # Identity check: `== None` is evaluated element-wise (and is ambiguous in an
    # `if`) when an array-like category list is passed.
    if category_list is None:
        category_list = trading_area["상권업종중분류명"].drop_duplicates().to_list()

    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    # Distance from every store to the stop.
    dist_list = trading_area[["위도", "경도"]].apply(lambda x: haversine((x["위도"], x["경도"]), stop_latlon), axis = 1)
    within_data = trading_area.loc[dist_list <= dist]

    for i, ctgr in enumerate(category_list):
        data_copy[f"store_category_{i}"] = (within_data["상권업종중분류명"] == ctgr).sum()

    return data_copy
    

In [262]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                           func = count_store_nearby, 
                                           num_cores = 12, 
                                           trading_area = trading_area, 
                                           dist = 0.2, 
                                           category_list = category_list)

100%|██████████| 11/11 [00:09<00:00,  1.16it/s]
 45%|████▌     | 5/11 [00:03<00:05,  1.20it/s]]
100%|██████████| 11/11 [00:09<00:00,  1.10it/s]
100%|██████████| 11/11 [00:10<00:00,  1.08it/s]
 45%|████▌     | 5/11 [00:03<00:05,  1.19it/s]]
100%|██████████| 11/11 [00:10<00:00,  1.05it/s]
100%|██████████| 11/11 [00:10<00:00,  1.06it/s]
100%|██████████| 11/11 [00:10<00:00,  1.07it/s]
100%|██████████| 11/11 [00:09<00:00,  1.10it/s]
100%|██████████| 11/11 [00:09<00:00,  1.12it/s]
100%|██████████| 11/11 [00:09<00:00,  1.16it/s]
100%|██████████| 10/10 [00:08<00:00,  1.17it/s]


CPU times: user 18.7 s, sys: 589 ms, total: 19.3 s
Wall time: 27.8 s


### 병원정보

In [263]:
hospital_data = pd.read_parquet("/home/seho/Passenger_Demand/data/hospital_data.parquet")

In [264]:
hospital_data["category"] = hospital_data["의료기관종별"].replace({"한방병원" : "병원",
                                                                  "치과병원" : "병원",
                                                                  "일반요양병원" : "요양병원",
                                                                  "부속의원" : "의원",
                                                                  "치과의원" : "의원",
                                                                  "한의원" : "의원",
                                                                  "보건지소" : "보건소",
                                                                  "보건진료소" : "보건소"})

In [266]:
hospital_category_list = hospital_data["category"].drop_duplicates().to_list()

In [267]:
def count_hospital_nearby(data, hospital_data = hospital_data, dist = 0.2, category_list = None):
    """Count the medical institutions of each category within ``dist`` km of a bus stop.

    Parameters
    ----------
    data : record with "latitude" and "longitude"
        Presumably a single bus-stop row: ``haversine`` works on scalars, so the
        coordinates must be scalar values — TODO confirm against
        ``parallelize_dataframe``.
    hospital_data : pandas.DataFrame
        Institution table with "lat", "lng" and "category" columns.
    dist : float, default 0.2
        Search radius in kilometres.
    category_list : list or None, default None
        Categories to count; defaults to the distinct categories found in
        ``hospital_data``.

    Returns
    -------
    Copy of ``data`` with one ``hospital_category_<i>`` entry per category,
    where ``i`` is the position of the category in ``category_list``.
    """
    data_copy = data.copy()
    # Identity check: `== None` is evaluated element-wise (and is ambiguous in an
    # `if`) when an array-like category list is passed.
    if category_list is None:
        category_list = hospital_data["category"].drop_duplicates().to_list()

    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    # Distance from every institution to the stop.
    dist_list = hospital_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), stop_latlon), axis = 1)
    within_data = hospital_data.loc[dist_list <= dist]

    for i, ctgr in enumerate(category_list):
        data_copy[f"hospital_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [268]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                           func = count_hospital_nearby, 
                                           num_cores = 12, 
                                           hospital_data = hospital_data, 
                                           dist = 0.2)

100%|██████████| 11/11 [00:00<00:00, 22.59it/s]
100%|██████████| 11/11 [00:00<00:00, 22.89it/s]
100%|██████████| 11/11 [00:00<00:00, 21.29it/s]
100%|██████████| 11/11 [00:00<00:00, 20.96it/s]
 82%|████████▏ | 9/11 [00:00<00:00, 21.81it/s]]
100%|██████████| 11/11 [00:00<00:00, 20.38it/s]
100%|██████████| 11/11 [00:00<00:00, 20.88it/s]
100%|██████████| 11/11 [00:00<00:00, 20.02it/s]
100%|██████████| 11/11 [00:00<00:00, 20.67it/s]
100%|██████████| 11/11 [00:00<00:00, 21.63it/s]
100%|██████████| 11/11 [00:00<00:00, 22.64it/s]
100%|██████████| 10/10 [00:00<00:00, 22.94it/s]


CPU times: user 728 ms, sys: 503 ms, total: 1.23 s
Wall time: 1.54 s


### 학교정보

In [269]:
school_data = pd.read_excel("/home/seho/Passenger_Demand/data/gv_school.xlsx")

In [270]:
school_data["표준일차명"] = school_data["표준일차명"].fillna("")

In [271]:
school_data = school_data.loc[school_data["표준일차명"].str.contains("울산", na="")]

In [272]:
import googlemaps

In [273]:
import os

# Read the Google Maps API key from the environment instead of embedding it in the
# notebook. The key that used to be hard-coded here must be treated as leaked and
# should be revoked/rotated. Fails with KeyError if GOOGLE_MAPS_API_KEY is not set.
gmaps = googlemaps.Client(key = os.environ["GOOGLE_MAPS_API_KEY"])

In [274]:
def get_geocode(x, gmaps):
    """Geocode an address, returning ``None`` when no location can be obtained.

    Parameters
    ----------
    x : str
        Address to look up.
    gmaps : object with a ``geocode(address)`` method
        Client whose ``geocode`` returns a list of results, each holding a
        ``["geometry"]["location"]`` entry.

    Returns
    -------
    The ``location`` entry of the first result, or ``None`` if the lookup
    failed or returned no result.
    """
    try:
        result = gmaps.geocode(x)[0]["geometry"]["location"]
    except Exception:
        # Best-effort lookup: API errors and empty/malformed results yield None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through,
        # so a long geocoding run can still be interrupted.
        result = None

    return result

In [275]:
school_data["category"] = school_data["학교종류"].replace({"전문대학(3년제)" : "전문대학",
                                                          "사내대학(전문)" : "전문대학",
                                                          "기능대학" : "전문대학",
                                                          "일반대학원" : "대학원",
                                                          "전문대학원" : "대학원",
                                                          "특수대학원" : "대학원",
                                                          "일반고등학교" : "고등학교",
                                                          "공업고등학교" : "고등학교",
                                                          "상업고등학교" : "고등학교",
                                                          "가사고등학교" : "고등학교",
                                                          "체육고등학교" : "고등학교",
                                                          "외국어고등학교" : "고등학교",
                                                          "과학고등학교" : "고등학교",
                                                          "예술고등학교" : "고등학교"})

In [276]:
%%time
school_data["lat_lng"] = school_data["새주소"].apply(get_geocode, gmaps = gmaps)

In [277]:
# get_geocode returns None when an address could not be resolved; map those rows
# to NaN instead of failing on None["lat"].
school_data["lat"] = school_data["lat_lng"].apply(lambda x: x["lat"] if isinstance(x, dict) else np.nan)
school_data["lng"] = school_data["lat_lng"].apply(lambda x: x["lng"] if isinstance(x, dict) else np.nan)

In [278]:
def count_school_nearby(data, school_data = school_data, dist = 0.2, category_list = None):
    """Count the schools of each category within ``dist`` km of a bus stop.

    Parameters
    ----------
    data : record with "latitude" and "longitude"
        Presumably a single bus-stop row: ``haversine`` works on scalars, so the
        coordinates must be scalar values — TODO confirm against
        ``parallelize_dataframe``.
    school_data : pandas.DataFrame
        School table with "lat", "lng" and "category" columns.
    dist : float, default 0.2
        Search radius in kilometres.
    category_list : list or None, default None
        Categories to count; defaults to the distinct categories found in
        ``school_data``.

    Returns
    -------
    Copy of ``data`` with one ``school_category_<i>`` entry per category, where
    ``i`` is the position of the category in ``category_list``.
    """
    data_copy = data.copy()
    # Identity check: `== None` is evaluated element-wise (and is ambiguous in an
    # `if`) when an array-like category list is passed.
    if category_list is None:
        category_list = school_data["category"].drop_duplicates().to_list()

    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    # Distance from every school to the stop.
    dist_list = school_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), stop_latlon), axis = 1)
    within_data = school_data.loc[dist_list <= dist]

    for i, ctgr in enumerate(category_list):
        data_copy[f"school_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [279]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                              func = count_school_nearby, 
                                              num_cores = 12, 
                                              school_data = school_data, 
                                              dist = 0.2)

100%|██████████| 11/11 [00:00<00:00, 75.12it/s]
100%|██████████| 11/11 [00:00<00:00, 78.86it/s]
100%|██████████| 11/11 [00:00<00:00, 71.43it/s]
100%|██████████| 11/11 [00:00<00:00, 69.90it/s]
100%|██████████| 11/11 [00:00<00:00, 69.26it/s]
100%|██████████| 11/11 [00:00<00:00, 66.73it/s]
100%|██████████| 11/11 [00:00<00:00, 65.95it/s]
100%|██████████| 11/11 [00:00<00:00, 66.10it/s]
100%|██████████| 11/11 [00:00<00:00, 67.34it/s]
100%|██████████| 11/11 [00:00<00:00, 71.13it/s]
100%|██████████| 11/11 [00:00<00:00, 74.89it/s]
100%|██████████| 10/10 [00:00<00:00, 77.69it/s]


CPU times: user 424 ms, sys: 449 ms, total: 872 ms
Wall time: 975 ms


### 정류장 정보 Join(거리기반)

In [280]:
# Attach the per-stop coordinates and nearby-facility counts.
# `columns=` replaces the positional axis argument, which is keyword-only since pandas 2.0.
ml_data = pd.merge(ml_data, bus_stop_401_info.drop(columns = ["stop_id", "city"]), on = "mybi_stop_id")

In [281]:
ml_data.shape

(620940, 175)

### 울산행사정보

In [282]:
event_data = pd.read_csv("~/Passenger_Demand/data/ulsan_event_data.csv")

In [283]:
event_data["eventStartDate"] = pd.to_datetime(event_data["eventStartDate"], format = "%Y-%m-%d")
event_data["eventEndDate"] = pd.to_datetime(event_data["eventEndDate"], format = "%Y-%m-%d")

In [284]:
def count_event_nearby(data, event_data, dist = 0.2):
    """Count the events running on the record's day within ``dist`` km of the stop.

    Parameters
    ----------
    data : record with "transdate", "latitude" and "longitude"
        Presumably a single row of the modelling table: ``haversine`` works on
        scalars, so the coordinates must be scalar values — TODO confirm against
        ``parallelize_dataframe``.
    event_data : pandas.DataFrame
        Event table with datetime columns "eventStartDate"/"eventEndDate" and
        coordinate columns "latitude"/"longitude".
    dist : float, default 0.2
        Search radius in kilometres.

    Returns
    -------
    Copy of ``data`` with an ``event_nearby`` count added.
    """
    data_copy = data.copy()

    # Event dates are parsed as midnight timestamps, so compare on the calendar day.
    # Comparing with the hourly timestamp excluded an event's last day after 00:00.
    day = pd.Timestamp(data_copy["transdate"]).normalize()
    within_data = event_data.loc[(event_data["eventStartDate"] <= day) & (event_data["eventEndDate"] >= day)]

    if len(within_data) == 0:
        data_copy["event_nearby"] = 0
    else:
        stop_latlon = (data_copy["latitude"], data_copy["longitude"])
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), stop_latlon), axis = 1)
        data_copy["event_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [285]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_event_nearby, 
                                num_cores = 12, 
                                event_data = event_data, 
                                dist = 0.2)

100%|██████████| 51745/51745 [04:27<00:00, 193.52it/s]
100%|██████████| 51745/51745 [04:26<00:00, 194.04it/s]
100%|██████████| 51745/51745 [04:28<00:00, 192.79it/s]
100%|██████████| 51745/51745 [04:29<00:00, 192.17it/s]
100%|██████████| 51745/51745 [04:24<00:00, 195.47it/s]
100%|██████████| 51745/51745 [04:26<00:00, 194.07it/s]
100%|██████████| 51745/51745 [04:26<00:00, 194.53it/s]
100%|██████████| 51745/51745 [04:27<00:00, 193.25it/s]
100%|██████████| 51745/51745 [04:24<00:00, 195.72it/s]
100%|██████████| 51745/51745 [04:24<00:00, 195.89it/s]
100%|██████████| 51745/51745 [04:26<00:00, 193.81it/s]
100%|██████████| 51745/51745 [04:25<00:00, 195.26it/s]


CPU times: user 29.9 s, sys: 18.8 s, total: 48.7 s
Wall time: 4min 37s


In [286]:
ml_data.shape

(620940, 176)

### 축제 정보

In [287]:
festival_data = pd.read_csv("~/Passenger_Demand/data/festival_data.csv")

In [288]:
festival_data["fstvlStartDate"] = pd.to_datetime(festival_data["fstvlStartDate"], format = "%Y-%m-%d")
festival_data["fstvlEndDate"] = pd.to_datetime(festival_data["fstvlEndDate"], format = "%Y-%m-%d")

In [289]:
def count_festival_nearby(data, festival_data, dist = 0.2):
    """Count the festivals running on the record's day within ``dist`` km of the stop.

    Parameters
    ----------
    data : record with "transdate", "latitude" and "longitude"
        Presumably a single row of the modelling table: ``haversine`` works on
        scalars, so the coordinates must be scalar values — TODO confirm against
        ``parallelize_dataframe``.
    festival_data : pandas.DataFrame
        Festival table with datetime columns "fstvlStartDate"/"fstvlEndDate" and
        coordinate columns "latitude"/"longitude".
    dist : float, default 0.2
        Search radius in kilometres.

    Returns
    -------
    Copy of ``data`` with a ``festival_nearby`` count added.
    """
    data_copy = data.copy()

    # Festival dates are parsed as midnight timestamps, so compare on the calendar day.
    # Comparing with the hourly timestamp excluded a festival's last day after 00:00.
    day = pd.Timestamp(data_copy["transdate"]).normalize()
    within_data = festival_data.loc[(festival_data["fstvlStartDate"] <= day) & (festival_data["fstvlEndDate"] >= day)]

    if len(within_data) == 0:
        data_copy["festival_nearby"] = 0
    else:
        stop_latlon = (data_copy["latitude"], data_copy["longitude"])
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), stop_latlon), axis = 1)
        data_copy["festival_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [290]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_festival_nearby, 
                                num_cores = 12, 
                                festival_data = festival_data, 
                                dist = 0.2)

100%|██████████| 51745/51745 [01:53<00:00, 455.71it/s]
100%|█████████▉| 51576/51745 [01:45<00:00, 309.50it/s]
100%|██████████| 51745/51745 [01:54<00:00, 452.62it/s]
100%|██████████| 51745/51745 [01:53<00:00, 455.10it/s]
100%|██████████| 51745/51745 [01:53<00:00, 457.21it/s]
100%|██████████| 51745/51745 [01:53<00:00, 455.10it/s]
100%|██████████| 51745/51745 [01:53<00:00, 456.65it/s]
100%|██████████| 51745/51745 [01:52<00:00, 459.80it/s]
100%|██████████| 51745/51745 [01:53<00:00, 454.17it/s]
100%|██████████| 51745/51745 [01:51<00:00, 462.20it/s]
100%|██████████| 51745/51745 [01:52<00:00, 458.51it/s]
100%|██████████| 51745/51745 [01:51<00:00, 462.82it/s]


CPU times: user 18.1 s, sys: 10.5 s, total: 28.6 s
Wall time: 2min 4s


In [291]:
ml_data.shape

(620940, 177)

In [293]:
ml_data.to_pickle("/home/seho/Passenger_Demand/data/ml_data.pkl")

In [298]:
ml_data.columns

Index(['transdate', 'mybi_stop_id', 'normalcnt', 'studentcnt', 'childcnt',
       'totalcnt', 'dayofweek', 'hour', 'totalcnt_bf1d', 'totalcnt_bf2d',
       ...
       'hospital_category_4', 'hospital_category_5', 'school_category_0',
       'school_category_1', 'school_category_2', 'school_category_3',
       'school_category_4', 'school_category_5', 'event_nearby',
       'festival_nearby'],
      dtype='object', length=177)

### 인구 정보

In [294]:
population_data = pd.read_csv("~/Passenger_Demand/data/울산광역시_인구 현황_20200727.csv", encoding = "euc-kr")