In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from utils.Parallelize_DataFrame import *

In [3]:
from datetime import datetime, timedelta

### 마이비 카드 데이터

In [4]:
%%time
mybicard = pd.read_parquet('/home/seho/Passenger_Demand/data/mybicard.parquet', engine='pyarrow')

CPU times: user 22.3 s, sys: 5.48 s, total: 27.8 s
Wall time: 12.4 s


In [5]:
mybicard.shape

(36261767, 15)

In [6]:
# 수집일자 데이트 포맷으로 변환
#mybicard["collectdate"] = pd.to_datetime(mybicard["collectdate"], format = "%Y%m%d")

In [7]:
# Convert the transmission timestamp (transdate) to datetime
mybicard["transdate"] = pd.to_datetime(mybicard["transdate"], format = "%Y%m%d %H:%M:%S")

In [15]:
%%time
mybicard["totalcnt"] = mybicard["normalcnt"] + mybicard["studentcnt"] + mybicard["childcnt"]

CPU times: user 324 ms, sys: 349 ms, total: 673 ms
Wall time: 122 ms


In [16]:
%%time
# Create the total passenger count column (normalcnt + studentcnt + childcnt)
mybicard["totalcnt"] = mybicard[["normalcnt", "studentcnt", "childcnt"]].sum(axis = 1)

CPU times: user 178 ms, sys: 150 ms, total: 327 ms
Wall time: 325 ms


In [23]:
# route_nm contains embedded whitespace, so strip it.
# A raw string is required: "\s" in a normal string literal is an invalid escape sequence.
mybicard["route_nm"] = mybicard["route_nm"].replace(r"\s", "", regex = True)

In [24]:
# ;mybicard = mybicard.sort_values(["transdate", "seq"]).reset_index(drop=True)

In [25]:
mybicard = mybicard.rename(columns = {"stop_id" : "mybi_stop_id"})

### 401번 버스

In [26]:
# Keep route 401 rows whose transflag is "환승" or "비환승", restricted to the columns used downstream
mybicard_401 = (
    mybicard
    .loc[
        (mybicard["route_nm"] == "401") & (mybicard["transflag"].isin(["환승", "비환승"])),
        ["route_nm", "transdate", "mybi_stop_id", "normalcnt", "studentcnt", "childcnt", "totalcnt"],
    ]
    .reset_index(drop = True)
)

In [26]:
mybicard_401.shape

(1964401, 17)

In [27]:
mybicard_401.shape

(1770869, 7)

### 정류장 X,Y 좌표 추가

In [28]:
# Gyeongju-si, Yangsan-si, Ulsan Metropolitan City, Busan Metropolitan City
# (presumably the regions present in the file); only rows for Ulsan ("울산광역시") are kept.
bus_stop_info = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_버스 정류소 위치 정보_20200531.csv", encoding = "euc-kr")
bus_stop_info = bus_stop_info.loc[bus_stop_info["권역"] == "울산광역시"]
# Rename the columns positionally to English names
bus_stop_info.columns = ["stop_nm", "stop_id", "longitude", "latitude", "city"]

In [29]:
bus_stop_401_1 = pd.read_csv("/home/seho/Passenger_Demand/data/401_율리_꽃바위.csv", encoding = "euc_kr")
bus_stop_401_2 = pd.read_csv("/home/seho/Passenger_Demand/data/401_꽃바위_율리.csv", encoding = "euc_kr")
bus_stop_401 = pd.concat([bus_stop_401_1, bus_stop_401_2])
bus_stop_401.columns = ["mybi_stop_id", "stop_id"]

In [30]:
bus_stop_401_info = pd.merge(bus_stop_401, bus_stop_info, on = "stop_id")

In [31]:
mybicard_401 = pd.merge(mybicard_401, bus_stop_401_info[["mybi_stop_id", "stop_id", "stop_nm", "longitude", "latitude"]], on = "mybi_stop_id", how = "inner")

In [32]:
# `axis` must be given by keyword (positional axis is rejected by pandas 2.0+); `columns=` is explicit
mybicard_401 = mybicard_401.drop(columns = ["mybi_stop_id"])

In [33]:
# Discard records whose hour of day is 1, 2, 3 or 4
mybicard_401 = mybicard_401.loc[~mybicard_401["transdate"].dt.hour.isin([1,2,3,4])]

In [34]:
mybicard_401.shape

(1770869, 10)

In [545]:
bus_stop_info = mybicard_401[["stop_id", "stop_nm", "longitude", "latitude"]].drop_duplicates().reset_index(drop = True)

In [546]:
bus_stop_info

Unnamed: 0,stop_id,stop_nm,longitude,latitude
0,24234,일산해수욕장,129.428013,35.497744
1,40416,목화예식장앞,129.330419,35.538175
2,25005,우성현대아파트,129.418852,35.483501
3,25007,방어동 행정복지센터,129.424330,35.485841
4,24219,현대중공업 울산대학병원,129.432616,35.522595
...,...,...,...,...
126,22402,옥현주공아파트앞,129.264414,35.545098
127,40620,보건환경연구원,129.270913,35.538592
128,30597,문수실버복지관,129.251570,35.534900
129,24204,정수장,129.409310,35.530381


### 결측치 
하루의 수집 데이터의 수가 0인 날짜의 데이터

In [313]:
def create_data_agg(data, date_col, stop_id_col, target_cols, freq = None, groupby_cols = None, agg_dict = None, agg_func = sum):
    """
    Aggregate target columns per stop and, optionally, per time bucket.

    Args:
        data: input data (pandas.DataFrame)
        date_col: name of the datetime column used for time bucketing (str)
        stop_id_col: name of the stop identifier column (str)
        target_cols: column name or list of column names to aggregate
        freq: pandas offset alias of the time bucket (e.g. "60min", "1D");
            when None, no time bucketing is applied
        groupby_cols: extra grouping column name or list of names (optional)
        agg_dict: explicit {column: aggregation} mapping; when given it is used
            as-is and target_cols / agg_func are not consulted
        agg_func: aggregation applied to every target column when agg_dict is None

    Returns:
        pandas.DataFrame with the grouping keys as regular columns and one
        aggregated column per aggregated target.
    """
    if not isinstance(target_cols, list):
        target_cols = [target_cols]

    # Build a fresh key list so that a list passed in by the caller is never mutated
    if groupby_cols is None:
        group_keys = []
    elif isinstance(groupby_cols, list):
        group_keys = list(groupby_cols)
    else:
        group_keys = [groupby_cols]

    # Apply the same aggregation to every target column unless an explicit mapping is given
    if agg_dict is None:
        agg_dict = {col : agg_func for col in target_cols}

    group_keys.append(stop_id_col)
    if freq is not None:
        # The time grouper is only needed when bucketing was requested
        group_keys.append(pd.Grouper(key = date_col, freq = freq))

    data_agg = (data.groupby(group_keys)
                    .agg(agg_dict)
                    .reset_index())

    return data_agg

In [314]:
base_data = create_data_agg(mybicard_401, date_col = "transdate", stop_id_col = "stop_id", target_cols = ["totalcnt", "normalcnt", "studentcnt", "childcnt"], freq = "60min")

In [315]:
def add_time_features(data, date_col):
    """
    Add calendar features derived from a datetime column.

    The columns are added to `data` in place and the same frame is returned.

    Args:
        data: input data (pandas.DataFrame)
        date_col: name of the datetime column to derive features from (str)

    Returns:
        data with the added columns:
            dayofweek  - weekday label ("월" ... "일", Monday first)
            hour       - hour of day
            date       - calendar date (datetime.date)
            month      - month number
            weekofyear - ISO week number
    """
    # Weekday number (0 = Monday) mapped to its Korean label
    dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}
    data["dayofweek"] = data[date_col].dt.dayofweek.map(dow_dict)
    # Hour of day
    data["hour"] = data[date_col].dt.hour
    # Calendar date; derived from date_col (previously hard-coded to "transdate")
    data["date"] = data[date_col].dt.date
    # Month
    data["month"] = data[date_col].dt.month
    # ISO week of year
    data["weekofyear"] = data[date_col].dt.isocalendar().week

    return data

In [316]:
def create_all_date(data, date_col, stop_id_col, except_hours = None):
    """
    Expand aggregated data to every (hour, stop) combination.

    An hourly range is built between the earliest and latest value of date_col,
    crossed with the distinct stop ids, and `data` is left-joined onto that
    grid. Combinations without a matching row therefore carry NaN in the
    remaining columns (to be imputed later). Rows whose hour is listed in
    except_hours are removed from the result.

    Args:
        data: aggregated data (pandas.DataFrame)
        date_col: name of the datetime column (str)
        stop_id_col: name of the stop identifier column (str)
        except_hours: hour or list of hours (0-23) to exclude

    Returns:
        pandas.DataFrame holding the expanded grid joined with data
    """
    excluded_hours = except_hours if isinstance(except_hours, list) else [except_hours]

    # Hourly timeline spanning the observed period
    hourly_index = pd.date_range(start = data[date_col].min(), end = data[date_col].max(), freq = "1h")
    hours_df = pd.DataFrame({date_col : hourly_index}).reset_index(drop = True)
    stops_df = pd.DataFrame({stop_id_col : data[stop_id_col].drop_duplicates()}).reset_index(drop = True)

    # Every hour paired with every stop, then the observed values attached
    grid = pd.merge(hours_df, stops_df, how = "cross")
    filled = pd.merge(grid, data, on = [date_col, stop_id_col], how = "left")

    keep = ~filled[date_col].dt.hour.isin(excluded_hours)
    return filled.loc[keep]

In [317]:
all_date = create_all_date(data = base_data, date_col = "transdate", stop_id_col = "stop_id", except_hours=[1,2,3,4])

In [318]:
all_date.shape

(563300, 6)

In [319]:
all_date = add_time_features(data = all_date, date_col = "transdate")

In [320]:
# Impute a missing day with the rows of the same weekday n weeks earlier
def impute_recent_data(data, missing_date, date_col = "transdate"):
    """
    Fill each missing day by copying the most recent earlier day that falls on
    the same weekday (1 week back, then 2 weeks back, ...).

    Days imputed earlier in the loop are part of the data searched for later
    missing days. A missing day with no donor day on or after the earliest date
    in the data is skipped, so the search always terminates.

    Args:
        data: input data (pandas.DataFrame)
        missing_date: iterable of pandas.Timestamp, the days to impute
        date_col: name of the datetime column (str)

    Returns:
        copy of data with the imputed rows appended
    """
    data = data.copy()
    for missing_day in missing_date:
        earliest = data[date_col].min()
        if pd.isna(earliest):
            # No usable timestamps to copy from
            break
        earliest_day = earliest.date()
        days = data[date_col].dt.date

        donor = data.iloc[0:0]
        weeks_back = 0
        while donor.empty:
            weeks_back += 1
            donor_day = (missing_day - timedelta(weeks = weeks_back)).date()
            if donor_day < earliest_day:
                # Ran past the start of the data: nothing to copy
                break
            donor = data.loc[days == donor_day].copy()

        if donor.empty:
            continue

        # Move the donor rows forward onto the missing day
        donor[date_col] = donor[date_col] + timedelta(weeks = weeks_back)
        # axis must be passed by keyword (positional axis is rejected by pandas 2.0+)
        data = pd.concat([data, donor], axis = 0)

    return data

In [321]:
# Impute a missing day with the mean of the same weekday / same hour over the most recent n weeks
def impute_recent_mean_data(data, missing_date, date_col, stop_id_col = None):
    """
    Fill each missing day with per-stop, per-hour means of the same weekday.

    For every missing day the previous 4 weeks are searched for rows on the
    same weekday; while nothing is found the window is widened one week at a
    time. The search stops once the window reaches back past the earliest
    timestamp in the data, and such a day is skipped. Days imputed earlier in
    the loop are part of the data searched for later missing days.

    Args:
        data: input data with columns totalcnt, normalcnt, studentcnt and
            childcnt (pandas.DataFrame)
        missing_date: iterable of pandas.Timestamp, the days to impute
        date_col: name of the datetime column (str)
        stop_id_col: name of the stop identifier column (str). When None,
            "mybi_stop_id" is used if that column exists, otherwise "stop_id".

    Returns:
        copy of data with "dayofweek" and "hour" columns added and the imputed
        rows appended. Imputed rows only carry the stop id, dayofweek, hour,
        the four count columns and date_col.
    """
    data = data.copy()
    count_cols = ["totalcnt", "normalcnt", "studentcnt", "childcnt"]

    if stop_id_col is None:
        stop_id_col = "mybi_stop_id" if "mybi_stop_id" in data.columns else "stop_id"

    # Add weekday label and hour
    dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}
    data["dayofweek"] = data[date_col].dt.dayofweek.map(dow_dict)
    data["hour"] = data[date_col].dt.hour

    for base_date in missing_date:
        earliest = data[date_col].min()
        if pd.isna(earliest):
            # No usable timestamps to average over
            break

        days = data[date_col].dt.date
        same_weekday = data[date_col].dt.dayofweek == base_date.day_of_week

        # Start with the 4 weeks before the missing day; widen by one week while empty
        history = data.iloc[0:0]
        extra_weeks = 0
        while history.empty:
            window_start = (base_date - timedelta(weeks = 4 + extra_weeks)).date()
            history = data.loc[days.between(window_start, base_date.date()) & same_weekday]
            if window_start < earliest.date():
                # The window already covers all available history
                break
            extra_weeks += 1

        if history.empty:
            continue

        # Mean per stop / weekday / hour over the selected history
        means = history.groupby([stop_id_col, "dayofweek", "hour"])[count_cols].mean().reset_index()
        # Float -> int conversion truncates toward zero (it does not round)
        means[count_cols] = means[count_cols].astype(int)

        # Rebuild the timestamp from the missing day plus the hour of each row
        means[date_col] = base_date + pd.to_timedelta(means["hour"], unit = "h")

        # axis must be passed by keyword (positional axis is rejected by pandas 2.0+)
        data = pd.concat([data, means], axis = 0)

    return data

In [322]:
def impute_bus_demand_data(data, date_col, stop_id_col):
    """
    Impute missing values in the expanded hourly demand data.

    Args:
        data: input data (pandas.DataFrame)
        date_col: name of the datetime column (str)
        stop_id_col: name of the stop identifier column (str); accepted for
            interface symmetry but not used by this function

    Returns:
        pandas.DataFrame with missing values imputed
    """
    # Count rows per day; a day with no rows at all is treated as a missing day.
    # NOTE(review): if `data` already holds a row for every hour and stop (as the
    # output of create_all_date does), no day has a row count of 0 and
    # missing_date stays empty -- confirm whether non-null observations should
    # be counted instead.
    count_by_date = data.groupby([pd.Grouper(key=date_col, freq="1D")]).size().reset_index(name = "cnt")
    missing_date = count_by_date.loc[count_by_date["cnt"] == 0, date_col]

    # 1) Outside missing days, a gap means no passengers in that hour, so fill with 0
    on_missing_day = data[date_col].dt.date.isin(missing_date.dt.date)
    data = data.loc[~on_missing_day].fillna(0)

    # 2) Missing days get the same-weekday / same-hour mean of recent weeks.
    #    date_col is forwarded (it used to be hard-coded to "transdate").
    data = impute_recent_mean_data(data = data, missing_date = missing_date, date_col = date_col)

    return data

In [323]:
all_date = impute_bus_demand_data(data = all_date, date_col = "transdate", stop_id_col = "stop_id")

### 시계열 변수 생성

In [324]:
def create_lag_feature(data, target_cols, date_cols, lags, type = "hour", groupby_cols = None):
    """
    Add lagged copies of the target columns.

    For every lag a copy of the key and target columns is moved forward in time
    by that lag and left-joined back on the date and group columns, so each row
    receives the target values observed one lag earlier for the same group.
    New columns are named "<target>_bf_<lag>"; rows without an earlier
    observation get NaN.

    Args:
        data: input data (pandas.DataFrame)
        target_cols: column name or list of column names to lag
        date_cols: datetime column name (or list of names) that is shifted
        lags: pandas offset alias or list of aliases (e.g. "1d", "7d")
        type: unused; kept for backward compatibility
        groupby_cols: group column name or list of names within which the lag
            is matched; None means no grouping

    Returns:
        copy of data with the lag columns added
    """
    data = data.copy()
    if not isinstance(lags, list):
        lags = [lags]
    if not isinstance(date_cols, list):
        date_cols = [date_cols]
    if not isinstance(target_cols, list):
        target_cols = [target_cols]
    # None must become "no grouping"; wrapping it as [None] would poison the merge keys
    if groupby_cols is None:
        groupby_cols = []
    elif not isinstance(groupby_cols, list):
        groupby_cols = [groupby_cols]

    key_cols = date_cols + groupby_cols

    for lg in lags:
        offset = pd.tseries.frequencies.to_offset(lg)

        # Shift the timestamps directly rather than via GroupBy.shift(freq=...),
        # whose output does not keep the group keys in every pandas version.
        cnt_bf = data[key_cols + target_cols].copy()
        for dc in date_cols:
            cnt_bf[dc] = cnt_bf[dc] + offset

        rename_dict = {col: f"{col}_bf_{lg}" for col in target_cols}
        cnt_bf = cnt_bf.rename(columns = rename_dict)

        data = pd.merge(data, cnt_bf, on = key_cols, how = "left")

    return data
    

In [325]:
%%time
all_date = create_lag_feature(data = all_date, target_cols = ["totalcnt", "normalcnt", "studentcnt", "childcnt"], date_cols = "transdate", lags = ["1d", "2d", "3d", "4d", "5d", "6d", "7d"], groupby_cols = "stop_id")

CPU times: user 1.83 s, sys: 29.5 ms, total: 1.86 s
Wall time: 1.87 s


In [326]:
all_date.shape
# (563300, 39)

(563300, 39)

### 날짜별 평균 Lag

In [327]:
data_agg_daily_mean = create_data_agg(data = all_date, date_col = "transdate", stop_id_col = "stop_id", target_cols = ["totalcnt", "normalcnt", "studentcnt", "childcnt"], freq = "1D", agg_func = np.mean)

In [328]:
lags = ["1d", "2d", "3d", "4d", "5d", "6d", "7d"]
rename_dict = {f"{col}_bf_{lg}": f"{col}_bf_{lg}_total" for col in ["totalcnt"] for lg in lags}

In [329]:
daily_lag = create_lag_feature(data = data_agg_daily_mean, target_cols = "totalcnt", date_cols = "transdate", lags = lags, groupby_cols = "stop_id")

In [330]:
daily_lag = daily_lag.rename(columns = rename_dict)
daily_lag["date"] = daily_lag["transdate"].dt.date

In [331]:
all_date = pd.merge(all_date, daily_lag[["date", "stop_id"] + list(rename_dict.values())], on = ["date", "stop_id"], how = "left")

In [332]:
all_date.shape

(563300, 46)

### Moving Average

#### 1) 이전 n개일자들의 동일 시간대 평균

In [333]:
def calculate_moving_agg(data, target_cols, date_col, groupby_cols, col_nm = "", rollings = ["2d"], agg_func = [np.mean, np.std]):
    """
    Add lagged rolling statistics of the target columns per group.

    For every window in rollings and every target column, the data is indexed
    and sorted by date_col, a rolling aggregation is computed within each
    group, the result is shifted down by one row within each group, and it is
    left-joined back on date_col and groupby_cols.

    Aggregations named "mean" and "std" are renamed to
    "<target>_ma_<col_nm>_mean_<window>" / "<target>_ma_<col_nm>_std_<window>"
    (without the "<col_nm>_" part when col_nm is empty). Other aggregation
    names are left as produced by pandas.

    Args:
        data: input data (pandas.DataFrame)
        target_cols: column name or list of column names to aggregate
        date_col: column used as the rolling index (str)
        groupby_cols: group column name or list of names
        col_nm: label inserted into the generated column names (str)
        rollings: list of rolling windows (offset aliases or integers)
        agg_func: list of aggregation functions passed to rolling().agg()

    Returns:
        pandas.DataFrame with the rolling-statistic columns added
    """
    targets = target_cols if isinstance(target_cols, list) else [target_cols]
    group_keys = groupby_cols if isinstance(groupby_cols, list) else [groupby_cols]
    label = f"{col_nm}_" if col_nm != "" else col_nm
    merge_keys = [date_col] + group_keys

    for window in rollings:
        for target in targets:
            indexed = data.set_index(date_col).sort_index(ascending=True).copy()

            stats = indexed.groupby(group_keys)[target].rolling(window).agg(agg_func)
            new_names = {
                "mean" : f"{target}_ma_{label}mean_{window}",
                "std" : f"{target}_ma_{label}std_{window}",
            }
            stats = stats.rename(columns = new_names)

            # Shift by one row within each group so a row does not see its own value
            lagged = stats.groupby(group_keys).shift(1).reset_index()

            data = pd.merge(indexed.reset_index(), lagged, on = merge_keys, how = "left")

    return data

In [334]:
%%time
all_date = calculate_moving_agg(data = all_date, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["stop_id", "hour"], col_nm = "hour", rollings = ["2d", "3d", "4d", "5d", "6d"])

CPU times: user 2.22 s, sys: 0 ns, total: 2.22 s
Wall time: 2.22 s


In [335]:
all_date.shape
# (620940, 33)

(563300, 56)

#### 2) n주전까지의 동일 요일의 동일 시간대 평균

In [336]:
%%time
all_date = calculate_moving_agg(data = all_date, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["stop_id", "hour", "dayofweek"], col_nm = "hour_week", rollings = ["14d", "21d", "28d"])

CPU times: user 3.13 s, sys: 8.15 ms, total: 3.13 s
Wall time: 3.13 s


In [337]:
all_date.shape
# (620940, 39)

(563300, 62)

#### 3) 이전 n개일자들의 전체 평균

In [338]:
data_agg_daily_sum = create_data_agg(mybicard_401, date_col = "transdate", stop_id_col = "stop_id", target_cols = ["totalcnt"], freq = "1D", agg_func = sum)

In [339]:
data_agg_daily_sum = add_time_features(data_agg_daily_sum, date_col = "transdate")

In [340]:
%%time
# Rolling statistics of the daily totals per stop
daily_mv_agg = calculate_moving_agg(data = data_agg_daily_sum, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["stop_id"], col_nm = "daily", rollings = ["2d", "3d", "4d", "5d", "6d"])
daily_mv_agg["date"] = daily_mv_agg["transdate"].dt.date
# Keep only the join keys and the new features; `columns=` replaces the positional axis rejected by pandas 2.0+
daily_mv_agg = daily_mv_agg.drop(columns = ["transdate", "totalcnt", "dayofweek", "hour", "month", "weekofyear"])

CPU times: user 101 ms, sys: 97 µs, total: 101 ms
Wall time: 101 ms


In [341]:
all_date = pd.merge(all_date, daily_mv_agg, on = ["stop_id", "date"], how = "left")

In [342]:
all_date.shape
# (563300, 72)

(563300, 72)

#### 4) n주전까지의 동일 요일의 전체 평균

In [343]:
%%time
# Rolling statistics of the daily totals per stop and weekday
daily_week_mv_agg = calculate_moving_agg(data = data_agg_daily_sum, target_cols = ["totalcnt"], date_col = "transdate", groupby_cols = ["stop_id", "dayofweek"], col_nm = "daily_week", rollings = ["14d", "21d", "28d"])
daily_week_mv_agg["date"] = daily_week_mv_agg["transdate"].dt.date
# Keep only the join keys and the new features; `columns=` replaces the positional axis rejected by pandas 2.0+
daily_week_mv_agg = daily_week_mv_agg.drop(columns = ["transdate","totalcnt", "dayofweek", "hour", "month", "weekofyear"])

CPU times: user 146 ms, sys: 0 ns, total: 146 ms
Wall time: 146 ms


In [344]:
all_date = pd.merge(all_date, daily_week_mv_agg, on = ["stop_id", "date"], how = "left")

In [345]:
all_date.shape
# (620940, 55)

(563300, 78)

#### 5) n주전까지의 주 평균의 이동평균

In [346]:
mybicard_401 = add_time_features(mybicard_401, date_col = "transdate")

In [347]:
data_agg_weekly_mean = create_data_agg(data_agg_daily_sum, date_col = "transdate", stop_id_col = "stop_id", groupby_cols = "weekofyear",  target_cols = ["totalcnt"], agg_func = np.mean)

In [350]:
# Rolling statistics over the weekly means per stop (integer windows = number of rows)
weekly_mv_agg = calculate_moving_agg(data = data_agg_weekly_mean, target_cols = ["totalcnt"], date_col = "weekofyear", groupby_cols = "stop_id", col_nm = "weekly", rollings = [2,3,4])
# `columns=` replaces the positional axis rejected by pandas 2.0+
weekly_mv_agg = weekly_mv_agg.drop(columns = "totalcnt")

In [351]:
all_date = pd.merge(all_date, weekly_mv_agg, on = ["stop_id", "weekofyear"], how = "left")

In [352]:
all_date.shape
# (620940, 62)

(563300, 84)

### 특일 데이터 추가

In [453]:
holiday_data = pd.read_parquet("/home/seho/Passenger_Demand/data/holiday_data.parquet")

In [454]:
def find_seq_y(data, criterion = 2):
    """
    Keep only runs of consecutive "Y" that are longer than `criterion`.

    Every run of "Y" with more than `criterion` elements stays "Y"; shorter
    runs are turned into "N". "N" entries are passed through. A run that
    reaches the end of the input is handled like any other run, so the result
    has one entry per "Y"/"N" element of the input.

    Args:
        data: iterable of "Y" / "N" flags (values other than "Y" and "N" are ignored)
        criterion: a run must be longer than this to be kept (int)

    Returns:
        list of "Y" / "N" flags
    """
    seq_list = []
    run_length = 0

    def flush_run(length):
        # Emit the pending run of "Y" as all-"Y" or all-"N" depending on its length
        flag = "Y" if length > criterion else "N"
        seq_list.extend([flag] * length)

    for x in data:
        if x == "Y":
            run_length += 1
        elif x == "N":
            flush_run(run_length)
            seq_list.append("N")
            run_length = 0

    # Flush a run that ends at the last element. The original end-of-data check
    # (i == len(data)) could never be true, so such a run was dropped.
    flush_run(run_length)

    return seq_list

In [455]:
def preprocess_holiday_data(holiday_data, 
                            date_col = "locdate"):
    """
        Preprocess the special-day (holiday) data into one row per calendar day.

        Args: 
            holiday_data: holiday data with a "dateName" column and a date
                          column in YYYYMMDD form (Pandas.DataFrame); not modified
            date_col: name of the date column (str)

        Returns: 
            date_df: one row per day from Jan 1 of the first year to Dec 31 of
                     the last year found in holiday_data (Pandas.DataFrame), with
                       date     - calendar date (datetime.date)
                       ntl_holi - "Y" on 설날 / 추석 rows, else "N"
                       holi     - "Y" on any other listed holiday, else "N"
                       seq_holi - "Y" for days inside a run of more than 2
                                  consecutive rest days (weekend or holiday)
                     plus any other columns carried over from holiday_data.

        Exception: 
            
    """    
    # Work on a copy so the caller's frame does not gain a "date" column
    holiday_data = holiday_data.copy()
    holiday_data["date"] = pd.to_datetime(holiday_data[date_col], format = "%Y%m%d")
    
    # Build every calendar day of the covered years
    start_year = int(holiday_data["date"].dt.year.min())
    end_year = int(holiday_data["date"].dt.year.max())
    
    date_df = pd.DataFrame({"date" : pd.date_range(f"{start_year}-01-01", f"{end_year}-12-31", freq = "1D")})
    
    # 1) Weekend flag (Saturday = 5, Sunday = 6)
    date_df["weekend"] = np.where(date_df["date"].dt.dayofweek.isin([5,6]), "Y", "N")    
    
    # 2) Traditional holidays (ntl_holi) vs. other public holidays (holi).
    #    One row per date is kept so the left joins cannot multiply calendar rows.
    is_ntl = holiday_data["dateName"].isin(["설날", "추석"])
    ntl_holiday = (holiday_data.loc[is_ntl]
                               .rename(columns = {"dateName" : "ntl_holi"})
                               .drop_duplicates(subset = "date"))
    holiday = (holiday_data.loc[~is_ntl]
                           .rename(columns = {"dateName" : "holi"})
                           .drop_duplicates(subset = "date"))
    
    # The raw date column is not needed after parsing (kept if it is "date" itself)
    raw_date_cols = [date_col] if date_col != "date" else []
    
    # Attach both flags to the calendar (left join)
    date_df = pd.merge(date_df, ntl_holiday.drop(columns = raw_date_cols), on = "date", how = "left")
    date_df = pd.merge(date_df, holiday.drop(columns = raw_date_cols), on = "date", how = "left")
    date_df["ntl_holi"] = np.where(date_df["ntl_holi"].isna(),"N", "Y")
    date_df["holi"] = np.where(date_df["holi"].isna(),"N", "Y")
    
    # 3) Consecutive rest days: a rest day is a weekend or any holiday
    is_rest = date_df[["weekend", "ntl_holi", "holi"]].eq("Y").any(axis = 1)
    date_df["rest_yn"] = np.where(is_rest, "Y", "N")
    
    date_df["seq_holi"] = find_seq_y(data = date_df["rest_yn"], criterion = 2)
    date_df = date_df.drop(columns = ["weekend", "rest_yn"])
    date_df["date"] = date_df["date"].dt.date
    
    return date_df

In [456]:
holiday_data = preprocess_holiday_data(holiday_data = holiday_data)

In [457]:
# Attach the holiday features to the hourly data by calendar date.
# NOTE(review): the result is stored in `temp`, which no later cell visible here reads
# (ml_data is built from all_date), so the holiday columns are not carried forward -- confirm.
temp = pd.merge(all_date, holiday_data, on = "date", how = "left")

### 날씨 데이터 추가

In [472]:
weather_data = pd.read_parquet("/home/seho/Passenger_Demand/data/weather_2018.parquet")

In [473]:
def preprocess_weather_data(weather_data):
    """
        Preprocess the weather data.
        
        Args: 
            weather_data: weather data with columns tm, ta, hm, rn, dsnw
                          (Pandas.DataFrame); not modified

        Returns: 
            data: preprocessed weather data (Pandas.DataFrame) with columns
                  temperature, humidity, precipitation, snowfall (float) and
                  time_hours ("YYYY-MM-DD HH" string)

        Exception: 
            
    """     
    # Keep only the needed columns:
    # tm (time), ta (temperature), hm (humidity), rn (precipitation), dsnw (snowfall).
    # Copy so the assignments below act on an independent frame.
    weather_data = weather_data.loc[:, ["tm", "ta", "hm", "rn", "dsnw"]].copy()
    weather_data = weather_data.rename(columns = {"tm" : "time",
                                                  "ta" : "temperature",
                                                  "hm" : "humidity",
                                                  "rn" : "precipitation",
                                                  "dsnw" : "snowfall",})
    weather_data["time"] = pd.to_datetime(weather_data["time"], format = "%Y-%m-%d %H:%M")
    
    # Empty strings are treated as 0.0 before converting to float
    for col in ["temperature", "humidity", "precipitation", "snowfall"]:
        weather_data[col] = weather_data[col].replace("", "0.0").astype(float)
        
    # Hour-resolution key
    weather_data["time_hours"] = weather_data["time"].dt.strftime("%Y-%m-%d %H")
    # `columns=` replaces the positional axis rejected by pandas 2.0+
    weather_data = weather_data.drop(columns = "time")
    
    return weather_data

In [474]:
weather_data = preprocess_weather_data(weather_data = weather_data)

### 미세먼지 데이터 추가

In [532]:
pm_data = pd.read_csv("/home/seho/Passenger_Demand/data/pm_data.csv")
# pm_data["issueDate"] = pd.to_datetime(pm_data["issueDate"], format = "%Y-%m-%d")

In [533]:
def preprocess_pm_data(pm_data, date_col = "issueDate", city = "울산"):
    """
        Preprocess the fine-dust alert data into a per-day alert flag.
        
        Args: 
            pm_data: alert data with a "districtName" column and a date column
                     in YYYY-MM-DD form (Pandas.DataFrame); not modified
            date_col: name of the date column (str)
            city: value of districtName to keep (str)
            
        Returns: 
            data: one row per day from Jan 1 of the first year to Dec 31 of the
                  last year found in pm_data (Pandas.DataFrame), with
                    date         - calendar date (datetime.date)
                    pm_alert_cnt - "Y" when at least one alert was issued for
                                   the city on that day, else "N"

        Exception: 
            
    """
    # Parse into a local Series so the caller's frame is left unchanged
    issue_dates = pd.to_datetime(pm_data[date_col], format = "%Y-%m-%d")
    
    # Days on which the city has at least one alert. Using the set of observed
    # days avoids the zero-count days that a daily Grouper inserts between the
    # first and last alert, which are not NaN and were therefore flagged "Y".
    city_dates = issue_dates.loc[pm_data["districtName"] == city].dropna()
    alert_days = list(set(city_dates.dt.date))
    
    # Build every calendar day of the covered years
    start_year = int(issue_dates.dt.year.min())
    end_year = int(issue_dates.dt.year.max())
    
    date_df = pd.DataFrame({"date" : pd.date_range(f"{start_year}-01-01", f"{end_year}-12-31", freq = "1D")})
    date_df["date"] = date_df["date"].dt.date
    
    # Y/N flag per day
    date_df["pm_alert_cnt"] = np.where(date_df["date"].isin(alert_days), "Y", "N")
    
    return date_df

In [534]:
pm_data = preprocess_pm_data(pm_data = pm_data)

In [535]:
ml_data = pd.merge(all_date, pm_data, how = "left", on = "date")

### 상권정보

In [547]:
trading_area = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_상권정보_201231.csv")

In [548]:
from math import radians, cos, sin, asin, sqrt

def haversine(latlon1, latlon2):
    """
    Great-circle distance in kilometers between two points on the earth.

    Both arguments are (latitude, longitude) pairs in decimal degrees.
    """
    # Degrees -> radians
    lat_a, lon_a = (radians(value) for value in latlon1)
    lat_b, lon_b = (radians(value) for value in latlon2)

    # Haversine formula
    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a
    a = sin(delta_lat/2)**2 + cos(lat_a) * cos(lat_b) * sin(delta_lon/2)**2
    central_angle = 2 * asin(sqrt(a))

    earth_radius_km = 6371 # Use 3956 for miles
    return central_angle * earth_radius_km

def count_store_nearby(data, trading_area = trading_area, dist = 0.1, category_list = None):
    """
    Count the stores of each category within `dist` of one bus stop.

    Args:
        data: one bus stop with scalar "latitude" and "longitude" entries
            (presumably a single row handed over by parallelize_dataframe -- confirm)
        trading_area: store data with "위도" (latitude), "경도" (longitude) and
            "상권업종중분류명" (category) columns; defaults to the frame that
            was bound when this function was defined
        dist: radius in the unit returned by haversine (kilometers)
        category_list: categories to count; defaults to the distinct
            categories of trading_area in order of first appearance

    Returns:
        copy of data with one "store_category_<i>" entry per category, where i
        is the position of the category in category_list
    """
    data_copy = data.copy()
    # `is None` (not `== None`): comparing an array-like category_list with == is element-wise
    if category_list is None:
        category_list = trading_area["상권업종중분류명"].drop_duplicates().to_list()
    
    # Distance from every store to this stop
    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    dist_list = trading_area[["위도", "경도"]].apply(lambda x: haversine((x["위도"], x["경도"]), stop_latlon), axis = 1)
    within_data = trading_area.loc[dist_list <= dist]
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"store_category_{i}"] = (within_data["상권업종중분류명"] == ctgr).sum()

    return data_copy
    

In [550]:
%%time
bus_stop_info = parallelize_dataframe(df = bus_stop_info, 
                                           func = count_store_nearby, 
                                           num_cores = 12, 
                                           trading_area = trading_area, 
                                           dist = 0.2)

100%|██████████| 11/11 [00:09<00:00,  1.12it/s]
100%|██████████| 11/11 [00:09<00:00,  1.14it/s]
100%|██████████| 11/11 [00:09<00:00,  1.10it/s]
100%|██████████| 11/11 [00:10<00:00,  1.08it/s]
 18%|█▊        | 2/11 [00:00<00:04,  2.10it/s]]
100%|██████████| 11/11 [00:10<00:00,  1.05it/s]
100%|██████████| 11/11 [00:10<00:00,  1.09it/s]
100%|██████████| 11/11 [00:10<00:00,  1.08it/s]
100%|██████████| 11/11 [00:10<00:00,  1.10it/s]
100%|██████████| 11/11 [00:09<00:00,  1.12it/s]
100%|██████████| 11/11 [00:09<00:00,  1.16it/s]
100%|██████████| 10/10 [00:08<00:00,  1.18it/s]


CPU times: user 17.6 s, sys: 983 ms, total: 18.6 s
Wall time: 27.2 s


### 병원정보

In [91]:
hospital_data = pd.read_parquet("/home/seho/Passenger_Demand/data/hospital_data.parquet")

In [127]:
hospital_data.to_csv("hospital_data.csv", encoding = "euc-kr")

In [93]:
hospital_data["category"] = hospital_data["의료기관종별"].replace({"한방병원" : "병원",
                                                                  "치과병원" : "병원",
                                                                  "일반요양병원" : "요양병원",
                                                                  "부속의원" : "의원",
                                                                  "치과의원" : "의원",
                                                                  "한의원" : "의원",
                                                                  "보건지소" : "보건소",
                                                                  "보건진료소" : "보건소"})

In [94]:
hospital_category_list = hospital_data["category"].drop_duplicates().to_list()

In [95]:
def count_hospital_nearby(data, hospital_data = hospital_data, dist = 0.2, category_list = None):
    """
    Count the medical institutions of each category within `dist` of one bus stop.

    Args:
        data: one bus stop with scalar "latitude" and "longitude" entries
            (presumably a single row handed over by parallelize_dataframe -- confirm)
        hospital_data: institution data with "lat", "lng" and "category"
            columns; defaults to the frame that was bound when this function
            was defined
        dist: radius in the unit returned by haversine (kilometers)
        category_list: categories to count; defaults to the distinct
            categories of hospital_data in order of first appearance

    Returns:
        copy of data with one "hospital_category_<i>" entry per category, where
        i is the position of the category in category_list
    """
    data_copy = data.copy()
    # `is None` (not `== None`): comparing an array-like category_list with == is element-wise
    if category_list is None:
        category_list = hospital_data["category"].drop_duplicates().to_list()
    
    # Distance from every institution to this stop
    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    dist_list = hospital_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), stop_latlon), axis = 1)
    within_data = hospital_data.loc[dist_list <= dist]
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"hospital_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [96]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                           func = count_hospital_nearby, 
                                           num_cores = 12, 
                                           hospital_data = hospital_data, 
                                           dist = 0.2)

100%|██████████| 11/11 [00:00<00:00, 23.78it/s]
100%|██████████| 11/11 [00:00<00:00, 24.00it/s]
100%|██████████| 11/11 [00:00<00:00, 23.93it/s]
100%|██████████| 11/11 [00:00<00:00, 22.80it/s]
100%|██████████| 11/11 [00:00<00:00, 22.79it/s]
100%|██████████| 11/11 [00:00<00:00, 22.66it/s]
100%|██████████| 11/11 [00:00<00:00, 23.19it/s]
100%|██████████| 11/11 [00:00<00:00, 23.41it/s]
100%|██████████| 11/11 [00:00<00:00, 23.66it/s]
100%|██████████| 11/11 [00:00<00:00, 22.36it/s]
100%|██████████| 11/11 [00:00<00:00, 24.53it/s]
100%|██████████| 10/10 [00:00<00:00, 23.25it/s]


CPU times: user 1.07 s, sys: 0 ns, total: 1.07 s
Wall time: 1.47 s


### 학교정보

In [97]:
school_data = pd.read_excel("/home/seho/Passenger_Demand/data/gv_school.xlsx")

In [98]:
school_data.shape

(12883, 35)

In [99]:
school_data["표준일차명"] = school_data["표준일차명"].fillna("")

In [100]:
# Keep schools whose "표준일차명" contains "울산"; na=False keeps the mask strictly boolean
school_data = school_data.loc[school_data["표준일차명"].str.contains("울산", na = False)]

In [101]:
import googlemaps

In [102]:
# The API key is read from the environment instead of being committed with the notebook.
# The key that used to be hard-coded here should be treated as leaked and rotated.
gmaps = googlemaps.Client(key = os.environ["GOOGLE_MAPS_API_KEY"])

In [103]:
def get_geocode(x, gmaps):
    """
    Geocode an address and return the location of the first match.

    Args:
        x: address to look up
        gmaps: client object exposing geocode(address)

    Returns:
        the "location" entry of the first result's "geometry" (a mapping with
        "lat" and "lng"), or None when the lookup fails or returns no result
    """
    try:
        result = gmaps.geocode(x)[0]["geometry"]["location"]
    except Exception:
        # Best effort: an empty result list or a failed request yields None.
        # KeyboardInterrupt / SystemExit are no longer swallowed, so a long
        # geocoding run can still be interrupted.
        result = None
    
    return result        

In [104]:
school_data["category"] = school_data["학교종류"].replace({"전문대학(3년제)" : "전문대학",
                                                          "사내대학(전문)" : "전문대학",
                                                          "기능대학" : "전문대학",
                                                          "일반대학원" : "대학원",
                                                          "전문대학원" : "대학원",
                                                          "특수대학원" : "대학원",
                                                          "일반고등학교" : "고등학교",
                                                          "공업고등학교" : "고등학교",
                                                          "상업고등학교" : "고등학교",
                                                          "가사고등학교" : "고등학교",
                                                          "체육고등학교" : "고등학교",
                                                          "외국어고등학교" : "고등학교",
                                                          "과학고등학교" : "고등학교",
                                                          "예술고등학교" : "고등학교"})

In [105]:
%%time
school_data["lat_lng"] = school_data["새주소"].apply(get_geocode, gmaps = gmaps)

CPU times: user 481 ms, sys: 0 ns, total: 481 ms
Wall time: 1min 1s


In [106]:
# get_geocode returns None for addresses it could not resolve; those rows get NaN coordinates
school_data["lat"] = school_data["lat_lng"].apply(lambda x: x["lat"] if isinstance(x, dict) else np.nan)
school_data["lng"] = school_data["lat_lng"].apply(lambda x: x["lng"] if isinstance(x, dict) else np.nan)

In [107]:
def count_school_nearby(data, school_data = school_data, dist = 0.2, category_list = None):
    """
    Count the schools of each category within `dist` of one bus stop.

    Args:
        data: one bus stop with scalar "latitude" and "longitude" entries
            (presumably a single row handed over by parallelize_dataframe -- confirm)
        school_data: school data with "lat", "lng" and "category" columns;
            defaults to the frame that was bound when this function was defined
        dist: radius in the unit returned by haversine (kilometers)
        category_list: categories to count; defaults to the distinct
            categories of school_data in order of first appearance

    Returns:
        copy of data with one "school_category_<i>" entry per category, where i
        is the position of the category in category_list
    """
    data_copy = data.copy()
    # `is None` (not `== None`): comparing an array-like category_list with == is element-wise
    if category_list is None:
        category_list = school_data["category"].drop_duplicates().to_list()
    
    # Distance from every school to this stop
    stop_latlon = (data_copy["latitude"], data_copy["longitude"])
    dist_list = school_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), stop_latlon), axis = 1)
    within_data = school_data.loc[dist_list <= dist]
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"school_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [108]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                              func = count_school_nearby, 
                                              num_cores = 12, 
                                              school_data = school_data, 
                                              dist = 0.2)

  0%|          | 0/11 [00:00<?, ?it/s]
100%|██████████| 11/11 [00:00<00:00, 102.96it/s]
100%|██████████| 11/11 [00:00<00:00, 77.12it/s] 
100%|██████████| 11/11 [00:00<00:00, 76.67it/s]
100%|██████████| 11/11 [00:00<00:00, 81.10it/s]
100%|██████████| 11/11 [00:00<00:00, 79.73it/s]
100%|██████████| 11/11 [00:00<00:00, 74.05it/s]
100%|██████████| 11/11 [00:00<00:00, 75.82it/s]
100%|██████████| 11/11 [00:00<00:00, 73.45it/s]
100%|██████████| 11/11 [00:00<00:00, 74.86it/s]
100%|██████████| 10/10 [00:00<00:00, 86.17it/s]
100%|██████████| 11/11 [00:00<00:00, 81.24it/s]


CPU times: user 747 ms, sys: 0 ns, total: 747 ms
Wall time: 807 ms


### 정류장 정보 Join(거리기반)

In [109]:
# Attach the per-stop features (inner join on the card-system stop id).
# `columns=` replaces the positional axis rejected by pandas 2.0+.
# NOTE(review): in the cells visible here ml_data is derived from data keyed by "stop_id"
# ("mybi_stop_id" is dropped from mybicard_401 earlier), so this key may be missing on a
# fresh run -- confirm which id column ml_data carries.
ml_data = pd.merge(ml_data, bus_stop_401_info.drop(columns = ["stop_id", "city"]), on = "mybi_stop_id")

In [110]:
ml_data.shape
# (620940, 175)

(620940, 175)

### 울산행사정보

In [111]:
event_data = pd.read_csv("~/Passenger_Demand/data/ulsan_event_data.csv")

In [112]:
event_data.shape

(312, 19)

In [113]:
event_data["eventStartDate"] = pd.to_datetime(event_data["eventStartDate"], format = "%Y-%m-%d")
event_data["eventEndDate"] = pd.to_datetime(event_data["eventEndDate"], format = "%Y-%m-%d")

In [114]:
def count_event_nearby(data, event_data, dist = 0.2):
    """
    Count the events running on the row's day within `dist` of its location.

    Args:
        data: one record with scalar "transdate", "latitude" and "longitude"
            entries (presumably a single row handed over by
            parallelize_dataframe -- confirm)
        event_data: events with "eventStartDate", "eventEndDate" (datetime),
            "latitude" and "longitude" columns
        dist: radius in the unit returned by haversine (kilometers)

    Returns:
        copy of data with an "event_nearby" entry holding the count
    """
    data_copy = data.copy()

    # Compare at day resolution. Start/end dates are parsed from dates without a
    # time, so comparing against the hourly timestamp excluded an event for all
    # but the first hour of its final day.
    day = pd.Timestamp(data_copy["transdate"]).normalize()
    within_data = event_data.loc[(event_data["eventStartDate"] <= day) & (event_data["eventEndDate"] >= day)]
    
    if len(within_data) == 0:
        data_copy["event_nearby"] = 0
    else:
        stop_latlon = (data_copy["latitude"], data_copy["longitude"])
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), stop_latlon), axis = 1)
        data_copy["event_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [115]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_event_nearby, 
                                num_cores = 12, 
                                event_data = event_data, 
                                dist = 0.2)

100%|██████████| 51745/51745 [03:32<00:00, 243.24it/s]
100%|██████████| 51745/51745 [03:33<00:00, 242.30it/s]
100%|██████████| 51745/51745 [03:38<00:00, 236.29it/s]
100%|██████████| 51745/51745 [03:41<00:00, 234.05it/s]
100%|██████████| 51745/51745 [03:42<00:00, 233.08it/s]
100%|██████████| 51745/51745 [03:39<00:00, 235.81it/s]
100%|██████████| 51745/51745 [03:41<00:00, 233.70it/s]
100%|██████████| 51745/51745 [03:41<00:00, 233.23it/s]
100%|██████████| 51745/51745 [03:39<00:00, 235.56it/s]
100%|██████████| 51745/51745 [03:42<00:00, 232.61it/s]
100%|██████████| 51745/51745 [03:41<00:00, 233.26it/s]
100%|██████████| 51745/51745 [03:42<00:00, 232.05it/s]


CPU times: user 32.8 s, sys: 0 ns, total: 32.8 s
Wall time: 3min 51s


In [116]:
ml_data.shape
# (620940, 176)

(620940, 176)

### 축제 정보

In [117]:
festival_data = pd.read_csv("~/Passenger_Demand/data/festival_data.csv")

In [118]:
festival_data.shape

(21, 18)

In [119]:
festival_data["fstvlStartDate"] = pd.to_datetime(festival_data["fstvlStartDate"], format = "%Y-%m-%d")
festival_data["fstvlEndDate"] = pd.to_datetime(festival_data["fstvlEndDate"], format = "%Y-%m-%d")

In [120]:
def count_festival_nearby(data, festival_data, dist = 0.2):
    """
    Count the festivals running on the row's day within `dist` of its location.

    Args:
        data: one record with scalar "transdate", "latitude" and "longitude"
            entries (presumably a single row handed over by
            parallelize_dataframe -- confirm)
        festival_data: festivals with "fstvlStartDate", "fstvlEndDate"
            (datetime), "latitude" and "longitude" columns
        dist: radius in the unit returned by haversine (kilometers)

    Returns:
        copy of data with a "festival_nearby" entry holding the count
    """
    data_copy = data.copy()

    # Compare at day resolution. Start/end dates are parsed from dates without a
    # time, so comparing against the hourly timestamp excluded a festival for
    # all but the first hour of its final day.
    day = pd.Timestamp(data_copy["transdate"]).normalize()
    within_data = festival_data.loc[(festival_data["fstvlStartDate"] <= day) & (festival_data["fstvlEndDate"] >= day)]
    
    if len(within_data) == 0:
        data_copy["festival_nearby"] = 0
    else:
        stop_latlon = (data_copy["latitude"], data_copy["longitude"])
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), stop_latlon), axis = 1)
        data_copy["festival_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [121]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_festival_nearby, 
                                num_cores = 12, 
                                festival_data = festival_data, 
                                dist = 0.2)

100%|██████████| 51745/51745 [01:31<00:00, 563.83it/s]
100%|██████████| 51745/51745 [01:32<00:00, 560.97it/s]
100%|██████████| 51745/51745 [01:31<00:00, 563.95it/s]
100%|██████████| 51745/51745 [01:32<00:00, 560.48it/s]
100%|██████████| 51745/51745 [01:32<00:00, 561.49it/s]
100%|██████████| 51745/51745 [01:31<00:00, 563.31it/s]
100%|██████████| 51745/51745 [01:32<00:00, 561.81it/s]
100%|██████████| 51745/51745 [01:32<00:00, 561.99it/s]
100%|██████████| 51745/51745 [01:32<00:00, 561.17it/s]
100%|██████████| 51745/51745 [02:26<00:00, 352.92it/s]
100%|██████████| 51745/51745 [02:27<00:00, 349.96it/s]
100%|██████████| 51745/51745 [02:26<00:00, 353.26it/s]


CPU times: user 1min 17s, sys: 0 ns, total: 1min 17s
Wall time: 2min 39s


In [122]:
ml_data.shape
# (620940, 177)

(620940, 177)

In [126]:
# Persist the modelling table without the helper join keys; `columns=` replaces the positional axis rejected by pandas 2.0+
ml_data.drop(columns = ["weekofyear", "date"]).to_pickle("/home/seho/Passenger_Demand/data/ml_data.pkl")

### 인구 정보

In [None]:
population_data = pd.read_csv("~/Passenger_Demand/data/울산광역시_인구 현황_20200727.csv", encoding = "euc-kr")

In [None]:
# The API key is read from the environment instead of being committed with the notebook.
# The key that used to be hard-coded here should be treated as leaked and rotated.
gmaps = googlemaps.Client(key = os.environ["GOOGLE_MAPS_API_KEY"])

In [None]:
# Reverse-geocode a sample coordinate; the API key comes from the environment rather than the URL literal.
# NOTE(review): `requests` is not imported in any cell visible here -- confirm it is in scope.
rq = requests.get(
    "https://maps.googleapis.com/maps/api/geocode/json",
    params = {"latlng": "35.60467,129.4328", "key": os.environ["GOOGLE_MAPS_API_KEY"]},
)

In [None]:
# Example request URL with the API key redacted (the real key must not appear in source or output)
"https://maps.googleapis.com/maps/api/geocode/json?latlng=35.60467,129.4328&key=<GOOGLE_MAPS_API_KEY>"

In [None]:
gmaps.reverse_geocode((35.60467, 129.4328), language = "korean")