In [104]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns

In [2]:
from utils.Parallelize_DataFrame import *

In [3]:
from datetime import datetime, timedelta

### 마이비 카드 데이터

In [4]:
%%time
mybicard = pd.read_parquet('/home/seho/Passenger_Demand/data/mybicard.parquet', engine='pyarrow')

CPU times: user 26.1 s, sys: 7.36 s, total: 33.4 s
Wall time: 17.3 s


In [5]:
%%time
mybicard = mybicard.drop_duplicates()

CPU times: user 45.7 s, sys: 3.34 s, total: 49.1 s
Wall time: 49 s


In [6]:
# 수집일자 데이트 포맷으로 변환
mybicard["collectdate"] = pd.to_datetime(mybicard["collectdate"], format = "%Y%m%d")

In [7]:
# 전송일자 데이트 포맷으로 변환
mybicard["transdate"] = pd.to_datetime(mybicard["transdate"], format = "%Y%m%d %H:%M:%S")

In [8]:
# 전체 승객 수 변수 생성(일반 + 학생 + 아동)
mybicard["totalcnt"] = mybicard[["normalcnt", "studentcnt", "childcnt"]].sum(axis = 1)

In [9]:
# route_nm에 공백이 포함되어 있어 공백 제거
mybicard["route_nm"] = mybicard["route_nm"].replace("\s", "", regex = True)

In [10]:
mybicard = mybicard.sort_values(["transdate", "seq"]).reset_index(drop=True)

In [11]:
mybicard = mybicard.rename(columns = {"stop_id" : "mybi_stop_id"})

### 401번 버스

In [12]:
mybicard_401 = mybicard.loc[(mybicard["route_nm"] == "401")].reset_index()

In [13]:
mybicard_401_agg = (mybicard_401.loc[mybicard_401["transflag"] != "하차"]
                                   .groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='60Min')])
                                   .agg(normalcnt = ("normalcnt",sum), 
                                        studentcnt = ("studentcnt", sum), 
                                        childcnt = ("childcnt", sum),
                                        totalcnt = ("totalcnt", sum))
                                   .reset_index())

In [91]:
mybicard_401_agg.to_parquet("/home/seho/Passenger_Demand/data/mybicard_401_agg.parquet")


### 결측치 
하루의 수집 데이터의 수가 0인 날짜의 데이터

In [14]:
count_by_date = mybicard_401_agg.groupby([pd.Grouper(key="transdate", freq="1D")]).size().reset_index(name = "cnt")

In [15]:
missing_date = count_by_date.loc[count_by_date["cnt"] == 0, "transdate"]

#### n주 전 같은 요일 같은 시간대의 인원 수로 Impute

In [16]:
%%time
cnt_df= mybicard_401_agg.copy()
print(cnt_df.shape)
for x in missing_date:
    cnt_df_temp = []
    w = 0
    while len(cnt_df_temp) == 0:
        w +=1
        cnt_df_temp = cnt_df.loc[cnt_df["transdate"].dt.date == (x - timedelta(weeks = w)).date()]
    
    cnt_df_temp["transdate"] = cnt_df_temp["transdate"] + timedelta(weeks = w)
    cnt_df = pd.concat([cnt_df, cnt_df_temp], 0)
print(cnt_df.shape)

(314862, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(392195, 6)
CPU times: user 3.07 s, sys: 108 ms, total: 3.18 s
Wall time: 3.18 s


#### 결측일을 제외한 결측치(특정 시간에 데이터가 없는 경우)는 승객이 0이므로 0으로 대체한다.

In [17]:
# 데이터의 시작과 끝 사이를 1시간 간격으로 구분하여 list 생성
dt_list = pd.date_range(start = cnt_df["transdate"].min(), end = cnt_df["transdate"].max(), freq = "1h")

In [18]:
transdate_df = pd.DataFrame({"transdate" : dt_list}).reset_index(drop = True)
mybi_stop_id_df = pd.DataFrame({"mybi_stop_id" : cnt_df["mybi_stop_id"].drop_duplicates()}).reset_index(drop = True)

In [19]:
all_date = pd.merge(transdate_df, mybi_stop_id_df, how = "cross")

In [20]:
ml_data = pd.merge(all_date, cnt_df, on = ["mybi_stop_id", "transdate"], how = "left")

In [21]:
# 요일, 날짜, 월 변수 생성
ml_data["dayofweek"] = ml_data["transdate"].dt.dayofweek
dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}
ml_data["dayofweek"] = ml_data["dayofweek"].replace(dow_dict)
ml_data["day"] = ml_data["transdate"].dt.day
ml_data["month"] = ml_data["transdate"].dt.month
ml_data["hour"] = ml_data["transdate"].dt.hour

In [22]:
ml_data = ml_data.fillna(0)

In [108]:
ml_data.loc[ml_data["hour"].isin([1,2,3,4]) == False ].shape

(620940, 130)

#### 최근 n주의 같은 요일 같은 시간대의 평균값으로 Impute

In [469]:
%%time
cnt_df= mybicard_401_agg.copy()

# 요일, 시간 추가
cnt_df["dayofweek"] = cnt_df["transdate"].dt.dayofweek
dow_dict = {0:"월", 1:"화", 2:"수", 3:"목", 4:"금", 5:"토", 6:"일"}
cnt_df["dayofweek"] = cnt_df["dayofweek"].replace(dow_dict)
cnt_df["hour"] = cnt_df["transdate"].dt.hour

print(cnt_df.shape)
for x in missing_date:
    base_date = x
    w = 0
    # 결측일의 이전 4주를 기본으로 검색하며, 데이터가 없는 경우 범위를 1주씩 늘려가며 데이터 조회
    cnt_df_temp = []
    while len(cnt_df_temp) == 0:
        cnt_df_temp = cnt_df.loc[(cnt_df["transdate"].dt.date.between((x - timedelta(weeks = 4+w)).date(), x.date())) & (cnt_df["transdate"].dt.dayofweek == x.day_of_week)]
        w += 1
    
    # 4+w 전까지의 데이터를 찾아서 정류장별, 요일별, 시간별 평균값 산출
    cnt_df_temp2 = cnt_df_temp.groupby(["mybi_stop_id", "dayofweek", "hour"]).agg({"totalcnt" : np.mean,
                                                                                   "normalcnt" : np.mean,
                                                                                   "studentcnt" : np.mean,
                                                                                   "childcnt" : np.mean}).reset_index()
    # 평균값 변환 (Float -> Int : 반올림 효과)
    cnt_df_temp2["totalcnt"] = cnt_df_temp2["totalcnt"].astype(int)
    cnt_df_temp2["normalcnt"] = cnt_df_temp2["normalcnt"].astype(int)
    cnt_df_temp2["studentcnt"] = cnt_df_temp2["studentcnt"].astype(int)
    cnt_df_temp2["childcnt"] = cnt_df_temp2["childcnt"].astype(int)
       
    # 기준 일자, 시간으로 부터 transdate을 재생성
    cnt_df_temp2["transdate"] = cnt_df_temp2.apply(lambda x: base_date + timedelta(hours = x["hour"]), 1)

    cnt_df = pd.concat([cnt_df, cnt_df_temp2], 0)
print(cnt_df.shape)



# 데이터의 시작과 끝 사이를 1시간 간격으로 구분하여 list 생성
dt_list = pd.date_range(start = cnt_df["transdate"].min(), end = cnt_df["transdate"].max(), freq = "1h")

transdate_df = pd.DataFrame({"transdate" : dt_list}).reset_index(drop = True)
mybi_stop_id_df = pd.DataFrame({"mybi_stop_id" : cnt_df["mybi_stop_id"].drop_duplicates()}).reset_index(drop = True)

# 전체 일정(시간 단위)과 정류소 별 조합 DF 생성
all_date = pd.merge(transdate_df, mybi_stop_id_df, how = "cross")

# 결측일의 데이터를 채워넣은 전체 데이터를 left join
ml_data = pd.merge(all_date, cnt_df, on = ["mybi_stop_id", "transdate"], how = "left")

# 결측일이 아닌 결측값은 승객수가 없다고 판단하여 0으로 대체
ml_data = ml_data.fillna(0)

(314862, 8)
(408337, 8)
CPU times: user 5.06 s, sys: 168 ms, total: 5.23 s
Wall time: 5.19 s


In [470]:
ml_data.head()

Unnamed: 0,transdate,mybi_stop_id,normalcnt,studentcnt,childcnt,totalcnt,dayofweek,hour
0,2020-04-08,3100020,0.0,0.0,0.0,0.0,0,0.0
1,2020-04-08,3100021,0.0,0.0,0.0,0.0,0,0.0
2,2020-04-08,3100057,0.0,0.0,0.0,0.0,0,0.0
3,2020-04-08,3100058,0.0,0.0,0.0,0.0,0,0.0
4,2020-04-08,3100085,0.0,0.0,0.0,0.0,0,0.0


### 시계열 변수 생성

In [471]:
def create_lag_feature(data, target_cols, date_cols, freqs, groupby_cols = None):
    data = data.copy()
    if isinstance(freqs, list) == False:
        freqs = [freqs]
    if isinstance(date_cols, list) == False:
        date_cols = [date_cols]
    if isinstance(target_cols, list) == False:
        target_cols = [target_cols]
    if isinstance(groupby_cols, list) == False:
        groupby_cols = [groupby_cols]
    
    for frq in freqs:
        if groupby_cols is None:
            cnt_bf = data.set_index(date_cols)[target_cols].shift(freq = frq).reset_index()
        else:
            cnt_bf = data.set_index(date_cols).groupby(groupby_cols)[target_cols].shift(freq = frq).reset_index()
        
        rename_dict = {col: f"{col}_bf{frq}" for col in target_cols}
        cnt_bf = cnt_bf.rename(columns = rename_dict)
        
        data = pd.merge(data, cnt_bf, on = date_cols + groupby_cols, how = "left")
    
    return data
    

In [472]:
ml_data = create_lag_feature(data = ml_data, target_cols = "totalcnt", date_cols = "transdate", freqs = ["1d", "7d"], groupby_cols = "mybi_stop_id")

### 날짜별 합계, 평균 Lag

In [473]:
mybicard_401_agg_daily = (mybicard_401.loc[mybicard_401["transflag"] != "하차"]
                                   .groupby(["mybi_stop_id", pd.Grouper(key="transdate", freq='1d')])
                                   .agg(normalcnt = ("normalcnt", np.mean), 
                                        studentcnt = ("studentcnt", np.mean), 
                                        childcnt = ("childcnt", np.mean),
                                        totalcnt = ("totalcnt", np.mean))
                                   .reset_index())

In [474]:
daily_lag = create_lag_feature(data = mybicard_401_agg_daily, target_cols = "totalcnt", date_cols = "transdate", freqs = ["1d", "7d"], groupby_cols = "mybi_stop_id")

In [475]:
daily_lag = daily_lag.rename(columns = {"totalcnt_bf1d" : "totalcnt_bf1d_total", "totalcnt_bf7d" : "totalcnt_bf7d_total"})
daily_lag["date"] = daily_lag["transdate"].dt.date

In [476]:
ml_data["date"] = ml_data["transdate"].dt.date

In [477]:
ml_data = pd.merge(ml_data, daily_lag[["date", "mybi_stop_id", "totalcnt_bf1d_total", "totalcnt_bf7d_total"]], on = ["date", "mybi_stop_id"], how = "left")

### 특일 데이터 추가

In [478]:
holiday_data = pd.read_parquet("/home/seho/Passenger_Demand/data/holiday_data.parquet")

In [479]:
holiday_data["date"] = pd.to_datetime(holiday_data["locdate"], format = "%Y%m%d").dt.date
holiday_data = holiday_data.drop(["locdate"], 1)

In [480]:
date_df = pd.DataFrame({"date" : pd.date_range("2020-01-01", "2020-12-31", freq = "1D")})
date_df["weekend"] = np.where(date_df["date"].dt.dayofweek.isin([5,6]), "Y", "N")
date_df["date"] = date_df["date"].dt.date

#### 공휴일 / 명절 여부 

In [481]:
# 공휴일(holiday), 명절(ntl_holiday) 구분
ntl_holiday = holiday_data.loc[holiday_data["dateName"].isin(["설날", "추석"])]
ntl_holiday = ntl_holiday.rename(columns = {"dateName" : "ntl_holi"})

holiday = holiday_data.loc[holiday_data["dateName"].isin(["설날", "추석"]) == False]
holiday = holiday.rename(columns = {"dateName" : "holi"})

In [482]:
date_df = pd.merge(date_df, ntl_holiday, on = "date", how = "left")
date_df = pd.merge(date_df, holiday, on = "date", how = "left")

In [483]:
date_df = date_df.assign(ntl_holi = np.where(date_df["ntl_holi"].isna(), "N", "Y"),
                         holi = np.where(date_df["holi"].isna(), "N", "Y"))

#### 3일 이상 연휴 여부

In [484]:
date_df["rest_yn"] = date_df[["weekend", "ntl_holi", "holi"]].apply(lambda x: any(x == "Y"), 1)
date_df["rest_yn"] = np.where(date_df["rest_yn"],"Y", "N")

In [485]:
def find_seq_y(data, criterion = 2):
    seq_list = []
    Y_cnt = 0
    for i, x in enumerate(data):
        if x == "Y":
            Y_cnt += 1

        if (x == "N") | (i == len(data)):
            if Y_cnt > criterion:
                temp_list = ["Y"] * Y_cnt
                seq_list += temp_list
            elif (Y_cnt > 0) & (Y_cnt <= criterion):
                temp_list = ["N"] * Y_cnt
                seq_list += temp_list
            seq_list.append("N")
            Y_cnt = 0
            
    return seq_list


In [486]:
date_df["seq_holi"] = find_seq_y(data = date_df["rest_yn"])

In [487]:
date_df = date_df.drop(["weekend", "rest_yn"], 1)

In [488]:
pd.merge(ml_data, date_df, on = "date", how = "left")

Unnamed: 0,transdate,mybi_stop_id,normalcnt,studentcnt,childcnt,totalcnt,dayofweek,hour,totalcnt_bf1d,totalcnt_bf7d,date,totalcnt_bf1d_total,totalcnt_bf7d_total,ntl_holi,holi,seq_holi
0,2020-04-08 00:00:00,3100020,0.0,0.0,0.0,0.0,0,0.0,,,2020-04-08,,,N,N,N
1,2020-04-08 00:00:00,3100021,0.0,0.0,0.0,0.0,0,0.0,,,2020-04-08,,,N,N,N
2,2020-04-08 00:00:00,3100057,0.0,0.0,0.0,0.0,0,0.0,,,2020-04-08,,,N,N,N
3,2020-04-08 00:00:00,3100058,0.0,0.0,0.0,0.0,0,0.0,,,2020-04-08,,,N,N,N
4,2020-04-08 00:00:00,3100085,0.0,0.0,0.0,0.0,0,0.0,,,2020-04-08,,,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745123,2020-11-30 23:00:00,3101560,0.0,0.0,0.0,0.0,0,0.0,0.0,2.0,2020-11-30,1.0,1.038095,N,N,N
745124,2020-11-30 23:00:00,3102261,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2020-11-30,,1.000000,N,N,N
745125,2020-11-30 23:00:00,3102622,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2020-11-30,,1.000000,N,N,N
745126,2020-11-30 23:00:00,3102806,2.0,0.0,0.0,2.0,월,23.0,0.0,3.0,2020-11-30,1.0,1.036364,N,N,N


### 날씨 데이터 추가

In [28]:
weather_data = pd.read_parquet("/home/seho/Passenger_Demand/data/weather_2018.parquet")

In [29]:
weather_data = weather_data.loc[:, ["tm", "ta", "hm", "rn", "dc10Tca",  "dsnw", "wd", "ws"]]
weather_data = weather_data.rename(columns = {"tm" : "time",
                                              "ta" : "temperature",
                                              "hm" : "humidity",
                                              "rn" : "precipitation",
                                              "dc10Tca" : "전운량",
                                              "dsnw" : "snowfall",
                                              "wd" : "풍향",
                                              "ws" : "풍속"})
weather_data["time"] = pd.to_datetime(weather_data["time"], format = "%Y-%m-%d %H:%M")

In [30]:
for col in weather_data.columns:
    if col == "time":
        continue
    weather_data[col] = weather_data[col].replace("", "0.0").astype(float)
    weather_data[col] = weather_data[col].astype(float)

In [31]:
weather_data["time_hours"] = weather_data["time"].dt.strftime("%Y-%m-%d %H")

In [32]:
ml_data["transdate_hours"] = ml_data["transdate"].dt.strftime("%Y-%m-%d %H")

In [33]:
ml_data = pd.merge(ml_data, weather_data[["time_hours", "temperature", "humidity", "precipitation", "snowfall"]], left_on = "transdate_hours", right_on = "time_hours")
ml_data = ml_data.drop(["transdate_hours", "time_hours"], 1)

In [34]:
ml_data.shape

(745128, 22)

### 미세먼지 데이터 추가

In [35]:
pm_data = pd.read_csv("/home/seho/Passenger_Demand/data/pm_data.csv")
pm_data["issueDate"] = pd.to_datetime(pm_data["issueDate"], format = "%Y-%m-%d")

In [36]:
pm_data_agg = pm_data.loc[pm_data["districtName"] == "울산"].groupby(pd.Grouper(key="issueDate", freq="1D")).size().reset_index(name = "pm_alert_cnt")

In [37]:
ml_data["date"] = ml_data["transdate"].dt.strftime("%Y-%m-%d")
pm_data_agg["issueDate"] = pm_data_agg["issueDate"].dt.strftime("%Y-%m-%d")

In [38]:
ml_data = pd.merge(ml_data, pm_data_agg, how = "left", left_on = "date", right_on = "issueDate")
ml_data = ml_data.drop(["date", "issueDate"], 1)
ml_data["pm_alert_cnt"] = ml_data["pm_alert_cnt"].fillna("0")

In [39]:
ml_data.head()

Unnamed: 0,transdate,mybi_stop_id,normalcnt,studentcnt,childcnt,totalcnt,dayofweek,day,month,hour,...,childcnt_bf1d,totalcnt_bf1w,normalcnt_bf1w,studentcnt_bf1w,childcnt_bf1w,temperature,humidity,precipitation,snowfall,pm_alert_cnt
0,2020-04-08,3100020,0.0,0.0,0.0,0.0,수,8,4,0,...,,,,,,12.8,47.0,0.0,0.0,0
1,2020-04-08,3100021,0.0,0.0,0.0,0.0,수,8,4,0,...,,,,,,12.8,47.0,0.0,0.0,0
2,2020-04-08,3100057,0.0,0.0,0.0,0.0,수,8,4,0,...,,,,,,12.8,47.0,0.0,0.0,0
3,2020-04-08,3100058,0.0,0.0,0.0,0.0,수,8,4,0,...,,,,,,12.8,47.0,0.0,0.0,0
4,2020-04-08,3100085,0.0,0.0,0.0,0.0,수,8,4,0,...,,,,,,12.8,47.0,0.0,0.0,0


### 정류장 X,Y 좌표 추가

In [40]:
# 경주시, 양산시, 울산광역시, 부산광역시
bus_stop_info = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_버스 정류소 위치 정보_20200531.csv", encoding = "euc-kr")
bus_stop_info = bus_stop_info.loc[bus_stop_info["권역"] == "울산광역시"]
bus_stop_info.columns = ["stop_nm", "stop_id", "longitude", "latitude", "city"]

In [41]:
bus_stop_401_1 = pd.read_csv("/home/seho/Passenger_Demand/data/401_율리_꽃바위.csv", encoding = "euc_kr")
bus_stop_401_2 = pd.read_csv("/home/seho/Passenger_Demand/data/401_꽃바위_율리.csv", encoding = "euc_kr")
bus_stop_401 = pd.concat([bus_stop_401_1, bus_stop_401_2])
bus_stop_401.columns = ["mybi_stop_id", "stop_id"]

In [42]:
bus_stop_401_info = pd.merge(bus_stop_401, bus_stop_info, on = "stop_id")

### 상권정보

In [43]:
trading_area = pd.read_csv("/home/seho/Passenger_Demand/data/울산광역시_상권정보_201231.csv")

In [44]:
category_list = trading_area["상권업종중분류명"].drop_duplicates().to_list()

In [45]:
from math import radians, cos, sin, asin, sqrt

def haversine(latlon1, latlon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lat1, lon1 = map(radians, latlon1)
    lat2, lon2 = map(radians, latlon2)
#     lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def count_store_nearby(data, trading_area = trading_area, dist = 0.1, category_list = None):
    data_copy = data.copy()
    if category_list == None:
        category_list = trading_area["상권업종중분류명"].drop_duplicates().to_list()
    
    dist_list = trading_area[["위도", "경도"]].apply(lambda x: haversine((x["위도"], x["경도"]), (data_copy["latitude"], data_copy["longitude"])), 1)
    within_data = trading_area.loc[dist_list <= dist]
    
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"store_category_{i}"] = (within_data["상권업종중분류명"] == ctgr).sum()

    return data_copy
    

In [100]:
category_list[32]

'자동차/이륜차'

In [46]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                           func = count_store_nearby, 
                                           num_cores = 12, 
                                           trading_area = trading_area, 
                                           dist = 0.2, 
                                           category_list = category_list)

100%|██████████| 11/11 [00:09<00:00,  1.17it/s]
 91%|█████████ | 10/11 [00:08<00:00,  1.02it/s]
100%|██████████| 11/11 [00:10<00:00,  1.05it/s]
100%|██████████| 11/11 [00:10<00:00,  1.02it/s]
100%|██████████| 11/11 [00:10<00:00,  1.01it/s]
100%|██████████| 11/11 [00:10<00:00,  1.04it/s]
100%|██████████| 11/11 [00:10<00:00,  1.02it/s]
100%|██████████| 11/11 [00:10<00:00,  1.00it/s]
100%|██████████| 11/11 [00:10<00:00,  1.04it/s]
100%|██████████| 11/11 [00:10<00:00,  1.04it/s]
100%|██████████| 11/11 [00:10<00:00,  1.07it/s]
100%|██████████| 10/10 [00:08<00:00,  1.14it/s]


CPU times: user 17.6 s, sys: 765 ms, total: 18.3 s
Wall time: 27.1 s


### 병원정보

In [47]:
hospital_data = pd.read_parquet("/home/seho/Passenger_Demand/data/hospital_data.parquet")

In [48]:
hospital_data["category"] = hospital_data["의료기관종별"].replace({"한방병원" : "병원",
                                                                  "치과병원" : "병원",
                                                                  "일반요양병원" : "요양병원",
                                                                  "부속의원" : "의원",
                                                                  "치과의원" : "의원",
                                                                  "한의원" : "의원",
                                                                  "보건지소" : "보건소",
                                                                  "보건진료소" : "보건소"})

In [49]:
hospital_data.head()

Unnamed: 0,의료기관명,의료기관종별,의료기관주소(도로명),lat_lng,lat,lng,category
0,의료법인 정안의료재단 중앙병원,종합병원,울산광역시 남구 문수로480번길 10 (신정동),"{'lat': 35.5315233, 'lng': 129.3052032}",35.531523,129.305203,종합병원
1,의료법인 은성의료재단 좋은삼정병원,종합병원,울산광역시 남구 북부순환도로 51 (무거동),"{'lat': 35.5528284, 'lng': 129.2690588}",35.552828,129.269059,종합병원
2,의료법인혜명심의료재단 울산병원,종합병원,울산광역시 남구 월평로171번길 13 (신정동),"{'lat': 35.546209, 'lng': 129.3230784}",35.546209,129.323078,종합병원
3,학교법인 울산공업학원 울산대학교병원,종합병원,"울산광역시 동구 방어진순환도로 877, 울산대학교병원 (전하동)","{'lat': 35.5199931, 'lng': 129.4289601}",35.519993,129.42896,종합병원
4,의료법인 송은의료재단 울산시티병원,종합병원,울산광역시 북구 산업로 1007(연암동),"{'lat': 35.5810679, 'lng': 129.3623813}",35.581068,129.362381,종합병원


In [96]:
hospital_category_list = hospital_data["category"].drop_duplicates().to_list()

In [97]:
hospital_category_list[3]

'의원'

In [50]:
def count_hospital_nearby(data, hospital_data = hospital_data, dist = 0.2, category_list = None):
    data_copy = data.copy()
    if category_list == None:
        category_list = hospital_data["category"].drop_duplicates().to_list()
    
    dist_list = hospital_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), (data_copy["latitude"], data_copy["longitude"])), 1)
    within_data = hospital_data.loc[dist_list <= dist]
    
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"hospital_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [51]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                           func = count_hospital_nearby, 
                                           num_cores = 12, 
                                           hospital_data = hospital_data, 
                                           dist = 0.2)

  0%|          | 0/11 [00:00<?, ?it/s].76it/s]]
100%|██████████| 11/11 [00:00<00:00, 23.53it/s]
100%|██████████| 11/11 [00:00<00:00, 23.39it/s]
100%|██████████| 11/11 [00:00<00:00, 24.12it/s]
100%|██████████| 11/11 [00:00<00:00, 23.95it/s]
100%|██████████| 11/11 [00:00<00:00, 22.49it/s]
100%|██████████| 11/11 [00:00<00:00, 22.10it/s]
100%|██████████| 11/11 [00:00<00:00, 22.45it/s]
100%|██████████| 11/11 [00:00<00:00, 23.25it/s]
100%|██████████| 11/11 [00:00<00:00, 23.54it/s]
100%|██████████| 11/11 [00:00<00:00, 23.48it/s]



CPU times: user 658 ms, sys: 455 ms, total: 1.11 s
Wall time: 1.53 s


### 학교정보

In [53]:
school_data = pd.read_excel("/home/seho/Passenger_Demand/data/gv_school.xlsx")

In [54]:
school_data["표준일차명"] = school_data["표준일차명"].fillna("")

In [55]:
school_data = school_data.loc[school_data["표준일차명"].str.contains("울산", na="")]

In [56]:
import googlemaps

In [57]:
gmaps = googlemaps.Client(key='AIzaSyBRxjIW7qfFhaVyCsc2xhk5mf1hXUSi9DI')

In [58]:
def get_geocode(x, gmaps):
    try:
        result = gmaps.geocode(x)[0]["geometry"]["location"]
        # result = [temp["lat"], temp["lng"]]
    except:
        result = None
    
    return result        

In [59]:
school_data["category"] = school_data["학교종류"].replace({"전문대학(3년제)" : "전문대학",
                                                          "사내대학(전문)" : "전문대학",
                                                          "기능대학" : "전문대학",
                                                          "일반대학원" : "대학원",
                                                          "전문대학원" : "대학원",
                                                          "특수대학원" : "대학원",
                                                          "일반고등학교" : "고등학교",
                                                          "공업고등학교" : "고등학교",
                                                          "상업고등학교" : "고등학교",
                                                          "가사고등학교" : "고등학교",
                                                          "체육고등학교" : "고등학교",
                                                          "외국어고등학교" : "고등학교",
                                                          "과학고등학교" : "고등학교",
                                                          "예술고등학교" : "고등학교"})

In [60]:
school_data["lat_lng"] = school_data["새주소"].apply(get_geocode, gmaps = gmaps)

In [61]:
school_data["lat"] = school_data["lat_lng"].apply(lambda x: x["lat"])
school_data["lng"] = school_data["lat_lng"].apply(lambda x: x["lng"])

In [62]:
def count_school_nearby(data, school_data = school_data, dist = 0.2, category_list = None):
    data_copy = data.copy()
    if category_list == None:
        category_list = school_data["category"].drop_duplicates().to_list()
    
    dist_list = school_data[["lat", "lng"]].apply(lambda x: haversine((x["lat"], x["lng"]), (data_copy["latitude"], data_copy["longitude"])), 1)
    within_data = school_data.loc[dist_list <= dist]
    
    
    for i, ctgr in enumerate(category_list):
        data_copy[f"school_category_{i}"] = (within_data["category"] == ctgr).sum()

    return data_copy

In [63]:
%%time
bus_stop_401_info = parallelize_dataframe(df = bus_stop_401_info, 
                                              func = count_school_nearby, 
                                              num_cores = 12, 
                                              school_data = school_data, 
                                              dist = 0.2)

100%|██████████| 11/11 [00:00<00:00, 90.13it/s]]
100%|██████████| 11/11 [00:00<00:00, 85.72it/s] 
100%|██████████| 11/11 [00:00<00:00, 85.85it/s] 
100%|██████████| 11/11 [00:00<00:00, 93.12it/s]
100%|██████████| 11/11 [00:00<00:00, 92.27it/s]
100%|██████████| 11/11 [00:00<00:00, 79.33it/s]
100%|██████████| 11/11 [00:00<00:00, 75.62it/s]
100%|██████████| 11/11 [00:00<00:00, 70.70it/s]

100%|██████████| 11/11 [00:00<00:00, 73.95it/s]
100%|██████████| 11/11 [00:00<00:00, 81.71it/s]
100%|██████████| 10/10 [00:00<00:00, 82.14it/s]


CPU times: user 295 ms, sys: 423 ms, total: 718 ms
Wall time: 777 ms


In [64]:
bus_stop_401_info.head()

Unnamed: 0,mybi_stop_id,stop_id,stop_nm,longitude,latitude,city,store_category_0,store_category_1,store_category_2,store_category_3,...,hospital_category_2,hospital_category_3,hospital_category_4,hospital_category_5,school_category_0,school_category_1,school_category_2,school_category_3,school_category_4,school_category_5
0,3100597,30504,율리공영차고지,129.246863,35.52952,울산광역시,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3100339,30714,우신고등학교입구,129.255322,35.537435,울산광역시,0,2,0,10,...,0,0,0,0,0,0,0,0,1,0
2,3101462,30712,울산과학대학앞,129.25744,35.5398,울산광역시,3,6,0,27,...,0,1,0,0,0,0,0,1,1,0
3,3101461,30708,울산대학교앞,129.260306,35.543755,울산광역시,6,3,0,42,...,0,1,1,0,0,0,0,0,0,0
4,3100479,30706,울산대학교후문,129.261981,35.546813,울산광역시,5,7,0,14,...,2,27,4,0,0,0,0,0,0,0


### 정류장 정보 Join(거리기반)

In [65]:
ml_data = pd.merge(ml_data, bus_stop_401_info.drop(["stop_id", "city"],1), on = "mybi_stop_id")

In [77]:
ml_data = ml_data_temp.copy()

In [67]:
ml_data_temp = ml_data.copy()

### 울산행사정보

In [78]:
event_data = pd.read_csv("~/Passenger_Demand/data/ulsan_event_data.csv")

In [79]:
event_data["eventStartDate"] = pd.to_datetime(event_data["eventStartDate"], format = "%Y-%m-%d")
event_data["eventEndDate"] = pd.to_datetime(event_data["eventEndDate"], format = "%Y-%m-%d")

In [80]:
def count_event_nearby(data, event_data, dist = 0.2):
    data_copy = data.copy()
    within_data = event_data.loc[(event_data["eventStartDate"] <= data_copy["transdate"]) & (event_data["eventEndDate"] >= data_copy["transdate"])]
    
    if len(within_data) == 0:
        data_copy[f"event_nearby"] = 0
    else:
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), (data_copy["latitude"], data_copy["longitude"])), 1)
        data_copy[f"event_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [81]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_event_nearby, 
                                num_cores = 12, 
                                event_data = event_data, 
                                dist = 0.2)

100%|██████████| 62094/62094 [04:19<00:00, 238.93it/s]
100%|██████████| 62094/62094 [04:21<00:00, 237.88it/s]
100%|██████████| 62094/62094 [04:24<00:00, 234.87it/s]
100%|██████████| 62094/62094 [04:28<00:00, 231.39it/s]
100%|██████████| 62094/62094 [04:26<00:00, 232.86it/s]
100%|██████████| 62094/62094 [04:26<00:00, 233.08it/s]
100%|██████████| 62094/62094 [04:28<00:00, 231.14it/s]
100%|██████████| 62094/62094 [04:27<00:00, 232.42it/s]
100%|██████████| 62094/62094 [04:28<00:00, 231.34it/s]
100%|██████████| 62094/62094 [04:29<00:00, 230.59it/s]
100%|██████████| 62094/62094 [04:28<00:00, 230.86it/s]
100%|██████████| 62094/62094 [04:29<00:00, 230.26it/s]


CPU times: user 14.5 s, sys: 13 s, total: 27.5 s
Wall time: 4min 34s


### 축제 정보

In [82]:
festival_data = pd.read_csv("~/Passenger_Demand/data/festival_data.csv")

In [83]:
festival_data["fstvlStartDate"] = pd.to_datetime(festival_data["fstvlStartDate"], format = "%Y-%m-%d")
festival_data["fstvlEndDate"] = pd.to_datetime(festival_data["fstvlEndDate"], format = "%Y-%m-%d")

In [84]:
def count_festival_nearby(data, festival_data, dist = 0.2):
    data_copy = data.copy()
    within_data = festival_data.loc[(festival_data["fstvlStartDate"] <= data_copy["transdate"]) & (festival_data["fstvlEndDate"] >= data_copy["transdate"])]
    
    if len(within_data) == 0:
        data_copy[f"festival_nearby"] = 0
    else:
        dist_list = within_data[["latitude", "longitude"]].apply(lambda x: haversine((x["latitude"], x["longitude"]), (data_copy["latitude"], data_copy["longitude"])), 1)
        data_copy[f"festival_nearby"] = (dist_list <= dist).sum()

    return data_copy

In [85]:
%%time
ml_data = parallelize_dataframe(df = ml_data, 
                                func = count_festival_nearby, 
                                num_cores = 12, 
                                festival_data = festival_data, 
                                dist = 0.2)

100%|██████████| 62094/62094 [01:50<00:00, 564.10it/s]
100%|██████████| 62094/62094 [01:51<00:00, 557.63it/s]
100%|██████████| 62094/62094 [01:51<00:00, 559.08it/s]
100%|██████████| 62094/62094 [01:50<00:00, 561.15it/s]
100%|██████████| 62094/62094 [01:50<00:00, 560.48it/s]
100%|██████████| 62094/62094 [01:50<00:00, 563.29it/s]
100%|██████████| 62094/62094 [01:50<00:00, 564.37it/s]
100%|██████████| 62094/62094 [01:51<00:00, 555.30it/s]
100%|██████████| 62094/62094 [01:49<00:00, 564.95it/s]
100%|██████████| 62094/62094 [01:51<00:00, 558.95it/s]
100%|██████████| 62094/62094 [01:49<00:00, 565.03it/s]
100%|██████████| 62094/62094 [01:50<00:00, 563.06it/s]


CPU times: user 8.65 s, sys: 6.27 s, total: 14.9 s
Wall time: 1min 55s


In [88]:
ml_data.to_pickle("/home/seho/Passenger_Demand/data/ml_data.pkl")

In [87]:
ml_data.to_parquet("/home/seho/Passenger_Demand/data/ml_data.parquet")

ArrowTypeError: ("Expected bytes, got a 'float' object", 'Conversion failed for column pm_alert_cnt with type object')

In [111]:
ml_data.head()

Unnamed: 0,0,childcnt,childcnt_bf1d,childcnt_bf1w,day,dayofweek,event_nearby,festival_nearby,hospital_category_0,hospital_category_1,...,store_category_89,store_category_9,studentcnt,studentcnt_bf1d,studentcnt_bf1w,temperature,totalcnt,totalcnt_bf1d,totalcnt_bf1w,transdate
0,0,,,,,,,,,,...,,,,,,,,,,
1,0,,,,,,,,,,...,,,,,,,,,,
2,0,,,,,,,,,,...,,,,,,,,,,
3,0,,,,,,,,,,...,,,,,,,,,,
4,0,,,,,,,,,,...,,,,,,,,,,


In [102]:
ml_data = ml_data.drop(["event_nearby", "festival_nearby"], 1)

### 인구 정보

In [150]:
population_data = pd.read_csv("~/Passenger_Demand/data/울산광역시_인구 현황_20200727.csv", encoding = "euc-kr")

In [151]:
population_data.head()

Unnamed: 0,행정구역코드,행정구역명,행정구역레벨,성별,총 이동 전입,총 이동 전출,시군구내 전입,시군구내 전출,시군구간 전입,시군구간 전출,시도간 전입,시도간 전출,순이동
0,31,울산광역시,시도,남자,74934,80048,25926,25926,26968,26968,22040,27154,-5114
1,31,울산광역시,시도,여자,67122,72180,24209,24209,24880,24880,18033,23091,-5058
2,31110,울산광역시 중구,시군구,남자,12119,16494,4273,4273,4624,7674,3222,4547,-4375
3,31110,울산광역시 중구,시군구,여자,11358,15454,4202,4202,4393,7272,2763,3980,-4096
4,31140,울산광역시 남구,시군구,남자,21038,24659,8500,8500,6361,8356,6177,7803,-3621
