## 3. Preprocess Safegraph Data

In [1]:
import pandas as pd
import os
import gzip
from tqdm.auto import tqdm
import geopandas as gpd
from pyproj import Geod
import numpy as np
import warnings

warnings.filterwarnings("ignore")

## Code
### 1) DO to OD: Convert Destination area to Origin area

In [2]:
def filteringNeighbor(df, print_progress = True):
    """Return a copy of ``df`` reduced to the columns needed downstream.

    Args:
        df: DataFrame that contains at least 'area' and the work / weekday /
            weekend ``*_device_home_areas`` columns.
        print_progress: Accepted for call compatibility; not used here.

    Returns:
        A new DataFrame holding only the four selected columns. ``df`` itself
        is left untouched.
    """
    wanted = [
        'area',
        'work_behavior_device_home_areas',
        'weekday_device_home_areas',
        'weekend_device_home_areas',
    ]
    return df.copy()[wanted]

In [3]:
def DOtoOD(df, print_progress = True):
    """Invert destination-keyed home-area counts into origin-keyed counts.

    In the input, every row describes a destination ``area`` and each
    ``*_device_home_areas`` cell is the text of a dict ``{home area: count}``.
    The result is keyed the other way round: for every ``area`` it holds a
    dict ``{destination area: count}`` of the trips that start there.
    Home areas that do not appear in the ``area`` column are ignored.

    Args:
        df: DataFrame with 'area', 'work_behavior_device_home_areas',
            'weekday_device_home_areas' and 'weekend_device_home_areas'.
        print_progress: Accepted for call compatibility; not used here.

    Returns:
        A DataFrame with the columns 'area', 'work_behavior_from_area',
        'weekday_device_from_area_home' and 'weekend_device_from_area_home'.
        ``df`` itself is not modified.
    """
    import ast  # function-scope import keeps this cell self-contained

    def fix_malformed_dict_str(s):
        """Close a dict literal whose text was cut off mid-entry."""
        if s.endswith("}"):
            return s
        last_comma_index = s.rfind(",")
        if last_comma_index == -1:
            # Not a single complete entry survived; leave the text as it is
            # so the parse error below reports the row instead of hiding it.
            return s
        # Drop the incomplete trailing entry and close the literal.
        return s[:last_comma_index] + "}"

    def invert_column(frame, source_col, target_col, desc):
        """Write the origin-keyed version of ``source_col`` into ``target_col``."""
        # One (initially empty) destination dict per known origin area.
        all_area_dict = {str(area): {} for area in frame['area'].tolist()}

        for row_label, row in tqdm(frame.iterrows(), total = len(frame), desc = desc):
            raw = row[source_col]
            try:
                if not isinstance(raw, str):
                    raise TypeError(f"expected a dict string, got {type(raw).__name__}")
                # literal_eval only accepts literals, so spreadsheet text can
                # never execute code the way eval() would allow.
                current_area_dict = ast.literal_eval(fix_malformed_dict_str(raw))
                if not isinstance(current_area_dict, dict):
                    raise ValueError("cell does not contain a dict")
            except (ValueError, SyntaxError, TypeError) as e:
                print(f"Error in row {row_label}: {e}")
                continue

            destination = str(row['area'])

            for source, count in current_area_dict.items():
                # Origins outside the study area are skipped.
                if source not in all_area_dict:
                    continue
                from_source = all_area_dict[source]
                from_source[destination] = from_source.get(destination, 0) + count

        frame[target_col] = frame['area'].map(lambda x: all_area_dict[str(x)])
        return frame

    ODdata = df.copy()

    # (source column, result column, progress-bar label)
    conversions = [
        ('work_behavior_device_home_areas', 'work_behavior_from_area', '1. Work behavior...'),
        ('weekday_device_home_areas', 'weekday_device_from_area_home', '2) Other behaviors - weekday...'),
        ('weekend_device_home_areas', 'weekend_device_from_area_home', '3) Other behaviors - weekend...'),
    ]
    for source_col, target_col, desc in conversions:
        ODdata = invert_column(ODdata, source_col, target_col, desc)

    # Extracting columns
    col = ['area'] + [target_col for _, target_col, _ in conversions]
    return ODdata[col]


### 2) Computing probability of trips from origin cbg to dest cbg

In [4]:
def compute_probabilityByk_Ws_Wd(neighbor_safegraphDF, landuseGDF, W_s, W_d):
    """Turn origin-keyed trip counts into purpose-specific destination probabilities.

    Three stages run in sequence:

    1. "k formula": each count dict is normalised so that its values sum to 1
       and is copied into one column per (day type, trip purpose).
    2. W_s weight: every destination probability is multiplied by
       ``(C_Ai / sum_j C_Aj) ** W_s`` and renormalised. ``C_Ai`` is the number
       of land-use rows with ``CBGCODE == destination`` whose ``TRPPURP``
       contains the purpose name.
    3. W_d weight: every probability is multiplied by ``exp(-W_d * d_km)``,
       renormalised and rounded to 5 decimals. ``d_km`` is the WGS84 geodesic
       distance between the centroids looked up for origin and destination.
       Destinations whose weight is 0 (including those without a centroid)
       are removed from the dict.

    Finally, rows whose Work dict ended up empty get ``{area: 1.0}``.

    Args:
        neighbor_safegraphDF: DataFrame with 'area', 'work_behavior_from_area',
            'weekday_device_from_area_home' and 'weekend_device_from_area_home'.
            Cells may be dicts or their string form. An 'Unnamed: 0' column,
            if present, is ignored.
        landuseGDF: GeoDataFrame with 'CBGCODE', 'TRPPURP' and 'geometry'.
            It is not modified.
        W_s: Exponent applied to the land-use share.
        W_d: Distance-decay rate per kilometre.

    Returns:
        DataFrame with 'area' plus 'weekday_<purpose>' / 'weekend_<purpose>'
        columns, each cell being the string form of ``{destination: prob}``.
        Rows whose own area has no centroid are dropped and the index is reset.
    """
    import ast  # function-scope import keeps this cell self-contained

    base_cols = ['work_behavior_from_area', 'weekday_device_from_area_home', 'weekend_device_from_area_home']
    other_purposes = ['School', 'University', 'Dailycare', 'Religion', 'Large_shop', 'Etc_shop',
                      'Meals', 'V_fr_rel', 'Rec_lei', 'Serv_trip', 'Others']
    # Built once instead of once per distance query.
    geod = Geod(ellps="WGS84")

    def as_dict(cell):
        """Return the cell as a fresh dict, parsing its string form if needed."""
        if isinstance(cell, dict):
            return dict(cell)
        return ast.literal_eval(cell)

    def purpose_of(col):
        """'weekday_Large_shop' -> 'Large_shop'."""
        return "_".join(col.split('_')[1:])

    def compute_probability(df):
        """Stage 1: normalise the counts and fan them out per trip purpose."""
        prob_trips_in_space = df.copy()

        for index, row in tqdm(prob_trips_in_space.iterrows(), total=prob_trips_in_space.shape[0], desc = '1) Probability A to A_i...1 (add k folmula)'):
            for col in base_cols:
                counts = as_dict(row[col])
                total_k = sum(counts.values())
                # float() keeps NumPy scalar reprs out of the stored text.
                shares = {key: (float(count / total_k) if total_k != 0 else 0)
                          for key, count in counts.items()}
                prob_trips_in_space.at[index, col] = str(shares)

        # Work trips use the same distribution on weekdays and weekends.
        prob_trips_in_space['weekday_Work'] = prob_trips_in_space['work_behavior_from_area']
        prob_trips_in_space['weekend_Work'] = prob_trips_in_space['work_behavior_from_area']

        for purpose in other_purposes:
            prob_trips_in_space['weekday_' + purpose] = prob_trips_in_space['weekday_device_from_area_home']
        for purpose in other_purposes:
            prob_trips_in_space['weekend_' + purpose] = prob_trips_in_space['weekend_device_from_area_home']

        return prob_trips_in_space

    def apply_Ws_formula_optimized(prob_trips_in_space, landUse, W_s):
        """Stage 2: weight each destination by its share of matching land use."""
        df_ws = prob_trips_in_space.drop(columns=base_cols)
        dict_cols = [col for col in df_ws.columns if col != 'area']
        purposes = {purpose_of(col) for col in dict_cols}

        # Every destination that occurs anywhere in the table.
        all_keys = set()
        for col in tqdm(df_ws.columns, desc = '2) Probability A to A_i...2 (add Ws weight)'):
            if col != 'area':
                for cell in df_ws[col]:
                    all_keys.update(as_dict(cell))

        # Split the TRPPURP column by CBG once, rather than filtering the whole
        # land-use table again for every destination.
        trppurp_by_cbg = {cbg: values for cbg, values in landUse.groupby('CBGCODE')['TRPPURP']}
        no_rows = landUse['TRPPURP'].iloc[0:0]

        C_Ai_dict = {}
        for key in tqdm(list(all_keys), desc = '  2.1) Indexing '):
            rows_for_key = trppurp_by_cbg.get(key, no_rows)
            C_Ai_dict[key] = {
                purpose: rows_for_key.apply(lambda x: purpose in x).sum()
                for purpose in purposes
            }

        for index, row in tqdm(df_ws.iterrows(), total=df_ws.shape[0], desc='  2.2) add Ws weight '):
            for col in dict_cols:
                purpose = purpose_of(col)
                dict_data = as_dict(row[col])

                local_C_Ai_values = [C_Ai_dict[key][purpose] for key in dict_data]
                sum_C_Aj = sum(local_C_Ai_values)

                weighted = {}
                for (key, value), C_Ai in zip(dict_data.items(), local_C_Ai_values):
                    multiplier = (C_Ai / sum_C_Aj) ** W_s if sum_C_Aj != 0 else 0
                    weighted[key] = value * multiplier

                # Normalize the probabilities to sum up to 1
                total_probability = sum(weighted.values())
                new_prob_values = {
                    key: (float(weight / total_probability) if total_probability != 0 else 0)
                    for key, weight in weighted.items()
                }

                df_ws.at[index, col] = str(new_prob_values)

        return df_ws

    def apply_Wd_formula_optimized(df, landUse, W_d=0):
        """Stage 3: apply exponential distance decay between centroids."""
        df_wd = df.copy()

        # Kept in a local dict so the caller's GeoDataFrame gains no column.
        # NOTE(review): when several rows share a CBGCODE the last row's
        # centroid wins - confirm that is the intended reference point.
        # Assumes the geometry coordinates are lon/lat degrees - TODO confirm CRS.
        centroid_lookup = dict(zip(landUse['CBGCODE'], landUse['geometry'].centroid))

        rows_to_drop = []

        for index, row in tqdm(df_wd.iterrows(), total=df_wd.shape[0], desc='3) Probability A to A_i...3 (add W_d weight)'):
            area_center = centroid_lookup.get(str(row['area']))

            if area_center is None:  # origin without a centroid: drop the row
                rows_to_drop.append(index)
                continue

            for col in df_wd.columns:
                if col == 'area':
                    continue

                decayed = {}
                for key, value in as_dict(row[col]).items():
                    destination_center = centroid_lookup.get(key)
                    if destination_center is None:
                        decayed[key] = 0
                        continue
                    _, _, meters = geod.inv(area_center.x, area_center.y,
                                            destination_center.x, destination_center.y)
                    decayed[key] = value * np.exp(-W_d * (meters / 1000))

                total_probability = sum(decayed.values())
                dict_data = {}
                for key, weight in decayed.items():
                    if weight == 0:
                        continue  # zero-weight destinations are not reported
                    share = weight / total_probability if total_probability != 0 else 0
                    dict_data[key] = float(round(share, 5))

                df_wd.at[index, col] = str(dict_data)

        df_wd.drop(rows_to_drop, inplace=True)
        df_wd.reset_index(drop=True, inplace=True)
        return df_wd

    print('----- W_s: ' + str(W_s) + ', W_d: ' + str(W_d) + '-----')

    # Remove the stray index column up front; it holds no dict and would break
    # the stages below. drop() returns a new frame, so the input is untouched.
    source_df = neighbor_safegraphDF.drop(columns=['Unnamed: 0'], errors='ignore')

    prob_trips_in_space_k = compute_probability(source_df)
    # Use the GeoDataFrame that was passed in, not a notebook-level global.
    prob_trips_in_space_ws = apply_Ws_formula_optimized(prob_trips_in_space_k, landuseGDF, W_s = W_s)
    prob_trips_in_space_wd = apply_Wd_formula_optimized(prob_trips_in_space_ws, landuseGDF, W_d = W_d)

    # An area left without any work destination is assumed to work in itself.
    for index, row in tqdm(prob_trips_in_space_wd.iterrows(), total = len(prob_trips_in_space_wd), desc = '4) Filling empty values'):
        if row['weekday_Work'] == '{}' or row['weekend_Work'] == '{}':
            area_value = row['area']
            prob_trips_in_space_wd.at[index, 'weekday_Work'] = f"{{'{area_value}': 1.0}}"
            prob_trips_in_space_wd.at[index, 'weekend_Work'] = f"{{'{area_value}': 1.0}}"

    return prob_trips_in_space_wd
    

## Execution
### 1) filtering Neighbor data

In [11]:
# Directory that holds the raw Safegraph neighbor files.
neighborPath = 'E:/data/Chapter_3_data/Origin_data/Safegraph_neighbor/'

# Load the 2020_09 table from its Excel workbook.
neighbor_2020_09 =  pd.read_excel(neighborPath + 'neighbor_2020_09csv.xlsx')

In [12]:
neighbor_2020_09.columns

Index(['Unnamed: 0', 'area', 'area_type', 'origin_area_type',
       'date_range_start', 'date_range_end', 'day_counts', 'raw_stop_counts',
       'raw_device_counts', 'stops_by_day', 'stops_by_each_hour',
       'device_home_areas', 'weekday_device_home_areas',
       'weekend_device_home_areas', 'breakfast_device_home_areas',
       'lunch_device_home_areas', 'afternoon_tea_device_home_areas',
       'dinner_device_home_areas', 'nightlife_device_home_areas',
       'work_hours_device_home_areas', 'work_behavior_device_home_areas',
       'device_daytime_areas', 'distance_from_home',
       'distance_from_primary_daytime_location', 'median_dwell',
       'top_same_day_brand', 'top_same_month_brand', 'popularity_by_each_hour',
       'popularity_by_hour_monday', 'popularity_by_hour_tuesday',
       'popularity_by_hour_wednesday', 'popularity_by_hour_thursday',
       'popularity_by_hour_friday', 'popularity_by_hour_saturday',
       'popularity_by_hour_sunday', 'device_type', 'iso_coun

In [7]:
# Keep only 'area' and the work / weekday / weekend *_device_home_areas columns.
neighbor_2020_09 = filteringNeighbor(neighbor_2020_09, print_progress = True)

In [8]:
neighbor_2020_09.head(2)

Unnamed: 0,area,work_behavior_device_home_areas,weekday_device_home_areas,weekend_device_home_areas
0,550791854002,"{""550790198003"":4,""550790066002"":4,""5507901410...","{""550791858001"":9,""550790099002"":9,""5507900870...","{""550790162001"":9,""550790141001"":6,""5507902000..."
1,550790033002,"{""550790902002"":4,""550790034003"":4}","{""550790015002"":9,""550790033002"":5,""5507900340...","{""210730706003"":6,""550790038001"":5,""2916947028..."


### 2) Convert O to D
 - At this point the `area` column is the destination area (CBG), and the other columns give the number of people who travelled to `area` from each home CBG.
 - We therefore invert the table: `area` becomes the origin CBG, and the other columns list the destination CBGs reached from it.

In [35]:
# Re-key the table by origin: each cell becomes a dict {destination area: count}.
neighbor_2020_09_converted = DOtoOD(neighbor_2020_09, print_progress = True)

1. Work behavior...:   0%|          | 0/859 [00:00<?, ?it/s]

2) Other behaviors - weekday...:   0%|          | 0/859 [00:00<?, ?it/s]

3) Other behaviors - weekend...:   0%|          | 0/859 [00:00<?, ?it/s]

In [36]:
neighbor_2020_09_converted.head(2)

Unnamed: 0,area,work_behavior_from_area,weekday_device_from_area_home,weekend_device_from_area_home
0,550791854002,"{'550790042003': 4, '550790042002': 4}","{'550791854002': 6, '550790201003': 4, '550790...","{'550791854002': 4, '550791002003': 4, '550790..."
1,550790033002,"{'550790040003': 4, '550790019004': 4, '550790...","{'550790033002': 5, '550790051002': 4, '550790...","{'550791854002': 4, '550791009002': 5, '550790..."


### 3) Computing probability of trips from origin cbg to dest cbg
 - Ws and Wd

In [41]:
# Parcel shapefile; compute_probabilityByk_Ws_Wd reads its CBGCODE, TRPPURP
# and geometry columns.
# The stray backslash before "Analysis" (an invalid "\A" escape) is removed.
path = 'E:/data/Chapter_3_data/Analysis/1_Preprocessed_Parcel_data/'
landUse = gpd.read_file(path + 'Milwaukee_parcels.shp')

In [66]:
# Trial run on the first three rows only, with both weights set to 0.
prob_trips_2020_09_ws0_wd0 = compute_probabilityByk_Ws_Wd(neighbor_2020_09_converted[0:3], landUse, W_s = 0, W_d = 0)

----- W_s: 0, W_d: 0-----


1) Probability A to A_i...1 (add k folmula):   0%|          | 0/3 [00:00<?, ?it/s]

2) Probability A to A_i...2 (add Ws weight):   0%|          | 0/25 [00:00<?, ?it/s]

  2.1) Indexing :   0%|          | 0/186 [00:00<?, ?it/s]

  2.2) add Ws weight :   0%|          | 0/3 [00:00<?, ?it/s]

3) Probability A to A_i...3 (add W_d weight):   0%|          | 0/3 [00:00<?, ?it/s]

4) Filling empty values:   0%|          | 0/3 [00:00<?, ?it/s]

In [68]:
prob_trips_2020_09_ws0_wd0.head(1)

Unnamed: 0,area,weekday_Work,weekend_Work,weekday_School,weekday_University,weekday_Dailycare,weekday_Religion,weekday_Large_shop,weekday_Etc_shop,weekday_Meals,...,weekend_University,weekend_Dailycare,weekend_Religion,weekend_Large_shop,weekend_Etc_shop,weekend_Meals,weekend_V_fr_rel,weekend_Rec_lei,weekend_Serv_trip,weekend_Others
0,550791854002,"{'550790042003': 0.5, '550790042002': 0.5}","{'550790042003': 0.5, '550790042002': 0.5}","{'550791854002': 0.06122, '550790201003': 0.04...",{},"{'550791854002': 0.06122, '550790201003': 0.04...","{'550791854002': 0.06122, '550790201003': 0.04...","{'550791854002': 0.06122, '550790201003': 0.04...","{'550791854002': 0.06122, '550790201003': 0.04...","{'550791854002': 0.06122, '550790201003': 0.04...",...,"{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07...","{'550791854002': 0.07692, '550791002003': 0.07..."


### 4) Combine all probability tables by their month
  - First, build a table for every combination of Ws and Wd that the user wants to evaluate
  - Add `Ws` and `Wd` columns to each table
  - Save the tables to the directory and merge them by month

In [69]:
# Directory of the month-level combined probability tables.
probPath = 'E:/data/Chapter_3_data/Analysis/1_Preprocessed_prob_CBG_fromTo/Combined_by_Month/'
# Load the combined table for 2020_09.
prob_2020_09 = pd.read_csv(probPath + 'prob_2020_09_combined.csv')

In [71]:
# e.g.,
prob_2020_09.head(3)

Unnamed: 0,area,weekday_Work,weekend_Work,weekday_School,weekday_University,weekday_Dailycare,weekday_Religion,weekday_Large_shop,weekday_Etc_shop,weekday_Meals,...,weekend_Religion,weekend_Large_shop,weekend_Etc_shop,weekend_Meals,weekend_V_fr_rel,weekend_Rec_lei,weekend_Serv_trip,weekend_Others,Ws,Wd
0,550791854002,"{'550790042003': 0.36955, '550790042002': 0.63...","{'550790042003': 0.36955, '550790042002': 0.63...","{'550791854002': 0.2124, '550790201003': 0.017...",{},"{'550791854002': 0.3265, '550790201003': 0.027...","{'550791854002': 0.20281, '550790084002': 0.04...","{'550791101002': 0.35012, '550790044001': 0.47...","{'550791854002': 0.21749, '550790032001': 0.01...","{'550791854002': 0.23899, '550790201003': 0.03...",...,"{'550791854002': 0.22481, '550790091002': 0.17...",{'550791101003': 1.0},"{'550791854002': 0.35874, '550790144002': 0.22...","{'550791854002': 0.29982, '550790091002': 0.25...","{'550791854002': 0.1629, '550791002003': 0.033...","{'550791854002': 0.25523, '550790144002': 0.14...","{'550791854002': 0.23391, '550790091002': 0.13...","{'550791854002': 0.31536, '550791002003': 0.02...",0.5,0.25
1,550790033002,"{'550790040003': 0.12963, '550790019004': 0.10...","{'550790040003': 0.12963, '550790019004': 0.10...","{'550790034001': 0.26269, '550790602001': 0.02...","{'550790601012': 0.15189, '550790141001': 0.16...","{'550791201022': 0.00303, '550790050003': 0.07...","{'550790051002': 0.02014, '550790039003': 0.01...","{'550790001021': 0.02615, '550791201022': 0.00...","{'550790033002': 0.03647, '550790034001': 0.06...","{'550790033002': 0.03589, '550790065003': 0.00...",...,"{'550791854002': 0.01933, '550790051002': 0.03...","{'550791501004': 0.00128, '550791101002': 0.01...","{'550791854002': 0.01095, '550791009002': 0.01...","{'550791854002': 0.00804, '550791009002': 0.00...","{'550791854002': 0.00578, '550791009002': 0.00...","{'550791854002': 0.00862, '550791009002': 0.01...","{'550791854002': 0.0121, '550791009002': 0.007...","{'550791854002': 0.02209, '550791009002': 0.00...",0.5,0.25
2,550790908001,{'550790903002': 1.0},{'550790903002': 1.0},"{'550790127001': 0.05153, '550790907002': 0.07...","{'550791501002': 0.00509, '550791863002': 0.03...","{'550791503031': 0.00375, '550791863002': 0.03...","{'550790054003': 0.06015, '550791503031': 0.00...","{'550791501002': 0.00431, '550791503031': 0.00...","{'550790908001': 0.10723, '550790912002': 0.04...","{'550790912002': 0.07258, '550791202031': 0.00...",...,"{'550791301002': 0.00569, '550790054003': 0.08...","{'550791101002': 0.03894, '550790902002': 0.17...","{'550790908001': 0.09475, '550791301002': 0.01...","{'550791101002': 0.0122, '550790912002': 0.068...","{'550790908001': 0.07794, '550791301002': 0.00...","{'550791301002': 0.00921, '550790054003': 0.03...","{'550790908001': 0.14125, '550791301002': 0.00...","{'550790908001': 0.03699, '550791301002': 0.00...",0.5,0.25


### 5) fill empty probability

In [281]:
# Fill empty probability cells from related trip purposes.

def fill_values(row):
    """Replace empty ('{}') purpose cells with the first non-empty fallback.

    The rules run top to bottom on the same ``row``, so a cell filled by an
    earlier rule can serve as a fallback for a later one. A cell whose
    fallbacks are all empty stays '{}'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by the
            'weekday_<purpose>' / 'weekend_<purpose>' column names.

    Returns:
        The same ``row`` object, updated in place.
    """
    empty = '{}'

    # (cell to fill, fallbacks in priority order)
    fallback_plan = (
        ('weekday_Dailycare', ('weekday_Religion', 'weekday_School')),
        ('weekend_Dailycare', ('weekday_Dailycare', 'weekend_Religion', 'weekend_School')),
        ('weekday_Large_shop', ('weekday_Etc_shop',)),
        ('weekend_Large_shop', ('weekday_Large_shop', 'weekend_Etc_shop', 'weekday_Etc_shop')),
        ('weekend_Etc_shop', ('weekday_Etc_shop', 'weekend_Large_shop')),
        ('weekday_Religion', ('weekday_Dailycare', 'weekday_School')),
        ('weekend_Religion', ('weekday_Religion', 'weekend_Dailycare')),
        ('weekend_School', ('weekend_Dailycare', 'weekend_Religion')),
        ('weekend_Meals', ('weekday_Meals', 'weekend_Etc_shop')),
        ('weekend_Rec_lei', ('weekday_Rec_lei',)),
        ('weekend_Serv_trip', ('weekday_Serv_trip',)),
        ('weekend_Others', ('weekday_Others',)),
    )

    for target, donors in fallback_plan:
        if row[target] == empty:
            for donor in donors:
                if row[donor] != empty:
                    row[target] = row[donor]
                    break

    return row

In [None]:
# TODO: later, load all of them - the per-trip-purpose CBG -> CBG probabilities.
directory_path = "E:/data/Chapter_3_data/Analysis/1_Preprocessed_prob_CBG_fromTo/Combined_by_Month"

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Fill the empty cells of every monthly table and write it back in place.
for file in tqdm(csv_files):
    csv_path = os.path.join(directory_path, file)

    filled = pd.read_csv(csv_path).apply(fill_values, axis=1)

    # Expose the table under its file name (text before the first dot).
    variable_name = file.split('.')[0]
    globals()[variable_name] = filled

    # Write back to the exact file that was read. index=False stops every
    # rerun from adding another unnamed index column to the CSV.
    filled.to_csv(csv_path, index=False)

  0%|          | 0/16 [00:00<?, ?it/s]