In [None]:
# Add user specific python libraries to path
import sys
sys.path.insert(0, "/home/smehra/local-packages")

In [None]:
from dask.distributed import Client, LocalCluster

# Settings for the local Dask cluster used by the dask dataframe steps below:
# 16 worker processes with 16 threads each, dashboard served on port 7920 and
# worker scratch files kept under the user's tmp area.
cluster_options = dict(
    dashboard_address = 'localhost:7920',
    n_workers = 16,
    processes = True,
    threads_per_worker = 16,
    memory_limit = '16GB',
    local_directory = "/data/tmp/smehra/tmp/dask-worker-space",
)

# start the cluster and attach a client to it
cluster = LocalCluster(**cluster_options)
client = Client(cluster)


import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe as ddf

# enable automated generational garbage collection
import gc
gc.enable()

import time, os
from datetime import timedelta  
from datetime import date
from datetime import datetime


In [None]:
import logging

# setup logging to a specified file, so progress of the long-running
# computations below can be followed outside the notebook
log_file = '/data/tmp/smehra/logs/migration_metrics_computation.log'

# basicConfig opens the log file immediately and raises if its directory is
# missing, so make sure the directory exists first (no-op when already present)
os.makedirs(os.path.dirname(log_file), exist_ok=True)

logging.basicConfig(filename=log_file,
                    filemode='a+',
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
def get_total_migrated_after_time_delta(origin_district, impact_day, impacted_users, total_impacted, time_delta):
    """Count, per district, the impacted users seen there `time_delta` days after `impact_day`.

    Reads two module-level objects: ``users_per_district_day`` (a DataFrame with
    columns ``day_series``, ``district_id`` and ``active_users``, the latter
    holding a set of user ids per row) and ``district_ids`` (list of district ids).

    Parameters
    ----------
    origin_district
        Not used by this function; kept so the call signature stays unchanged.
    impact_day : int
        day_series value of the day the users were impacted.
    impacted_users : set
        User ids that were impacted on `impact_day`.
    total_impacted : int
        Number of impacted users (``len(impacted_users)`` at the call site).
    time_delta : int
        Number of days after `impact_day` at which users are looked up.

    Returns
    -------
    dict
        Maps district_id to the number of impacted users found among that
        district's active users on day ``impact_day + time_delta``. When that
        day is later than 2770 or nobody was impacted, every id in
        ``district_ids`` maps to NaN instead.
    """
    visit_day = impact_day + time_delta

    # No measurement is possible beyond day 2770 (the last day_series value
    # generated by this notebook) or when there are no impacted users.
    if visit_day > 2770 or total_impacted == 0:
        # np.nan is the spelling available in every NumPy release; the np.NaN
        # alias was removed in NumPy 2.0 and raises AttributeError there.
        return pd.Series(np.nan, index=district_ids, name="migrated").to_dict()

    # rows of every district on the visit day
    users_on_visit_day = users_per_district_day[users_per_district_day.day_series == visit_day]

    # size of the overlap between the impacted users and each district's active users;
    # set_index returns a new frame, so the shared global frame is never modified
    migrated_per_district = (
        users_on_visit_day
        .set_index('district_id')
        .active_users
        .apply(lambda active: len(impacted_users.intersection(active)))
    )

    return migrated_per_district.to_dict()
    

def get_migration_metrics(district_day):
    """Build the migration summary for one (origin district, impact day) pair.

    `district_day` is a two-element sequence: district id first, day_series
    value second. The users active in that district on that day are looked up
    in the module-level ``users_per_district_day`` frame, and the module-level
    ``time_delta`` decides how many days later they are searched for.

    Returns the dict produced by ``get_total_migrated_after_time_delta``
    (district id -> count) extended with the keys 'origin_district',
    'impact_day', 'impacted' and 'visit_day'.
    """
    origin_district, impact_day = district_day[0], district_day[1]

    progress_message = 'origin_district: ' + str(origin_district)  + '. impact_day: ' + str(impact_day)
    logger.info(progress_message)

    # select the single row belonging to this district on this day;
    # .item() raises unless exactly one row matches
    in_origin_district = users_per_district_day.district_id == origin_district
    on_impact_day = users_per_district_day.day_series == impact_day
    impacted_users = users_per_district_day[in_origin_district & on_impact_day].active_users.item()
    total_impacted = len(impacted_users)

    summary = get_total_migrated_after_time_delta(origin_district, impact_day, impacted_users, total_impacted, time_delta)

    # identifying fields go after the per-district counts, in this fixed order
    summary.update({
        'origin_district': origin_district,
        'impact_day': impact_day,
        'impacted': total_impacted,
        'visit_day': impact_day + time_delta,
    })

    return summary


# every day_series value handled by this notebook: 1 .. 2770 inclusive
days = pd.DataFrame({'day_series': list(range(1, 2771))})

# district ids (DISTID column of the district shapefile) in ascending order
district_ids = sorted(
    gpd.read_file('/data/afg_anon/ShapeFiles/AFG_district_398/district398.shp').DISTID.tolist()
)
districts = pd.DataFrame({'district_id': district_ids})

# cross join districts with days: both sides get the same constant helper
# column, are joined on it, and the helper column is dropped afterwards
district_day_all_permutations_df = (
    districts.assign(dummy_col = 1)
             .merge(days.assign(dummy_col = 1), on = 'dummy_col', how = 'outer')
             .drop(columns = ["dummy_col"])
)

# same pairs as a plain list of [district_id, day_series] entries
district_day_all_permutations_list = district_day_all_permutations_df.to_numpy().tolist()


## Compute visits per district day at user id level

In [None]:
%%time

# Directory holding one CSV per batch of users, with columns phoneHash1,
# home_location and day (as read below).
user_location_directory = '/data/afg_anon/displacement_metrics/home_locations/daily_modal_voice_only_2013-2020_version/'

# os.listdir returns entries in arbitrary order; sorting makes runs reproducible
for fname in sorted(os.listdir(user_location_directory)):

    # only regular files can be parsed; skip sub-directories
    if not os.path.isfile(user_location_directory + fname):
        continue

    print(str(datetime.now()) + ' reading ' + fname)
    logger.info('reading ' + fname)

    # NOTE: users_per_district_day must stay a module-level name, because
    # get_migration_metrics and get_total_migrated_after_time_delta read it as a global
    users_per_district_day = pd.read_csv(user_location_directory + fname,
                                         dtype = {'phoneHash1': object,
                                                  'home_location': 'int16',
                                                  'day': object})
    if(len(users_per_district_day) == 0):
        print(str(datetime.now()) + ' Empty dataset. Skipped.')
        logger.info('Empty dataset. Skipped.')
        continue

    # get "set" of users for each district each day
    users_per_district_day = users_per_district_day.groupby(['home_location', 'day'])['phoneHash1'].apply(set)
    users_per_district_day = users_per_district_day.reset_index()

    # parse the dates; errors='coerce' turns values that do not match the format into NaT
    users_per_district_day['day'] = pd.to_datetime(users_per_district_day.day, format='%Y-%m-%d', errors='coerce')

    # rows without a valid date cannot be placed on the day axis: drop them
    # explicitly (and say so) instead of failing in the date arithmetic below
    unparseable_day = users_per_district_day['day'].isna()
    if unparseable_day.any():
        logger.warning('Dropping ' + str(int(unparseable_day.sum())) + ' district-day groups with unparseable dates in ' + fname)
        users_per_district_day = users_per_district_day[~unparseable_day].copy()

    # convert dates to day series: number of days since 2013-03-31, so 2013-04-01 is day 1
    users_per_district_day['day_series'] = (users_per_district_day['day'] - pd.Timestamp('2013-03-31')).dt.days

    # rename and reorder columns
    users_per_district_day = users_per_district_day.rename(columns = {'home_location': 'district_id', 'phoneHash1': 'active_users'})
    users_per_district_day = users_per_district_day[['day_series', 'district_id', 'active_users']]

    # ensure we have a row for all permutations of district - day.
    users_per_district_day = district_day_all_permutations_df.merge(users_per_district_day, on = ['district_id', 'day_series'], how = 'left')

    # convert no users to empty set (NaN is the only value that differs from itself)
    users_per_district_day['active_users'] = users_per_district_day.active_users.apply(lambda u: set() if u != u else u)

    # time_delta is also read as a module-level name by get_migration_metrics
    for time_delta in [30]:

        print(str(datetime.now()) + ' Computing for time_delta = ' + str(time_delta))
        logger.info('Computing for time_delta = ' + str(time_delta))

        # DataFrame.to_csv does not create missing directories; create the
        # target up front so a missing folder cannot waste the computation below
        output_directory = '/data/tmp/smehra/aggregated_data/poppy/visits_per_district_day/time_delta_' + str(time_delta) + '_days/user_id_level/'
        os.makedirs(output_directory, exist_ok = True)

        with ThreadPool() as pool:

            # get migration metrics for impacted users of each district - day
            migration_metrics_as_list = pool.map(get_migration_metrics, district_day_all_permutations_list)

        # convert migration metrics list to dataframe
        # schema - ['origin_district', 'impact_day', 'impacted', 'visit_day', 101, 102, 103, ... 3401]
        migration_metrics_column_major = pd.DataFrame(migration_metrics_as_list)

        print(str(datetime.now()) + ' Unpivoting dataset.')
        logger.info('Unpivoting dataset.')

        # convert to row major form
        # schema - ['origin_district', 'impact_day', 'impacted', 'visit_day', 'destination_district', 'visits']
        migration_metrics = migration_metrics_column_major.melt(id_vars=['origin_district', 'impact_day', 'impacted', 'visit_day'], var_name='destination_district', value_name='visits')
        migration_metrics = migration_metrics.sort_values(['origin_district', 'impact_day']).reset_index(drop = True)
        migration_metrics = migration_metrics[['origin_district', 'impact_day', 'impacted', 'destination_district', 'visit_day', 'visits']]

        print(str(datetime.now()) + ' Saving dataset.')
        logger.info('Saving dataset.')
        migration_metrics.to_csv(output_directory + fname, index = False)



## Convert user id level metrics to impact day level

In [None]:
time_delta = 30

# user id level datasets written by the previous step, one CSV per input file
visits_dataset_directory = '/data/tmp/smehra/aggregated_data/poppy/visits_per_district_day/time_delta_' + str(time_delta) + '_days/user_id_level/'

# os.listdir order is arbitrary; sorting keeps the partition order reproducible
filepaths = sorted(visits_dataset_directory + f for f in os.listdir(visits_dataset_directory))

# Describe all user id level datasets as ONE lazy dask dataframe, once.
# read_csv accepts a list of paths, so the graph does not have to be rebuilt
# for every file on each of the 2770 iterations below.
user_id_level_data = ddf.read_csv(filepaths,
                                  dtype = {'origin_district': 'int16',
                                           'impact_day': 'int16',
                                           'impacted': 'int32',
                                           'destination_district': 'int16',
                                           'visit_day': 'int16',
                                           'visits': 'float64'})

for impact_day in range(1, 2771):

    print(str(datetime.now()) + ' Computing for impact day ' + str(impact_day))

    # keep only data for a specific impact_day, gathered from all user id datasets
    impact_day_level_data = user_id_level_data[user_id_level_data.impact_day == impact_day]

    # the target has no '*' and single_file is not set, so dask writes a
    # directory of part files under this name
    impact_day_level_data.to_csv('/data/tmp/smehra/aggregated_data/poppy/visits_per_district_day/time_delta_' + str(time_delta) + '_days/impact_day_level/impact_day_' + str(impact_day), index = False)



## Aggregate impact day level metrics

In [None]:
time_delta = 30

# folder holding one sub-folder of part files per impact day
datasets_directory = '/data/tmp/smehra/aggregated_data/poppy/visits_per_district_day/time_delta_' + str(time_delta) + '_days/impact_day_level/'

# the aggregated output has one row per combination of these columns
group_columns = ['origin_district', 'impact_day', 'destination_district', 'visit_day']

for impact_day in range(1, 2771):

    print(str(datetime.now()) + ' Computing for impact day ' + str(impact_day))
    impact_day_folder = datasets_directory + 'impact_day_' + str(impact_day)

    # lazily read every part file of this impact day
    impact_day_data = ddf.read_csv(
        impact_day_folder + '/*',
        dtype = {
            'origin_district': 'int16',
            'impact_day': 'int16',
            'impacted': 'int32',
            'destination_district': 'int16',
            'visit_day': 'int16',
            'visits': 'float64',
        },
    )

    # total number of "impacted" users per group
    impacted_agg = impact_day_data.groupby(group_columns)['impacted'].sum().reset_index()

    # total "visits" per group, computed only from rows that carry a visits
    # value, so groups with nothing but missing values are absent here
    rows_with_visits = impact_day_data[impact_day_data.visits.notnull()]
    visits_agg = rows_with_visits.groupby(group_columns)['visits'].sum().reset_index()

    # left merge keeps every group of impacted_agg; groups without any visits
    # value end up with a missing "visits" entry
    impacted_and_visits_merged = impacted_agg.merge(visits_agg, on = group_columns, how = 'left')

    # write the result for this impact day as a single csv file
    impacted_and_visits_merged.to_csv('/data/tmp/smehra/aggregated_data/poppy/visits_per_district_day/time_delta_' + str(time_delta) + '_days/aggregated/impact_day_' + str(impact_day) + '.csv', index = False, single_file = True)
