In [1]:
# Add user specific python libraries to path
import sys
sys.path.insert(0, "/home/smehra/local-packages")

In [2]:
import multiprocessing as mp

#import dask.dataframe as dd
import numpy as np
import pandas as pd
import geopandas as gpd

# enable automated generational garbage collection
import gc
gc.enable()

import time, os
from datetime import timedelta  
from datetime import date
from datetime import datetime


In [3]:
import logging

# Send this notebook's log records to a dedicated file; 'a+' appends to
# whatever an earlier run already wrote there.
log_file = '/data/tmp/smehra/logs/migration_metrics_computation.log'

logging_options = {
    'filename': log_file,
    'filemode': 'a+',
    'format': '%(asctime)s %(levelname)s %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
    'level': logging.INFO,
}
logging.basicConfig(**logging_options)

logger = logging.getLogger(__name__)


## Get users for each district - day

In [4]:
def pd_read_csv(fname):
    """Read one CSV into a DataFrame with fixed dtypes for the known columns.

    `phoneHash1` and `day` are kept as Python objects (so they stay strings)
    and `home_location` is stored as int16.
    """
    column_types = {
        'phoneHash1': object,
        'home_location': 'int16',
        'day': object,
    }
    return pd.read_csv(fname, dtype=column_types)


user_location_directory = '/data/afg_anon/displacement_metrics/home_locations/daily_modal_voice_only_2013-2020_version/'

# os.listdir returns directory entries in arbitrary order; sort them so the row
# order of user_locations (and the preview below) is the same on every run.
filepaths = [os.path.join(user_location_directory, f)
             for f in sorted(os.listdir(user_location_directory))]

# read every file in the directory and stack the results into a single frame
user_locations = pd.concat(map(pd_read_csv, filepaths), ignore_index=True)
user_locations.head()


Unnamed: 0,phoneHash1,day,home_location
0,DxOvqKO100GmqeLX,2013-04-01,607
1,DxOvqKO104AYqeLX,2013-04-01,2201
2,DxOvqKO104MGqeLX,2013-04-01,2201
3,DxOvqKO104ZJqeLX,2013-04-01,2201
4,DxOvqKO104eyqeLX,2013-04-01,301


In [None]:
# For every (home_location, day) pair, gather the phone hashes seen there into
# a set, then turn the group keys back into ordinary columns.
users_per_district_day = (
    user_locations
    .groupby(['home_location', 'day'])['phoneHash1']
    .apply(set)
    .reset_index()
)

users_per_district_day.head()


In [None]:
# convert dates to day series

# parse the day strings; anything that is not a valid YYYY-MM-DD date becomes NaT
users_per_district_day['day'] = pd.to_datetime(users_per_district_day['day'], format='%Y-%m-%d', errors='coerce')

# A row without a valid date cannot be placed on the day axis. Drop such rows
# explicitly (and say so in the log) instead of failing on NaT arithmetic.
unparseable_days = users_per_district_day['day'].isna()
if unparseable_days.any():
    logger.warning('Dropping ' + str(int(unparseable_days.sum())) + ' district - day rows with unparseable dates.')
    users_per_district_day = users_per_district_day[~unparseable_days].reset_index(drop=True)

# day_series = whole days elapsed since 2013-03-31, so 2013-04-01 maps to 1.
# Vectorised timedelta arithmetic replaces a per-row Python lambda.
users_per_district_day['day_series'] = (users_per_district_day['day'] - pd.Timestamp(2013, 3, 31)).dt.days

users_per_district_day.head()


In [None]:
# Switch to the column names used in the rest of the notebook and keep only
# the three columns needed downstream, in a fixed order.
users_per_district_day = (
    users_per_district_day
    .rename(columns={'home_location': 'district_id', 'phoneHash1': 'active_users'})
    [['day_series', 'district_id', 'active_users']]
)

users_per_district_day.head()


In [7]:
# user_locations has already been aggregated into users_per_district_day;
# drop the name and run a garbage collection pass.
del user_locations
gc.collect()

20

In [10]:
# Build the full grid of (district, day_series) pairs so that every district
# has a row for every day, whether or not any user was seen there.

# day_series values 1 .. 2770
days = pd.DataFrame({'day_series': list(range(1, 2771))})

# district ids from the district shapefile, in ascending order
district_ids = sorted(gpd.read_file('/data/afg_anon/ShapeFiles/AFG_district_398/district398.shp').DISTID.tolist())
districts = pd.DataFrame({'district_id': district_ids})

# cartesian product of districts and days; the district varies slowest
district_day_all_permutations_df = pd.MultiIndex.from_product(
    [districts['district_id'], days['day_series']],
    names=['district_id', 'day_series'],
).to_frame(index=False)
district_day_all_permutations_list = district_day_all_permutations_df.to_numpy().tolist()

# attach the observed active_users to the grid; pairs without data get NaN
users_per_district_day = district_day_all_permutations_df.merge(
    users_per_district_day,
    how='left',
    on=['district_id', 'day_series'],
)

users_per_district_day.head()


Unnamed: 0,district_id,day_series,active_users
0,101,1,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, 305xqe0Vp..."
1,101,2,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, 305xqe0Vp..."
2,101,3,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."
3,101,4,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."
4,101,5,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."


In [11]:
# District - day pairs without any users came out of the left merge as NaN;
# replace those with an empty set so every row holds a set.
def _as_user_set(users):
    # NaN is the only value here that compares unequal to itself
    return set() if users != users else users

users_per_district_day['active_users'] = users_per_district_day['active_users'].map(_as_user_set)

users_per_district_day.head()


Unnamed: 0,district_id,day_series,active_users
0,101,1,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, 305xqe0Vp..."
1,101,2,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, 305xqe0Vp..."
2,101,3,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."
3,101,4,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."
4,101,5,"{EmOKlkoz7Bbelp1A, DxOvqKOAd7wWqeLX, DxOvqKO1x..."


## Compute visits per district day

In [None]:
def get_total_migrated_after_time_delta(origin_district, impact_day, impacted_users, total_impacted, time_delta,
                                        max_day_series=2770):
    """Count, per district, how many impacted users are active there `time_delta` days later.

    Reads two notebook-level names: `users_per_district_day` (columns
    district_id, day_series, active_users) and `district_ids`.

    Parameters
    ----------
    origin_district
        Not used in the computation; kept so existing callers keep working.
    impact_day : int
        day_series value on which the users were observed.
    impacted_users : set
        Users to look for on the visit day.
    total_impacted : int
        Number of impacted users; 0 short-circuits to the all-NaN result.
    time_delta : int
        Number of days between impact_day and the visit day.
    max_day_series : int, optional
        Last day_series value covered by the data (default 2770).

    Returns
    -------
    dict
        district_id -> number of impacted users active in that district on
        day `impact_day + time_delta`. Every value is NaN when the visit day
        is past `max_day_series`, when there are no impacted users, or when
        `users_per_district_day` has no rows for the visit day.
    """
    visit_day = impact_day + time_delta

    if visit_day > max_day_series or total_impacted == 0:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        # A fresh dict is built on every call because callers add keys to it.
        return dict.fromkeys(district_ids, np.nan)

    users_on_visit_day = users_per_district_day[users_per_district_day.day_series == visit_day]

    if users_on_visit_day.empty:
        # no data for that day: report "unknown" for every district rather
        # than an empty dict that would drop the district columns downstream
        return dict.fromkeys(district_ids, np.nan)

    return {
        district_id: len(impacted_users.intersection(active_users))
        for district_id, active_users in zip(users_on_visit_day.district_id,
                                             users_on_visit_day.active_users)
    }
    

def get_migration_metrics(district_day):
    """Build the summary record for one (origin district, impact day) pair.

    `district_day` is indexable: position 0 is the origin district id and
    position 1 the impact day (a day_series value). The function reads the
    notebook-level `users_per_district_day`, `time_delta` and `logger`.

    Returns the dict produced by `get_total_migrated_after_time_delta` with
    the keys 'origin_district', 'impact_day', 'impacted' and 'visit_day'
    added to it.
    """
    origin_district, impact_day = district_day[0], district_day[1]

    progress_message = 'origin_district: ' + str(origin_district) + '. impact_day: ' + str(impact_day)
    logger.info(progress_message)

    # the one row describing who was in the origin district on the impact day;
    # .item() raises unless exactly one row matches
    at_origin_on_impact_day = (
        (users_per_district_day.district_id == origin_district)
        & (users_per_district_day.day_series == impact_day)
    )
    impacted_users = users_per_district_day.loc[at_origin_on_impact_day, 'active_users'].item()
    total_impacted = len(impacted_users)

    summary = get_total_migrated_after_time_delta(origin_district, impact_day, impacted_users, total_impacted, time_delta)

    # identifying fields go after the per-district counts, in this order
    summary.update({
        'origin_district': origin_district,
        'impact_day': impact_day,
        'impacted': total_impacted,
        'visit_day': impact_day + time_delta,
    })

    return summary


In [None]:
# number of days between the impact day and the day on which visits are counted;
# get_migration_metrics reads this notebook-level name
time_delta = 30

output_directory = '/data/afg_anon/displacement_metrics/visits_per_district_day/using_2013-2020_data/'
output_file = output_directory + 'time_delta_' + str(time_delta) + '_days.csv'

# Create the output directory before the long computation starts, so a missing
# or unwritable location fails here and not after all the work is done.
os.makedirs(output_directory, exist_ok=True)

print(str(datetime.now()) + ' Computing for time_delta = ' + str(time_delta))
logger.info('Computing for time_delta = ' + str(time_delta))

# get migration metrics for impacted users of each district - day
# (one dict per district - day pair, collected into a real list)
migration_metrics_as_list = [get_migration_metrics(district_day)
                             for district_day in district_day_all_permutations_list]

# convert migration metrics list to dataframe
# schema - ['origin_district', 'impact_day', 'impacted', 'visit_day', 101, 102, 103, ... 3401]
migration_metrics_column_major = pd.DataFrame(migration_metrics_as_list)

print(str(datetime.now()) + ' Unpivoting dataset.')
logger.info('Unpivoting dataset.')

# convert to row major form
# schema - ['origin_district', 'impact_day', 'impacted', 'visit_day', 'destination_district', 'visits']
migration_metrics = migration_metrics_column_major.melt(
    id_vars=['origin_district', 'impact_day', 'impacted', 'visit_day'],
    var_name='destination_district',
    value_name='visits',
)
migration_metrics = migration_metrics.sort_values(['origin_district', 'impact_day']).reset_index(drop=True)
migration_metrics = migration_metrics[['origin_district', 'impact_day', 'impacted', 'destination_district', 'visit_day', 'visits']]

print(str(datetime.now()) + ' Saving dataset.')
logger.info('Saving dataset.')
migration_metrics.to_csv(output_file, index=False)

    