In [1]:
# Add user specific python libraries to path.
# The directory is inserted at position 0, so modules found there take
# precedence over every other entry on the module search path.
# NOTE(review): this is an absolute, machine-specific location.
import sys
sys.path.insert(0, "/home/smehra/local-packages")

In [2]:
# Set up a local dask cluster and connect a client to it.
# The cluster is requested with 6 worker processes of 8 threads each and
# serves its dashboard on localhost:7900.
# NOTE(review): memory_limit is presumably applied per worker (6 x 32GB in
# total) -- confirm against the dask.distributed documentation.
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(dashboard_address='localhost:7900', 
                       n_workers=6, 
                       processes=True, 
                       threads_per_worker=8, 
                       memory_limit='32GB', 
                       local_directory="/data/tmp/smehra/tmp/dask-worker-space")
client = Client(cluster)

import multiprocessing as mp

import dask.dataframe as dd
import numpy as np
import pandas as pd

# enable automated generational garbage collection
import gc
gc.enable()

import time
from datetime import timedelta  
from datetime import date
from datetime import datetime


In [3]:
import logging

# Append timestamped, INFO-level (and above) records to a dedicated log file.
log_file = '/data/tmp/smehra/logs/migration_metrics_computation.log'
logging.basicConfig(
    level=logging.INFO,
    filename=log_file,
    filemode='a+',
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s %(message)s',
)
logger = logging.getLogger(__name__)


## Get each user's location per day

In [4]:
%%time

def get_optimized_column_types_for_user_segments_data():
    """Build the dtype mapping handed to the CSV reader for the user-location files.

    Returns:
        dict: ``'userId'`` mapped to ``object``, followed by the integer keys
        1..1461 (one per day), each mapped to ``'float32'``.
    """
    day_keys = np.arange(1, 1462, 1).tolist()

    # the identifier column first, then one compact float entry per day
    dtype_by_column = {'userId': object}
    dtype_by_column.update({day_key: 'float32' for day_key in day_keys})
    return dtype_by_column
            
# Read every district-level CSV through dask and immediately collect the
# result with .compute(), so that user_locations holds the computed data
# rather than a lazy dask collection.
user_locations = dd.read_csv(
    '/data/tmp/smehra/aggregated_data/afgh-displacement/migration_detector_output_data/district_level/*.csv',
    dtype=get_optimized_column_types_for_user_segments_data(),
).compute()

user_locations.head()




CPU times: user 2min 13s, sys: 3min 33s, total: 5min 46s
Wall time: 10min 57s


Unnamed: 0,userId,1,2,3,4,5,6,7,8,9,...,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461
0,0e3VqrADB4z6lxo7,,,,,,,,,,...,,,,,,,,,,
1,0e3VqrA3eBDVlxo7,,,,,,,,,,...,,,,,,,,,,
2,0e3VqrA5kyjAlxo7,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,...,,,,,,,,,,
3,0mDzqJpd1OEeQJeR,,,,,,,,,,...,,,,,,,,,,
4,0mDzqJpaOjLXQJeR,,,,,,,,,,...,,,,,,,,,,


In [5]:
# number of rows in the combined user-location table
len(user_locations)

10477778

In [6]:
# number of distinct userId values; compare with the row count to check
# whether any user appears on more than one row
len(user_locations.userId.unique())

10477778

## Get users for each district - day

In [5]:
# create an empty dataframe with a single 'district_id' column;
# it is filled with the district ids listed below

districts = pd.DataFrame(columns = ['district_id']) 
district_ids = [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 201, 202, 203, 204, 205, 206, 207, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 401, 402, 403, 404, 405, 406, 407, 408, 409, 501, 502, 503, 504, 505, 506, 507, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 901, 902, 903, 904, 905, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2111, 2201, 2202, 2203, 2204, 2205, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2601, 2602, 2603, 2604, 2605, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2901, 2902, 2903, 2904, 2905, 2906, 2907, 2908, 2909, 2910, 2911, 2912, 2913, 
2914, 2915, 2916, 2917, 2918, 2919, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008, 3101, 3102, 3103, 3104, 3105, 3106, 3107, 3201, 3202, 3203, 3204, 3205, 3206, 3207, 3208, 3209, 3210, 3211, 3212, 3213, 3301, 3302, 3303, 3304, 3305, 3306, 3307, 3401, 3402, 3403, 3404, 3405, 3406, 3407, 3408, 3409]
districts['district_id'] = district_ids

# one row per day_series value, 1..1461
days = pd.DataFrame(columns = ['day_series'])
days['day_series'] = np.arange(1, 1462, 1).tolist()

# Cross join districts x days: give both frames the same constant helper
# column, merge on it, then drop it again.
# drop(columns=...) is used instead of drop('dummy_col', 1) because passing
# the axis positionally is no longer accepted from pandas 2.0 onwards.
district_day_all_permutations_df = (
    districts.assign(dummy_col=1)
    .merge(days.assign(dummy_col=1), on='dummy_col')
    .drop(columns='dummy_col')
)

# the same pairs as plain [district_id, day_series] lists, used as work items
district_day_all_permutations_list = district_day_all_permutations_df.to_numpy().tolist()

district_day_all_permutations_df.head()


Unnamed: 0,district_id,day_series
0,101,1
1,101,2
2,101,3
3,101,4
4,101,5


In [6]:
%%time

def get_location_of_users_for_day_series(daySeries):
    """Collect, for one day, the set of users observed in each district.

    Reads the notebook-level ``user_locations`` frame and groups its
    ``userId`` values by the column named ``str(daySeries)``.

    Args:
        daySeries: day number; ``str(daySeries)`` must be a column of
            ``user_locations``.

    Returns:
        pandas.DataFrame with columns ``district_id`` (the group key),
        ``active_users`` (a set of userId values) and ``day_series``
        (``daySeries`` repeated on every row).
    """
    day_column = str(daySeries)

    # one set of user ids per distinct value of that day's column
    users_by_district = user_locations.groupby(day_column)['userId'].apply(set)

    per_district = users_by_district.reset_index(name='active_users')
    per_district = per_district.rename(columns={day_column: 'district_id'})
    per_district['day_series'] = daySeries

    return per_district

# Build the per-day user sets in a pool of two worker processes,
# one task per day_series value.
day_series_list = np.arange(1, 1462, 1).tolist()
with mp.Pool(2) as pool:
    location_of_users_per_day_series = pool.map(get_location_of_users_for_day_series, day_series_list)

# Stack the per-day frames into a single long frame with a fresh index.
users_per_district_day = pd.concat(location_of_users_per_day_series, ignore_index=True)
users_per_district_day.head()



CPU times: user 19min 10s, sys: 3min 37s, total: 22min 47s
Wall time: 44min 29s


Unnamed: 0,district_id,active_users,day_series
0,101.0,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa...",1
1,102.0,"{j9XkQmAgvPVwQPBG, vBNal7V7JrzXQpme, GDdNlzWnD...",1
2,103.0,"{mYGNQABOWm3GlvZ9, LzVvQa1pk5mXQMdG, j9XkQmAbD...",1
3,104.0,"{javpljpon7ma2BL0, nWoGQGKWLwDRQwON, gODYQnEZ8...",1
4,105.0,"{nWoGQGKzr1x1QwON, n5p3lgj305V0Q6Zw, 7orJ23Roo...",1


In [7]:
# ensure we have a row for all permutations of district - day.
# users_per_district_day only has rows for district - day pairs with at least 1 user
users_per_district_day = district_day_all_permutations_df.merge(
    users_per_district_day, on=['district_id', 'day_series'], how='left')

# Convert "no users" (NaN after the left merge) to an empty set.
# Each row gets its own set object: assigning the one-element list [set()]
# through .loc fails with a length mismatch on current pandas when several
# rows are missing, and on older versions made every row share one mutable set.
users_per_district_day['active_users'] = users_per_district_day['active_users'].apply(
    lambda users: users if isinstance(users, set) else set())

# sort by district_id, day_series; reset the index; reorder the columns
users_per_district_day = (
    users_per_district_day
    .sort_values(['district_id', 'day_series'])
    .reset_index(drop=True)
    [['day_series', 'district_id', 'active_users']]
)

users_per_district_day.head()


Unnamed: 0,day_series,district_id,active_users
0,1,101,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa..."
1,2,101,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa..."
2,3,101,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa..."
3,4,101,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa..."
4,5,101,"{bV6DQwW47nYGQ3ek, DO3L2BdyMny0lozp, gODYQnEZa..."


In [8]:
# the dask client is not used by the remaining cells, so shut it down
client.shutdown()

# drop the references to the large intermediates, then run a collection
# pass (its return value is shown as the cell output)
del user_locations, location_of_users_per_day_series
gc.collect()

1316

## Compute visits per district day

In [9]:
def get_total_migrated_as_per_day_k(district, daySeries, impacted_users, total_impacted, k, last_day=1461):
    """Count, per district, how many impacted users are seen there k days later.

    Reads the notebook-level ``users_per_district_day`` frame (columns
    ``day_series``, ``district_id``, ``active_users``).

    Note: a later cell of this notebook defines another function with the
    same name and a different return value; whichever cell ran last wins.

    Args:
        district: origin district id (not used by this function).
        daySeries: day on which ``impacted_users`` were observed.
        impacted_users: set of user ids observed in the origin district.
        total_impacted: number of impacted users, as computed by the caller.
        k: number of days after ``daySeries`` to look at.
        last_day: last ``day_series`` value covered by the data.

    Returns:
        dict mapping district_id to the number of impacted users found in
        that district on day ``daySeries + k``. When that day lies beyond
        ``last_day`` or there are no impacted users, every district id in
        ``users_per_district_day`` maps to NaN instead.
    """
    if (daySeries + k) > last_day or total_impacted == 0:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
        return dict.fromkeys(users_per_district_day.district_id.unique().tolist(), np.nan)

    rows_k_days_later = users_per_district_day[users_per_district_day.day_series == (daySeries + k)]

    return {
        district_id: len(impacted_users.intersection(users))
        for district_id, users in zip(rows_k_days_later.district_id.tolist(),
                                      rows_k_days_later.active_users)
    }
    

def get_migration_metrics(district_day, lag_days=None):
    """Build the per-destination visit summary for one origin district-day.

    Reads the notebook-level ``users_per_district_day`` frame and ``logger``.

    Args:
        district_day: sequence whose first item is the origin district id
            and whose second item is the day_series value.
        lag_days: number of days after the impact day to look at. When
            omitted, the notebook-level variable ``k`` is used, which is how
            the function behaves when mapped over the work list inside the
            ``for k in ...`` loop.

    Returns:
        dict: the mapping returned by ``get_total_migrated_as_per_day_k``
        extended with the keys ``origin_district``, ``impact_day``,
        ``impacted`` and ``visit_day``.
    """
    district, daySeries = district_day[0], district_day[1]

    # an explicit argument wins; otherwise fall back to the global loop variable
    lag = k if lag_days is None else lag_days

    is_origin_row = ((users_per_district_day.district_id == district) &
                     (users_per_district_day.day_series == daySeries))
    # .item() requires exactly one matching row for this district-day
    impacted_users = users_per_district_day[is_origin_row].active_users.item()
    total_impacted = len(impacted_users)

    summary = get_total_migrated_as_per_day_k(district, daySeries, impacted_users, total_impacted, lag)

    logger.info('Origin_district: ' + str(district)  + '. Impact_day: ' + str(daySeries))

    summary['origin_district'] = district
    summary['impact_day'] = daySeries
    summary['impacted'] = total_impacted
    summary['visit_day'] = daySeries + lag

    return summary


In [None]:
%%time

# Compute and save one visits file per lag k.
# k takes the even values 2, 4, ..., 120; multiples of 10 are skipped.
# NOTE: k is a notebook-level variable on purpose -- get_migration_metrics
# reads it as a global while it is mapped over the work list below.
for k in np.arange(2, 121, 2):

    if(k%10 == 0):
        continue

    print('Computing for k = ' + str(k))
    logger.info('Computing for k = ' + str(k))

    # alternative 1 (disabled): compute single district-days one by one
    #district_day_metrics1 = get_migration_metrics(district_day_all_permutations_list[1446])
    #district_day_metrics2 = get_migration_metrics(district_day_all_permutations_list[29000])
    #district_day_metrics_df = pd.DataFrame([district_day_metrics1])

    # active path: all district-days in this process, without multi processing.
    # map() is lazy; the work happens when pd.DataFrame consumes the iterator.
    district_day_metrics = map(get_migration_metrics, district_day_all_permutations_list)
    district_day_metrics_df = pd.DataFrame(district_day_metrics)

    # alternative 2 (disabled): with multi processing
    #with mp.Pool(4) as pool:

        #district_day_metrics = pool.map(get_migration_metrics, district_day_all_permutations_list)
        #district_day_metrics_df = pd.DataFrame(district_day_metrics)

    print('Unpivoting dataset.')
    logger.info('Unpivoting dataset.')

    # wide -> long: every column that is not an id column becomes a
    # (destination_district, visits) row
    district_day_metrics_unpivoted = district_day_metrics_df.melt(id_vars=['origin_district', 'impact_day', 'impacted', 'visit_day'], var_name='destination_district', value_name='visits')
    district_day_metrics_unpivoted = district_day_metrics_unpivoted.sort_values(['origin_district', 'impact_day']).reset_index(drop = True)
    district_day_metrics_unpivoted = district_day_metrics_unpivoted[['origin_district', 'impact_day', 'impacted',
                                                                   'destination_district', 'visit_day', 'visits']]

    print('Saving dataset.')
    logger.info('Saving dataset.')
    # one output file per k
    district_day_metrics_unpivoted.to_csv('/data/afg_anon/displacement_metrics/visits_per_district_day/k_' + str(k) + '.csv', index = False)

    # drop this iteration's large objects before starting the next k
    del district_day_metrics
    del district_day_metrics_df
    del district_day_metrics_unpivoted
    gc.collect()


Computing for k = 2
Unpivoting dataset.
Saving dataset.
Computing for k = 4
Unpivoting dataset.
Saving dataset.
Computing for k = 6
Unpivoting dataset.
Saving dataset.
Computing for k = 8
Unpivoting dataset.
Saving dataset.
Computing for k = 12
Unpivoting dataset.
Saving dataset.
Computing for k = 14
Unpivoting dataset.
Saving dataset.
Computing for k = 16
Unpivoting dataset.
Saving dataset.
Computing for k = 18
Unpivoting dataset.
Saving dataset.
Computing for k = 22
Unpivoting dataset.
Saving dataset.
Computing for k = 24


## Compute percentage migrated per district day

In [8]:
# calculate percentage migrated for each district day

def get_total_non_migrated_as_per_day_k(district, daySeries, impacted_users, total_impacted, k, last_day=1461):
    """Count the impacted users that are still seen in the same district k days later.

    Reads the notebook-level ``users_per_district_day`` frame (columns
    ``day_series``, ``district_id``, ``active_users``).

    Args:
        district: district id in which ``impacted_users`` were observed.
        daySeries: day on which they were observed.
        impacted_users: set of user ids observed in ``district`` on ``daySeries``.
        total_impacted: number of impacted users, as computed by the caller.
        k: number of days after ``daySeries`` to look at.
        last_day: last ``day_series`` value covered by the data.

    Returns:
        The size of the intersection between ``impacted_users`` and the users
        of ``district`` on day ``daySeries + k``; NaN when that day lies
        beyond ``last_day`` or there are no impacted users.
    """
    if (daySeries + k) > last_day or total_impacted == 0:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan

    is_same_district_later = ((users_per_district_day.district_id == district) &
                              (users_per_district_day.day_series == (daySeries + k)))
    # .item() requires exactly one matching row for this district-day
    users_in_district_k_days_later = users_per_district_day[is_same_district_later].active_users.item()

    return len(impacted_users.intersection(users_in_district_k_days_later))


def get_total_migrated_as_per_day_k(district, daySeries, impacted_users, total_impacted, k, last_day=1461):
    """Count the impacted users that are seen in any other district k days later.

    Reads the notebook-level ``users_per_district_day`` frame (columns
    ``day_series``, ``district_id``, ``active_users``).

    Args:
        district: district id in which ``impacted_users`` were observed.
        daySeries: day on which they were observed.
        impacted_users: set of user ids observed in ``district`` on ``daySeries``.
        total_impacted: number of impacted users, as computed by the caller.
        k: number of days after ``daySeries`` to look at.
        last_day: last ``day_series`` value covered by the data.

    Returns:
        The number of distinct impacted users present in at least one district
        other than ``district`` on day ``daySeries + k``; NaN when that day
        lies beyond ``last_day`` or there are no impacted users.
    """
    if (daySeries + k) > last_day or total_impacted == 0:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan

    is_other_district_later = ((users_per_district_day.district_id != district) &
                               (users_per_district_day.day_series == (daySeries + k)))
    users_per_other_district = users_per_district_day[is_other_district_later].active_users.tolist()

    # Intersect district by district before taking the union: the result is
    # the same as intersecting with the union of all other districts, but
    # never materialises that (potentially very large) combined set.
    migrated_users = set().union(
        *(impacted_users.intersection(users) for users in users_per_other_district))

    return len(migrated_users)
    

def get_migration_metrics(district_day, window=None):
    """Build the migrated / non-migrated counts for one district-day.

    Reads the notebook-level ``users_per_district_day`` frame.

    Args:
        district_day: sequence whose first item is the district id and whose
            second item is the day_series value.
        window: optional ``(first_k, last_k)`` pair, both inclusive, of lags
            to evaluate. When omitted, the notebook-level variables
            ``k_lower`` and ``k_upper`` are used, which is how the function
            behaves when mapped over the work list with a single argument.

    Returns:
        dict with the keys ``district``, ``day_series`` and ``impacted`` plus,
        for every lag k in the window, ``non_migrated_as_per_day_<k>`` and
        ``migrated_as_per_day_<k>``.
    """
    district, daySeries = district_day[0], district_day[1]

    # an explicit window wins; otherwise fall back to the global bounds
    first_k, last_k = (k_lower, k_upper) if window is None else window

    is_origin_row = ((users_per_district_day.district_id == district) &
                     (users_per_district_day.day_series == daySeries))
    # .item() requires exactly one matching row for this district-day
    impacted_users = users_per_district_day[is_origin_row].active_users.item()
    total_impacted = len(impacted_users)

    summary = {'district': district,
               'day_series': daySeries,
               'impacted': total_impacted}

    for k in range(first_k, last_k + 1):
        summary['non_migrated_as_per_day_' + str(k)] = get_total_non_migrated_as_per_day_k(
            district, daySeries, impacted_users, total_impacted, k)
        summary['migrated_as_per_day_' + str(k)] = get_total_migrated_as_per_day_k(
            district, daySeries, impacted_users, total_impacted, k)

    return summary


In [9]:
%%time

# set migration window for generating migration_metrics
k_lower = 111
k_upper = 120

with mp.Pool(8) as pool:

    # get migration metrics for each district-day from k_lower to k_upper
    district_day_metrics = pool.map(get_migration_metrics, district_day_all_permutations_list)

# convert migration_metrics to a dataframe once (it was previously built twice
# from the full result list), save it as csv and preview it
output_dir = '/data/afg_anon/displacement_metrics/percentage_migrated_per_district_day/'
file_name = 'district_day_metrics_k' + str(k_lower) + '_to_k' + str(k_upper) + '.csv'
district_day_metrics_df = pd.DataFrame(district_day_metrics)
district_day_metrics_df.to_csv(output_dir + file_name, index = False)

district_day_metrics_df.head()



CPU times: user 2h 8min 11s, sys: 29min 34s, total: 2h 37min 46s
Wall time: 1d 1h 11min 39s


Unnamed: 0,district,day_series,impacted,non_migrated_as_per_day_111,migrated_as_per_day_111,non_migrated_as_per_day_112,migrated_as_per_day_112,non_migrated_as_per_day_113,migrated_as_per_day_113,non_migrated_as_per_day_114,...,non_migrated_as_per_day_116,migrated_as_per_day_116,non_migrated_as_per_day_117,migrated_as_per_day_117,non_migrated_as_per_day_118,migrated_as_per_day_118,non_migrated_as_per_day_119,migrated_as_per_day_119,non_migrated_as_per_day_120,migrated_as_per_day_120
0,101,1,86936,61902.0,7257.0,61728.0,7294.0,61558.0,7357.0,61328.0,...,60927.0,7579.0,60819.0,7628.0,60626.0,7653.0,60417.0,7719.0,60262.0,7738.0
1,101,2,100962,70831.0,8614.0,70634.0,8690.0,70378.0,8773.0,70149.0,...,69807.0,9000.0,69588.0,9027.0,69349.0,9095.0,69171.0,9122.0,68895.0,9153.0
2,101,3,107369,74474.0,9437.0,74199.0,9521.0,73953.0,9596.0,73716.0,...,73365.0,9786.0,73107.0,9849.0,72923.0,9878.0,72631.0,9916.0,72254.0,10082.0
3,101,4,113819,78169.0,10200.0,77906.0,10277.0,77653.0,10380.0,77522.0,...,77012.0,10545.0,76820.0,10571.0,76516.0,10623.0,76125.0,10790.0,75663.0,11064.0
4,101,5,119551,81330.0,10950.0,81062.0,11057.0,80922.0,11125.0,80670.0,...,80185.0,11272.0,79871.0,11320.0,79464.0,11492.0,78996.0,11767.0,78731.0,12040.0
