In [1]:
import sys
# Append the user-local package directory to the module search path so that
# modules stored there can be imported from this notebook.
sys.path.append('/home/smehra/local-packages')

In [2]:
# Start a local Dask cluster (4 worker processes, 8 threads each) and attach a
# client to it; the dashboard is served on localhost:7900.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=8,
    processes=True,
    memory_limit='32GB',
    dashboard_address='localhost:7900',
    local_directory="/data/tmp/smehra/tmp/dask-worker-space",
)
client = Client(cluster)

import dask.dataframe as ddf
import geopandas as gpd
import pandas as pd
import numpy as np

import os
import gc
from datetime import datetime
import datetime as dt

import matplotlib.pyplot as plt
%matplotlib inline

## Compute Data

In [3]:
# District -> province lookup taken from the 398-district shapefile; only the
# two id columns are kept and the result is a plain pandas DataFrame.
districts = pd.DataFrame(
    gpd.read_file("/data/afg_anon/ShapeFiles/AFG_district_398/district398.shp")[['DISTID', 'PROVID']]
    .rename(columns = {'DISTID': 'district_id', 'PROVID': 'province_id'})
)

# Kabul, Kandahar, Hirat, Mazari Sharif, Jalalabad
major_cities = pd.DataFrame({"district_id": [101, 2401, 2001, 1601, 801]})
major_cities_minus_kabul = pd.DataFrame({"district_id": [2401, 2001, 1601, 801]})

# Provinces are numbered 1..34 and each capital district id is
# <province_id> * 100 + 1 (101, 201, ..., 3401).
provicial_capitals = pd.DataFrame({"province_id": list(range(1, 35))})
provicial_capitals["capital_district"] = provicial_capitals.province_id * 100 + 1

# Same table without the five major cities listed above.
provicial_capitals_minus_major = provicial_capitals[
    ~provicial_capitals.capital_district.isin(major_cities.district_id)
].reset_index(drop = True)

# Attach the capital district of its province to every district.
districts = districts.merge(provicial_capitals, how = "left", on = "province_id")

districts.head()


Unnamed: 0,district_id,province_id,capital_district
0,2007,20,2001
1,2006,20,2001
2,2016,20,2001
3,2008,20,2001
4,2009,20,2001


In [4]:
# create an empty dataframe where we will store per district per day counts
# Day offsets 1..1461 used as `impact_day` values.
days = pd.DataFrame({'impact_day': np.arange(1, 1462, 1).tolist()})

# Cross join districts x days through a constant helper key. The helper column
# is dropped by keyword: passing `axis` positionally (`drop('dummy_col', 1)`)
# is no longer accepted by pandas >= 2.0.
district_day_all_permutations_df = (
    districts[["district_id"]]
    .assign(dummy_col = 1)
    .merge(days.assign(dummy_col = 1), on = 'dummy_col')
    .drop(columns = 'dummy_col')
    .rename(columns = {'district_id': 'origin_district'})
)
district_day_all_permutations_df.head()


Unnamed: 0,origin_district,impact_day
0,2007,1
1,2007,2
2,2007,3
3,2007,4
4,2007,5


In [5]:
# Load the violent-events table and parse `date_start` (formatted YYYY/MM/DD)
# after reading; the `date_parser` argument of read_csv is deprecated in
# pandas >= 2.0.
events = pd.read_csv('/data/tmp/smehra/aggregated_data/afgh-displacement/afghanistan_violent_events_updated-7-29-2020.csv')
events['date_start'] = pd.to_datetime(events['date_start'], format = '%Y/%m/%d')

# Day offset relative to 2013-03-31, so 2013-04-01 is day 1 (vectorised
# instead of a per-row apply).
events['day_series'] = (events['date_start'] - pd.Timestamp(2013, 3, 31)).dt.days

# One row per (district, day), ordered by day then district.
events = (
    events[['distid', 'day_series']]
    .drop_duplicates()
    .sort_values(['day_series', 'distid'])
)
# Keep only events that fall on day 1 or later.
events = events[events.day_series >= 1]

events = events.rename(columns = {'distid': 'origin_district',
                                  'day_series': 'impact_day'})
events.head()


Unnamed: 0,origin_district,impact_day
1314,808,1
1315,1004,2
1318,615,3
1316,1701,3
1317,2101,3


In [6]:
def get_percent_moved_away_per_district_day(visits):
    """Sum ``visit_percent`` per district-day over rows that left the origin.

    ``visits`` needs the columns ``impact_day``, ``origin_district``,
    ``destination_district`` and ``visit_percent``. Rows whose destination
    equals the origin district are ignored.

    Returns a DataFrame with one row per (``impact_day``, ``origin_district``)
    and the summed ``visit_percent``.
    """
    left_origin = visits.origin_district != visits.destination_district
    movers = visits[left_origin]
    per_district_day = movers.groupby(['impact_day', 'origin_district']).agg({'visit_percent': 'sum'})
    return per_district_day.reset_index()


def get_total_moved_away(visits):
    """Return the sum of ``visits`` over rows whose destination district
    differs from the origin district."""
    left_origin = visits.origin_district != visits.destination_district
    return visits.loc[left_origin, 'visits'].sum()


def get_percent_moved_to_home_province_rural_district(visits, moved_away):
    """Mean share (in %) going to non-capital districts of the home province.

    Rows are kept when origin and destination province match, the destination
    differs from the origin district and the destination is not the origin
    province's capital district. Their ``visit_percent`` is summed per
    (``impact_day``, ``origin_district``), divided by the matching
    ``visit_percent`` in ``moved_away`` and multiplied by 100; the mean over
    all district-days is returned.
    """
    group_keys = ['impact_day', 'origin_district']

    same_province = visits.origin_province == visits.destination_province
    left_origin = visits.origin_district != visits.destination_district
    not_home_capital = visits.destination_district != visits.origin_capital_district
    home_rural = visits[same_province & left_origin & not_home_capital]

    share = home_rural.groupby(group_keys).agg({'visit_percent': 'sum'})
    share = share.merge(moved_away, how = 'left', on = group_keys)

    # *_x is the filtered sum, *_y the total from `moved_away`.
    percent_among_moved = (share.visit_percent_x / share.visit_percent_y) * 100
    return percent_among_moved.mean()


def get_total_moved_to_home_province_rural_district(visits):
    """Return the summed ``visits`` of rows that stay in the origin province,
    leave the origin district and do not end in the origin province's capital
    district."""
    same_province = visits.origin_province == visits.destination_province
    left_origin = visits.origin_district != visits.destination_district
    not_home_capital = visits.destination_district != visits.origin_capital_district
    return visits.loc[same_province & left_origin & not_home_capital, 'visits'].sum()


def get_percent_moved_to_away_province_rural_district(visits, moved_away):
    """Mean share (in %) going to non-capital districts of another province.

    Rows are kept when the destination province differs from the origin
    province and the destination district is not listed in
    ``provicial_capitals.capital_district``. Their ``visit_percent`` is summed
    per (``impact_day``, ``origin_district``), divided by the matching
    ``visit_percent`` in ``moved_away`` and multiplied by 100; the mean over
    all district-days is returned.
    """
    crossed_province = visits.origin_province != visits.destination_province
    # Plain membership filter instead of an outer merge + indicator: no
    # right-only rows are created and integer key columns keep their dtype.
    to_capital = visits.destination_district.isin(provicial_capitals.capital_district)
    x = visits[crossed_province & ~to_capital]

    x = x.groupby(['impact_day', 'origin_district']).agg({'visit_percent': 'sum'})

    x = x.merge(moved_away, on = ['impact_day', 'origin_district'], how = 'left')

    # *_x is the filtered sum, *_y the total from `moved_away`.
    x['visit_percent_among_moved'] = (x.visit_percent_x/x.visit_percent_y)*100

    return x.visit_percent_among_moved.mean()


def get_total_moved_to_away_province_rural_district(visits):
    """Return the summed ``visits`` of rows whose destination province differs
    from the origin province and whose destination district is not listed in
    ``provicial_capitals.capital_district``."""
    crossed_province = visits.origin_province != visits.destination_province
    # Membership filter instead of an outer merge + indicator, which would add
    # right-only rows and turn the integer `visits` column into floats.
    to_capital = visits.destination_district.isin(provicial_capitals.capital_district)

    return visits.loc[crossed_province & ~to_capital, 'visits'].sum()


def get_percent_moved_to_home_province_capital_district(visits, moved_away):
    """Mean share (in %) going to a capital district inside the home province.

    Only rows whose origin and destination province match are used. Rows that
    stay inside an origin district which is itself the province capital get a
    ``visit_percent`` of 0 (they did not move), but are kept so that their
    district-day still contributes a 0 to the mean. Rows whose destination is
    listed in ``provicial_capitals.capital_district`` are then summed per
    (``impact_day``, ``origin_district``), divided by the matching
    ``visit_percent`` in ``moved_away`` and multiplied by 100.

    The caller's ``visits`` frame is left untouched: the zeroing happens on a
    private copy.
    """
    # Work on a copy so the adjustment below never leaks into the caller's data.
    x = visits[(visits.origin_province == visits.destination_province)].copy()

    stayed_in_capital = ((x.origin_district == x.origin_capital_district) &
                         (x.origin_district == x.destination_district))
    x.loc[stayed_in_capital, 'visit_percent'] = 0

    x = x.merge(provicial_capitals, left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    x = x.groupby(['impact_day', 'origin_district']).agg({'visit_percent': 'sum'})

    x = x.merge(moved_away, on = ['impact_day', 'origin_district'], how = 'left')

    # *_x is the filtered sum, *_y the total from `moved_away`.
    x['visit_percent_among_moved'] = (x.visit_percent_x/x.visit_percent_y)*100

    return x.visit_percent_among_moved.mean()


def get_total_moved_to_home_province_capital_district(visits):
    """Return the summed ``visits`` going to a capital district inside the
    home province.

    Rows that stay inside an origin district which is itself the province
    capital are excluded (they did not move). The caller's ``visits`` frame is
    not modified; the exclusion is done with a filter rather than by
    overwriting the ``visits`` column with 0.
    """
    same_province = visits.origin_province == visits.destination_province
    stayed_in_capital = ((visits.origin_district == visits.origin_capital_district) &
                         (visits.origin_district == visits.destination_district))

    # Dropping the stay-at-home rows adds the same 0 to the sum as zeroing them.
    x = visits[same_province & ~stayed_in_capital]

    x = x.merge(provicial_capitals, left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    return x.visits.sum()


def get_percent_moved_to_away_province_capital_district(visits, moved_away):
    """Mean share (in %) going to a capital district of another province.

    Rows with differing origin and destination province are inner-joined to
    ``provicial_capitals`` on the destination district. Their
    ``visit_percent`` is summed per (``impact_day``, ``origin_district``),
    divided by the matching ``visit_percent`` in ``moved_away`` and multiplied
    by 100; the mean over all district-days is returned.
    """
    group_keys = ['impact_day', 'origin_district']

    crossed = visits[visits.origin_province != visits.destination_province]
    to_capitals = pd.merge(crossed, provicial_capitals,
                           left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    share = to_capitals.groupby(group_keys).agg({'visit_percent': 'sum'})
    share = share.merge(moved_away, how = 'left', on = group_keys)

    # *_x is the filtered sum, *_y the total from `moved_away`.
    percent_among_moved = (share.visit_percent_x / share.visit_percent_y) * 100
    return percent_among_moved.mean()


def get_total_moved_to_away_province_capital_district(visits):
    """Return the summed ``visits`` of rows that change province and whose
    destination district appears in ``provicial_capitals.capital_district``."""
    crossed = visits[visits.origin_province != visits.destination_province]
    to_capitals = pd.merge(crossed, provicial_capitals,
                           left_on = ['destination_district'], right_on = "capital_district", how = 'inner')
    return to_capitals['visits'].sum()


def get_percent_moved_to_specific_district(district, visits, moved_away):
    """Mean share (in %) going to one given destination district.

    Rows are kept when the destination equals ``district`` and differs from
    the origin district. Their ``visit_percent`` is summed per
    (``impact_day``, ``origin_district``), divided by the matching
    ``visit_percent`` in ``moved_away`` and multiplied by 100; the mean over
    all district-days is returned.
    """
    group_keys = ['impact_day', 'origin_district']

    left_origin = visits.origin_district != visits.destination_district
    arrived_here = visits.destination_district == district
    arrivals = visits[left_origin & arrived_here]

    share = arrivals.groupby(group_keys).agg({'visit_percent': 'sum'})
    share = share.merge(moved_away, how = 'left', on = group_keys)

    # *_x is the filtered sum, *_y the total from `moved_away`.
    percent_among_moved = (share.visit_percent_x / share.visit_percent_y) * 100
    return percent_among_moved.mean()


def get_total_moved_to_specific_district(district, visits):
    """Return the summed ``visits`` of rows whose destination is ``district``
    and whose origin district is a different one."""
    left_origin = visits.origin_district != visits.destination_district
    arrived_here = visits.destination_district == district
    return visits.loc[left_origin & arrived_here, 'visits'].sum()


def get_percent_moved_to_non_major_capital_district(visits, moved_away):
    """Mean share (in %) going to a provincial capital that is not a major city.

    Rows that leave the origin district are inner-joined to
    ``provicial_capitals_minus_major`` on the destination district. Their
    ``visit_percent`` is summed per (``impact_day``, ``origin_district``),
    divided by the matching ``visit_percent`` in ``moved_away`` and multiplied
    by 100; the mean over all district-days is returned.
    """
    x = visits[(visits.origin_district != visits.destination_district)]

    # Inner join, like the sibling helpers: a right join would add an all-NaN
    # row for every listed capital nobody went to and turn the integer key
    # columns into floats.
    x = x.merge(provicial_capitals_minus_major, left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    x = x.groupby(['impact_day', 'origin_district']).agg({'visit_percent': 'sum'})

    x = x.merge(moved_away, on = ['impact_day', 'origin_district'], how = 'left')

    # *_x is the filtered sum, *_y the total from `moved_away`.
    x['visit_percent_among_moved'] = (x.visit_percent_x/x.visit_percent_y)*100

    return x.visit_percent_among_moved.mean()


def get_total_moved_to_non_major_capital_district(visits):
    """Return the summed ``visits`` of rows that leave the origin district and
    end in a district listed in
    ``provicial_capitals_minus_major.capital_district``."""
    x = visits[(visits.origin_district != visits.destination_district)]

    # Inner join, like the sibling helpers: a right join would add an all-NaN
    # row for every listed capital nobody went to, which turns the integer
    # `visits` column (and therefore the returned total) into a float.
    x = x.merge(provicial_capitals_minus_major, left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    return x.visits.sum()


def get_percent_moved_to_rural_district(visits, moved_away):
    """Mean share (in %) going to any district that is not a provincial capital.

    Rows are kept when the destination differs from the origin district and is
    not listed in ``provicial_capitals.capital_district``. Their
    ``visit_percent`` is summed per (``impact_day``, ``origin_district``),
    divided by the matching ``visit_percent`` in ``moved_away`` and multiplied
    by 100; the mean over all district-days is returned.
    """
    left_origin = visits.origin_district != visits.destination_district
    # Plain membership filter instead of an outer merge + indicator: no
    # right-only rows are created and integer key columns keep their dtype.
    to_capital = visits.destination_district.isin(provicial_capitals.capital_district)
    x = visits[left_origin & ~to_capital]

    x = x.groupby(['impact_day', 'origin_district']).agg({'visit_percent': 'sum'})

    x = x.merge(moved_away, on = ['impact_day', 'origin_district'], how = 'left')

    # *_x is the filtered sum, *_y the total from `moved_away`.
    x['visit_percent_among_moved'] = (x.visit_percent_x/x.visit_percent_y)*100

    return x.visit_percent_among_moved.mean()


def get_total_moved_to_rural_district(visits):
    """Return the summed ``visits`` of rows that leave the origin district and
    end in a district not listed in ``provicial_capitals.capital_district``."""
    left_origin = visits.origin_district != visits.destination_district
    # Membership filter instead of an outer merge + indicator, which would add
    # right-only rows and turn the integer `visits` column into floats.
    to_capital = visits.destination_district.isin(provicial_capitals.capital_district)

    return visits.loc[left_origin & ~to_capital, 'visits'].sum()


def get_percent_moved_to_away_province_non_major_capital(visits, moved_away):
    """Mean share (in %) going to a non-major capital of another province.

    Rows with differing origin and destination province are inner-joined to
    ``provicial_capitals_minus_major`` on the destination district. Their
    ``visit_percent`` is summed per (``impact_day``, ``origin_district``),
    divided by the matching ``visit_percent`` in ``moved_away`` and multiplied
    by 100; the mean over all district-days is returned.
    """
    group_keys = ['impact_day', 'origin_district']

    crossed = visits[visits.origin_province != visits.destination_province]
    to_minor_capitals = pd.merge(crossed, provicial_capitals_minus_major,
                                 left_on = ['destination_district'], right_on = "capital_district", how = 'inner')

    share = to_minor_capitals.groupby(group_keys).agg({'visit_percent': 'sum'})
    share = share.merge(moved_away, how = 'left', on = group_keys)

    # *_x is the filtered sum, *_y the total from `moved_away`.
    percent_among_moved = (share.visit_percent_x / share.visit_percent_y) * 100
    return percent_among_moved.mean()


def get_total_moved_to_away_province_non_major_capital(visits):
    """Return the summed ``visits`` of rows that change province and whose
    destination district appears in
    ``provicial_capitals_minus_major.capital_district``."""
    crossed = visits[visits.origin_province != visits.destination_province]
    to_minor_capitals = pd.merge(crossed, provicial_capitals_minus_major,
                                 left_on = ['destination_district'], right_on = "capital_district", how = 'inner')
    return to_minor_capitals['visits'].sum()


def get_percent_moved_to_away_province_major_capital(visits, moved_away):
    """Mean share (in %) going to a major city in another province.

    Rows with differing origin and destination province are inner-joined to
    ``major_cities`` (destination district against ``district_id``). Their
    ``visit_percent`` is summed per (``impact_day``, ``origin_district``),
    divided by the matching ``visit_percent`` in ``moved_away`` and multiplied
    by 100; the mean over all district-days is returned.
    """
    group_keys = ['impact_day', 'origin_district']

    crossed = visits[visits.origin_province != visits.destination_province]
    to_major_cities = pd.merge(crossed, major_cities,
                               left_on = ['destination_district'], right_on = "district_id", how = 'inner')

    share = to_major_cities.groupby(group_keys).agg({'visit_percent': 'sum'})
    share = share.merge(moved_away, how = 'left', on = group_keys)

    # *_x is the filtered sum, *_y the total from `moved_away`.
    percent_among_moved = (share.visit_percent_x / share.visit_percent_y) * 100
    return percent_among_moved.mean()


def get_total_moved_to_away_province_major_capital(visits):
    """Return the summed ``visits`` of rows that change province and whose
    destination district appears in ``major_cities.district_id``."""
    crossed = visits[visits.origin_province != visits.destination_province]
    to_major_cities = pd.merge(crossed, major_cities,
                               left_on = ['destination_district'], right_on = "district_id", how = 'inner')
    return to_major_cities['visits'].sum()


def get_impact_days(group, region):
    """Select the (origin_district, impact_day) pairs for a group and region.

    Parameters
    ----------
    group : str
        ``'treatment'`` uses the district-days listed in ``events``;
        ``'control'`` uses every pair of ``district_day_all_permutations_df``
        that does not appear in ``events``.
    region : str
        ``'rural_districts'`` keeps origins that are not provincial capitals,
        ``'major_capital_districts'`` keeps origins listed in ``major_cities``,
        ``'non_major_capital_districts'`` keeps origins listed in
        ``provicial_capitals_minus_major`` and ``'capital_districts'`` keeps
        origins listed in ``provicial_capitals``. Any other value leaves the
        selection unfiltered.

    Returns
    -------
    pandas.DataFrame
        Columns ``origin_district`` and ``impact_day``.

    Raises
    ------
    ValueError
        If ``group`` is neither ``'treatment'`` nor ``'control'`` (previously
        this surfaced as an ``UnboundLocalError`` further down).
    """
    if group == 'treatment':
        impact_days = events.copy()
    elif group == 'control':
        # Anti-join: keep only district-days without a recorded event.
        candidates = district_day_all_permutations_df.merge(
            events, on = ['origin_district', 'impact_day'], how = 'outer', indicator = True)
        impact_days = candidates[candidates._merge == 'left_only'].drop(columns = ['_merge'])
    else:
        raise ValueError("group must be 'treatment' or 'control', got {!r}".format(group))

    # Vectorised membership tests; the lookup list is no longer rebuilt for
    # every row.
    origin = impact_days.origin_district
    if region == 'rural_districts':
        impact_days = impact_days[~origin.isin(provicial_capitals.capital_district)].copy()
    elif region == 'major_capital_districts':
        impact_days = impact_days[origin.isin(major_cities.district_id)].copy()
    elif region == 'non_major_capital_districts':
        impact_days = impact_days[origin.isin(provicial_capitals_minus_major.capital_district)].copy()
    elif region == 'capital_districts':
        impact_days = impact_days[origin.isin(provicial_capitals.capital_district)].copy()

    return impact_days


## Version 1
#### origin: rural_districts, major_capital_districts, non_major_capital_districts, capital_districts
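#### destination: home province vs. away province, each split into rural / capital districts (away-province capitals also split into major / non-major)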

In [7]:
def get_movement_summary(group, region, impact_days, time_threshold):
    """Version 1 summary: percentage of movers by home/away province and
    rural/capital destination.

    Reads ``k_<time_threshold>.csv`` with Dask, attaches origin and
    destination province information from ``districts``, keeps only the
    district-days listed in ``impact_days`` with a positive ``impacted``
    count, and returns a dict with the region, the time threshold and one
    mean percentage per destination category.

    ``group`` is accepted for interface symmetry but is not used here.
    """
    print(datetime.now(), 'reading data for people within ' + region + ' - ' + str(time_threshold) + ' days after the event.')
    source_csv = '/data/afg_anon/displacement_metrics/visits_per_district_day/k_' + str(time_threshold) + '.csv'
    lazy_visits = ddf.read_csv(source_csv)

    lazy_visits['visit_percent'] = (lazy_visits.visits / lazy_visits.impacted) * 100

    # Province and provincial capital of the origin district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "origin_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id"])
        .rename(columns = {"province_id": "origin_province", "capital_district": "origin_capital_district"})
    )

    # Province of the destination district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "destination_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id", "capital_district"])
        .rename(columns = {"province_id": "destination_province"})
    )

    # Restrict to the requested district-days.
    lazy_visits = lazy_visits.merge(impact_days, how = 'inner', on = ['origin_district', 'impact_day'])

    has_impacted_people = (~lazy_visits.impacted.isna()) & (lazy_visits.impacted > 0)
    visits = lazy_visits[has_impacted_people].compute()

    moved_away_per_dd = get_percent_moved_away_per_district_day(visits)

    # Values are evaluated top to bottom, i.e. in the same order as the keys.
    return {
        "impact_region": region,
        "days_after_event": time_threshold,
        "moved_to_home_province_rural_district": get_percent_moved_to_home_province_rural_district(visits, moved_away_per_dd),
        "moved_to_away_province_rural_district": get_percent_moved_to_away_province_rural_district(visits, moved_away_per_dd),
        "moved_to_home_province_capital_district": get_percent_moved_to_home_province_capital_district(visits, moved_away_per_dd),
        "moved_to_away_province_capital_district": get_percent_moved_to_away_province_capital_district(visits, moved_away_per_dd),
        "moved_to_away_province_major_capital_district": get_percent_moved_to_away_province_major_capital(visits, moved_away_per_dd),
        "moved_to_away_province_non_major_capital_district": get_percent_moved_to_away_province_non_major_capital(visits, moved_away_per_dd),
    }


version = 'v1'
group = 'treatment'
time_threshold = 30

# Collect one record per region and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
movement_records = []
for region in ['rural_districts', 'major_capital_districts', 'non_major_capital_districts', 'capital_districts']:

    impact_days = get_impact_days(group, region)
    movement = get_movement_summary(group, region, impact_days, time_threshold)
    movement_records.append(movement)

# Columns are sorted alphabetically to keep the layout shown in the recorded
# output of this cell.
movement_summary = pd.DataFrame(movement_records).sort_index(axis = 1)

movement_summary.to_csv('/data/tmp/smehra/aggregated_data/afgh-displacement/visits_descriptive_analysis/data/' + group + '_' + version + '.csv', index = False)
movement_summary.head()


2021-02-02 17:22:56.864070 reading data for people within rural_districts - 30 days after the event.
2021-02-02 17:23:24.229090 reading data for people within major_capital_districts - 30 days after the event.
2021-02-02 17:23:46.984279 reading data for people within non_major_capital_districts - 30 days after the event.
2021-02-02 17:24:09.795280 reading data for people within capital_districts - 30 days after the event.


Unnamed: 0,days_after_event,impact_region,moved_to_away_province_capital_district,moved_to_away_province_major_capital_district,moved_to_away_province_non_major_capital_district,moved_to_away_province_rural_district,moved_to_home_province_capital_district,moved_to_home_province_rural_district
0,30.0,rural_districts,29.899978,18.404429,11.495549,24.262729,22.632399,23.204894
1,30.0,major_capital_districts,41.991507,20.40374,21.587767,30.157207,0.0,27.851286
2,30.0,non_major_capital_districts,52.029319,36.605657,15.423662,29.073729,0.0,18.896952
3,30.0,capital_districts,48.932909,31.607778,17.325132,29.407954,0.0,21.659136


## Version 2
#### origin: rural_districts, major_capital_districts, non_major_capital_districts, capital_districts
#### destination: kabul / kandahar / hirat / mazari sharif / jalalabad

In [8]:
def get_movement_summary(group, region, impact_days, time_threshold):
    """Version 2 summary: percentage of movers going to rural districts,
    non-major capitals and each of five named districts.

    Reads ``k_<time_threshold>.csv`` with Dask, attaches origin and
    destination province information from ``districts``, keeps only the
    district-days listed in ``impact_days`` with a positive ``impacted``
    count, and returns a dict with the region, the time threshold and one
    mean percentage per destination.

    ``group`` is accepted for interface symmetry but is not used here.
    """
    print(datetime.now(), 'reading data for people within ' + region + ' - ' + str(time_threshold) + ' days after the event.')
    source_csv = '/data/afg_anon/displacement_metrics/visits_per_district_day/k_' + str(time_threshold) + '.csv'
    lazy_visits = ddf.read_csv(source_csv)

    lazy_visits['visit_percent'] = (lazy_visits.visits / lazy_visits.impacted) * 100

    # Province and provincial capital of the origin district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "origin_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id"])
        .rename(columns = {"province_id": "origin_province", "capital_district": "origin_capital_district"})
    )

    # Province of the destination district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "destination_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id", "capital_district"])
        .rename(columns = {"province_id": "destination_province"})
    )

    # Restrict to the requested district-days.
    lazy_visits = lazy_visits.merge(impact_days, how = 'inner', on = ['origin_district', 'impact_day'])

    has_impacted_people = (~lazy_visits.impacted.isna()) & (lazy_visits.impacted > 0)
    visits = lazy_visits[has_impacted_people].compute()

    moved_away_per_dd = get_percent_moved_away_per_district_day(visits)

    # Values are evaluated top to bottom, i.e. in the same order as the keys.
    return {
        "impact_region": region,
        "days_after_event": time_threshold,
        "moved_to_rural": get_percent_moved_to_rural_district(visits, moved_away_per_dd),
        "moved_to_non_major_capital_districts": get_percent_moved_to_non_major_capital_district(visits, moved_away_per_dd),
        "moved_to_kabul": get_percent_moved_to_specific_district(101, visits, moved_away_per_dd),
        "moved_to_kandahar": get_percent_moved_to_specific_district(2401, visits, moved_away_per_dd),
        "moved_to_hirat": get_percent_moved_to_specific_district(2001, visits, moved_away_per_dd),
        "moved_to_mazari_sharif": get_percent_moved_to_specific_district(1601, visits, moved_away_per_dd),
        "moved_to_jalalabad": get_percent_moved_to_specific_district(801, visits, moved_away_per_dd),
    }


version = 'v2'
group = 'treatment'
time_threshold = 30

# Collect one record per region and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
movement_records = []
for region in ['rural_districts', 'major_capital_districts', 'non_major_capital_districts', 'capital_districts']:

    impact_days = get_impact_days(group, region)
    movement = get_movement_summary(group, region, impact_days, time_threshold)
    movement_records.append(movement)

# Columns are sorted alphabetically to keep the layout shown in the recorded
# output of this cell.
movement_summary = pd.DataFrame(movement_records).sort_index(axis = 1)

movement_summary.to_csv('/data/tmp/smehra/aggregated_data/afgh-displacement/visits_descriptive_analysis/data/' + group + '_' + version + '.csv', index = False)
movement_summary.head()


2021-02-02 17:24:33.154083 reading data for people within rural_districts - 30 days after the event.
2021-02-02 17:24:56.591460 reading data for people within major_capital_districts - 30 days after the event.
2021-02-02 17:25:18.907608 reading data for people within non_major_capital_districts - 30 days after the event.
2021-02-02 17:25:41.486721 reading data for people within capital_districts - 30 days after the event.


Unnamed: 0,days_after_event,impact_region,moved_to_hirat,moved_to_jalalabad,moved_to_kabul,moved_to_kandahar,moved_to_mazari_sharif,moved_to_non_major_capital_districts,moved_to_rural
0,30.0,rural_districts,3.665824,3.333855,13.409695,3.394937,1.401653,27.326413,47.467623
1,30.0,major_capital_districts,4.604323,1.13721,20.348868,1.976137,2.344124,21.587767,58.008493
2,30.0,non_major_capital_districts,4.057839,1.190096,25.503939,3.466048,2.387735,15.423662,47.970681
3,30.0,capital_districts,4.210854,1.175604,24.451473,3.092203,2.374697,17.325132,51.067091


## Version 3
#### origin: rural_districts, capital_districts
#### destination: home-province rural / away-province rural / home-province capital / away-province top-5 capital / away-province non-top-5 capital

In [20]:
def get_movement_summary(group, region, impact_days, time_threshold):
    """Version 3 summary: total ``visits`` by home/away province and
    rural/capital destination.

    Reads ``k_<time_threshold>.csv`` with Dask, attaches origin and
    destination province information from ``districts``, keeps only the
    district-days listed in ``impact_days`` with a positive ``impacted``
    count, and returns a dict with the region, the time threshold and one
    total per destination category. The ``"impacted"`` entry holds the result
    of ``get_total_moved_away``.

    ``group`` is accepted for interface symmetry but is not used here.
    """
    print(datetime.now(), 'reading data for people within ' + region + ' - ' + str(time_threshold) + ' days after the event.')
    source_csv = '/data/afg_anon/displacement_metrics/visits_per_district_day/k_' + str(time_threshold) + '.csv'
    lazy_visits = ddf.read_csv(source_csv)

    lazy_visits['visit_percent'] = (lazy_visits.visits / lazy_visits.impacted) * 100

    # Province and provincial capital of the origin district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "origin_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id"])
        .rename(columns = {"province_id": "origin_province", "capital_district": "origin_capital_district"})
    )

    # Province of the destination district.
    lazy_visits = (
        lazy_visits
        .merge(districts, left_on = "destination_district", right_on = "district_id", how = "left")
        .drop(columns = ["district_id", "capital_district"])
        .rename(columns = {"province_id": "destination_province"})
    )

    # Restrict to the requested district-days.
    lazy_visits = lazy_visits.merge(impact_days, how = 'inner', on = ['origin_district', 'impact_day'])

    has_impacted_people = (~lazy_visits.impacted.isna()) & (lazy_visits.impacted > 0)
    visits = lazy_visits[has_impacted_people].compute()

    # Values are evaluated top to bottom, i.e. in the same order as the keys.
    return {
        "impact_region": region,
        "days_after_event": time_threshold,
        "impacted": get_total_moved_away(visits),
        "moved_to_home_province_rural_district": get_total_moved_to_home_province_rural_district(visits),
        "moved_to_away_province_rural_district": get_total_moved_to_away_province_rural_district(visits),
        "moved_to_home_province_capital_district": get_total_moved_to_home_province_capital_district(visits),
        "moved_to_away_province_major_capital_district": get_total_moved_to_away_province_major_capital(visits),
        "moved_to_away_province_non_major_capital_district": get_total_moved_to_away_province_non_major_capital(visits),
    }


version = 'v3'
group = 'control'
time_threshold = 30

# Collect one record per region and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
movement_records = []
for region in ['rural_districts', 'capital_districts']:

    impact_days = get_impact_days(group, region)
    movement = get_movement_summary(group, region, impact_days, time_threshold)
    movement_records.append(movement)

# Columns are sorted alphabetically to keep the layout shown in the recorded
# output of this cell.
movement_summary = pd.DataFrame(movement_records).sort_index(axis = 1)

movement_summary.to_csv('/data/tmp/smehra/aggregated_data/afgh-displacement/visits_descriptive_analysis/data/' + group + '_' + version + '.csv', index = False)
movement_summary.head()


2021-02-02 17:58:57.202182 reading data for people within rural_districts - 30 days after the event.
2021-02-02 18:02:03.664140 reading data for people within capital_districts - 30 days after the event.


Unnamed: 0,days_after_event,impact_region,impacted,moved_to_away_province_major_capital_district,moved_to_away_province_non_major_capital_district,moved_to_away_province_rural_district,moved_to_home_province_capital_district,moved_to_home_province_rural_district
0,30.0,rural_districts,45917665.0,8874905.0,5341932.0,9055843.0,13775389.0,8869596.0
1,30.0,capital_districts,49905899.0,13105542.0,10357748.0,13223705.0,0.0,13218904.0
