In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
gc.enable()

import os
import glob
from datetime import datetime

from datetime import timedelta
import logging


from pandarallel import pandarallel
pandarallel.initialize(nb_workers=24)
# from dask.distributed import Client, LocalCluster

# dask_n_workers = 32
# dask_worker_memory_limit = 16
# dask_threads_per_worker = 16
# cluster = LocalCluster(dashboard_address = 'localhost:7920', 
#                        n_workers = dask_n_workers, 
#                        processes = True, 
#                        threads_per_worker = dask_threads_per_worker,
#                        memory_limit = str(dask_worker_memory_limit) + 'GB', 
#                        local_directory = "/data/tmp/snair/tmp/dask-worker-space")
# client = Client(cluster)

# import dask.dataframe as ddf

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
from collections import Counter
def nan_checker(x):
    """
    Return the entries of ``x`` that are not NaN.

    ``x`` is tested element-wise with ``np.isnan`` and then indexed with the
    negated boolean mask, so it has to be something that supports both
    (e.g. a numeric numpy array or pandas Series).
    """
    not_nan = np.logical_not(np.isnan(x))
    return x[not_nan]


def shifter(x):
    """
    Expand one frame of daily home locations to the full study calendar and
    flag 30-day in-migration together with 30- and 90-day persistence.

    Parameters
    ----------
    x : DataFrame with a 'day' column (used as the index) and a
        'home_location' column.
        # assumes one row per day for a single subscriber - TODO confirm

    Returns
    -------
    DataFrame indexed by every day from 2013-04-01 to 2020-10-31, newest day
    first, with the added columns
        home_location_{30,90}_prev / _post : location 30/90 days earlier/later
        in_migrant          : 1 if the location differs from the one 30 days
                              earlier and both are known
        in_migrant_stays    : 1 if in_migrant and the location 30 days later
                              equals today's
        in_migrant_stays_90 : same check with the location 90 days later
    Days missing from ``x`` are NaN and never produce a flag.
    """
    calendar = pd.date_range(start = '2013-04-01', end = '2020-10-31', freq = 'D')
    daily = x.set_index('day').reindex(calendar)

    # Lagged / led copies are taken while the calendar is still ascending, so
    # "prev" really is the earlier day and "post" the later one.
    home = daily['home_location']
    for lag in (30, 90):
        daily[f'home_location_{lag}_prev'] = home.shift(lag)
        daily[f'home_location_{lag}_post'] = home.shift(-lag)

    # Newest day first, as in the original layout of this table
    daily = daily.sort_index(ascending = False)

    here = daily['home_location']
    before = daily['home_location_30_prev']

    # in migrant: location on day t differs from location on day t - 30
    moved = (before != here) & here.notna() & before.notna()
    daily['in_migrant'] = moved * 1

    # in_migrant_stays*: migrant whose location 30 / 90 days later is unchanged
    is_migrant = daily['in_migrant'] == 1
    for flag, horizon in (('in_migrant_stays', 30), ('in_migrant_stays_90', 90)):
        after = daily[f'home_location_{horizon}_post']
        daily[flag] = (is_migrant & (here == after) & after.notna()) * 1

    return daily


def location_finder(x, d = 30, start = '2013-04-01', end = '2020-10-31'):
    """
    Flag migration events for one frame of daily home locations using a
    d-day look-back and look-ahead.

    For every calendar day t the function checks
    1) in_migrant           : location on t differs from location on t - d
    2) in_migrant_stays     : migrant, and location on t + d equals location on t
    3) in_migrant_goes_back : migrant, and location on t + d equals location on t - d

    Parameters
    ----------
    x : DataFrame with a 'day' column (used as the index) and a
        'home_location' column.
        # assumes one row per day for a single subscriber - TODO confirm
    d : int, number of days to look back and ahead (default 30).
    start, end : first and last day of the calendar the data is reindexed to;
        the defaults reproduce the previously hard-coded study window.

    Returns
    -------
    DataFrame with one row per calendar day, newest day first. The calendar
    itself is the first column (named by ``reset_index``), followed by the
    remaining columns of ``x``, ``home_location_{d}_prev``,
    ``home_location_{d}_post`` and the three 0/1 flags above. A flag is 0
    whenever one of the locations it compares is missing.
    """
    days = pd.date_range(start = start, end = end, freq = 'D')
    temp = x.set_index('day').reindex(days)

    prev_col = f'home_location_{d}_prev'
    post_col = f'home_location_{d}_post'

    # Shifts are taken on the ascending calendar: prev = d days earlier,
    # post = d days later.
    temp[prev_col] = temp['home_location'].shift(d)
    temp[post_col] = temp['home_location'].shift(-d)

    # Flip dataframe (newest day first) and expose the calendar as a column
    temp = temp.sort_index(ascending = False).reset_index()

    # in migrant: location on day t != location on day t - d
    temp['in_migrant'] = ((temp[prev_col] != temp['home_location']) &
                          temp['home_location'].notna() &
                          temp[prev_col].notna()) * 1

    is_migrant = temp['in_migrant'] == 1

    # in_migrant_stays: in_migrant, and location on day t + d == location on day t.
    # Uses the d-based column; the former hard-coded 'home_location_30_post'
    # only existed when d == 30 and raised KeyError otherwise.
    temp['in_migrant_stays'] = (is_migrant &
                                (temp['home_location'] == temp[post_col]) &
                                temp[post_col].notna()) * 1

    # in_migrant_goes_back: in_migrant, and location on day t + d == location on day t - d
    temp['in_migrant_goes_back'] = (is_migrant &
                                    (temp[prev_col] == temp[post_col]) &
                                    temp[post_col].notna() &
                                    temp[prev_col].notna()) * 1

    return temp



def find_location_in_window(x, d = 30, go_back_window = 90,
                            start = '2013-04-01', end = '2020-10-31'):
    """
    Flag migration events for one frame of daily home locations and describe
    what happens during the following ``go_back_window`` days.

    For every calendar day t the function checks
    1) in_migrant                   : location on t differs from location on
                                      t - d (both known)
    2) stays_all_{go_back_window}   : migrant, and the location on t is the
                                      only location observed in
                                      [t, t + go_back_window)
    3) goes_back_{go_back_window}   : migrant, and the location of t - d is
                                      observed somewhere in
                                      [t, t + go_back_window)

    Days without an observation are ignored inside the forward window, and
    the window is simply shorter for the last days of the calendar.
    # NOTE(review): both effects can make "stays" look stronger than the data
    # supports - confirm this is intended.

    Parameters
    ----------
    x : DataFrame with a 'day' column (used as the index) and a
        'home_location' column.
        # assumes one row per day for a single subscriber - TODO confirm
    d : int, look-back in days used to define a migrant (default 30).
    go_back_window : int, length in days of the forward window, day t
        included (default 90).
    start, end : first and last day of the calendar the data is reindexed to;
        the defaults reproduce the previously hard-coded study window.

    Returns
    -------
    DataFrame with one row per calendar day in ascending order. The calendar
    itself is the first column (named by ``reset_index``), followed by the
    remaining columns of ``x``, ``home_location_{d}_prev``, ``in_migrant``,
    ``stays_all_{go_back_window}`` and ``goes_back_{go_back_window}``.
    """
    days = pd.date_range(start = start, end = end, freq = 'D')
    temp = x.set_index('day').reindex(days).reset_index()

    # Home location d days earlier
    prev_col = f'home_location_{d}_prev'
    temp[prev_col] = temp['home_location'].shift(d)

    temp['in_migrant'] = ((temp[prev_col] != temp['home_location']) &
                          temp['home_location'].notna() &
                          temp[prev_col].notna()) * 1

    # Distinct known locations in each forward window [t, t + go_back_window).
    # Only membership and the number of distinct values are needed, so a set
    # per day is enough and nothing has to be stored in the frame.
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size = go_back_window)
    seen_ahead = [set(w.dropna()) for w in temp['home_location'].rolling(window = indexer)]

    # One pass over plain Python values instead of two row-wise DataFrame.apply
    # calls; `and` short-circuits, so non-migrant rows (including rows with a
    # missing location) are never looked up in the set.
    stays, goes_back = [], []
    for flag, here, before, seen in zip(temp['in_migrant'],
                                        temp['home_location'],
                                        temp[prev_col],
                                        seen_ahead):
        migrant = flag == 1
        stays.append(1 if migrant and here in seen and len(seen) == 1 else 0)
        goes_back.append(1 if migrant and before in seen else 0)

    temp[f'stays_all_{go_back_window}'] = stays
    temp[f'goes_back_{go_back_window}'] = goes_back

    return temp


In [None]:
### Tracking dict
# Stubs of the inputs that already have a result in migstats_v6, so that a
# re-run of the processing loop can skip them (despite the name this is a list).
# Results are written as "<stub>_v6.csv": drop the extension, then strip only
# the trailing "_<version>" token. Splitting on the first underscore would
# truncate stubs that themselves contain "_" and they would never be skipped.
mig_stat_dict = [f.split(".")[0].rsplit("_", 1)[0] for f in os.listdir("/data/tmp/snair/migstats_v6")]
mig_stat_dict

In [None]:
### Load files
# Input directory (the trailing '/' is part of the path) and every CSV directly
# inside it; the processing loop below reads these files one by one.
fpath = "/data/afg_anon/displacement_metrics/home_locations/daily_modal_voice_only_2013-2020_version/"
files = glob.glob(f'{fpath}*.csv')

print("No of Files", len(files))

In [None]:
### Calculate stats
# For every input CSV that has no result yet: expand each phoneHash1 group to a
# daily panel with migration flags (find_location_in_window), sum the flags per
# (home_location, day) and write one result CSV per input file.
for file in files:

    # File name without directory and without anything after the first '.'
    stub = file.split('/')[-1].split('.')[0]

    # Result already present from an earlier run
    if stub in mig_stat_dict:
        continue

    print(stub, datetime.now())
    df_base = pd.read_csv(file)

    # Nothing to aggregate for a file without rows
    if not df_base.shape[0] > 0:
        print(stub, "no data!", datetime.now())
        continue

    df_base['day'] = pd.to_datetime(df_base['day'])
    df_base['ym'] = df_base['day'].dt.to_period('M').parallel_apply(str)
    df_base['year'] = df_base['day'].dt.year
    df_base = df_base.set_index('phoneHash1')

    print(f'Starting big job, {datetime.now()}')
    # One daily panel per phoneHash1; the inner index level produced by the
    # apply is dropped and the calendar column (named 'index' by the helper's
    # reset_index) is renamed back to 'day'.
    df_clean = (
        df_base
        .groupby(level=0)
        .parallel_apply(find_location_in_window)
        .reset_index(drop = True, level = 1)
        .rename(columns = {'index':'day'})
    )

    maincols = ['in_migrant', 'stays_all_90', 'goes_back_90' ]

    # Location-day totals of the three flags
    mig_stats = (
        df_clean
        .reset_index()
        .groupby(['home_location', 'day'])[maincols]
        .sum()
        .reset_index()
    )

    # The row index is written too; the aggregation cell drops it again as
    # 'Unnamed: 0'.
    mig_stats.to_csv(f"/data/tmp/snair/migstats_v6/{stub}_v6.csv")

    # Release the large frames before the next file is loaded
    df_base = df_clean = None
    gc.collect()

In [None]:
# Combine the per-file results into one location-day table.
# NOTE: `files` is rebound here from the input CSVs to the result CSVs.
files = glob.glob("/data/tmp/snair/migstats_v6/*.csv")

df_stats = (
    pd.concat([pd.read_csv(path) for path in files])
    # row index that was saved along with each per-file result
    .drop(columns = ['Unnamed: 0'])
    # the same (home_location, day) pair can occur in several files
    .groupby(['home_location', 'day'])
    .sum()
    .reset_index()
)
df_stats.to_csv("/data/afg_satellite/snair/mig_stats_oct_2025_stays_90days.csv", index = False)