In [None]:
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster made of separate worker processes and connect a
# client to it; the dashboard is served on localhost:7920.
# NOTE(review): 16 workers x 16 threads asks for 256 threads in total, and
# memory_limit is presumably applied per worker (i.e. 16 x 24GB) -- confirm
# that this matches the capacity of the host.
cluster = LocalCluster(
    n_workers=16,
    threads_per_worker=16,
    processes=True,
    memory_limit='24GB',
    # Scratch space used by the workers.
    local_directory="/path/to/dask-worker-space",
    dashboard_address='localhost:7920',
)
client = Client(cluster)

import dask.dataframe as ddf
import numpy as np
import pandas as pd

import shutil
import gc
gc.enable()

import multiprocessing as mp

import time
from datetime import timedelta  
from datetime import date
from datetime import datetime
from collections import Counter

import os

import matplotlib.pyplot as plt
%matplotlib inline


### get list of files

In [None]:
# Input and output locations. Both directory strings end with '/', which the
# plain string concatenations below (and in later cells) rely on.
input_dir = '/path/to/data/daily_modal_location/using_rec_from_2013_to_2020/'
spatial_granularity = 'district_level'
output_dir = '/path/to/data/daily_modal_voice_version_bucketed/' + spatial_granularity + '/'

# The inputs sit two directory levels below this folder; the loop below names
# those levels `year` and `month`.
granularity_dir = input_dir + spatial_granularity

# os.listdir returns entries in arbitrary order, so both levels are sorted to
# make the resulting file list (and everything derived from it) reproducible.
files = []

for year in sorted(os.listdir(granularity_dir)):

    for month in sorted(os.listdir(granularity_dir + '/' + year)):

        # append() instead of `files = files + [...]`, which copies the whole
        # list on every iteration.
        files.append(granularity_dir + '/' + year + '/' + month)


### get set of unique two-character prefixes of user IDs

In [None]:
# Collect the distinct two-character prefixes of the user ids found in the
# input files. The result is a set of strings.
unique_first_letters = set()

for f in files:

    # Only the id column is read; it is forced to str so that slicing works
    # on it.
    data = ddf.read_csv(f,
                        dtype = {'caller_msisdn': str},
                        usecols = ['caller_msisdn'])

    # Deduplicate inside Dask so that only the distinct prefixes are brought
    # back to the driver, instead of one value per input row.
    # Missing ids are dropped: a NaN prefix cannot name an output file and
    # would break the string concatenation done when the buckets are written.
    prefixes = data.caller_msisdn.str[:2].dropna().unique().compute()

    unique_first_letters.update(prefixes.tolist())

print(unique_first_letters)

### bucket datasets by user ID prefix (one CSV per prefix)

In [None]:
# Bucket the records by user-id prefix: for every prefix in
# `unique_first_letters`, keep the rows whose id starts with that prefix and
# write them to <output_dir>/<prefix>.csv.

# A set cannot be indexed by position (`unique_first_letters[i]` raises
# TypeError), so iterate over a sorted list instead. Sorting also makes the
# processing order reproducible. Non-string entries (e.g. NaN coming from
# missing ids) are skipped because they can neither be sorted together with
# strings nor be used in a file name.
letters = sorted(l for l in unique_first_letters if isinstance(l, str))

# DataFrame.to_csv does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the cell that collects the prefixes reads the id column as
# `caller_msisdn` (with an int64 location), whereas this cell expects
# `phoneHash1` (with a float64 location) in the same files -- confirm which
# schema is correct and align the two cells.

# The read and the prefix column do not depend on the loop variable, so the
# lazy Dask graph is built once and reused for every prefix.
all_records = ddf.read_csv(files,
                           dtype = {'phoneHash1': str,
                                    'day': str,
                                    'daily_modal_location': 'float64'})

all_records['phoneHash1_first_letter'] = all_records.phoneHash1.str[:2]

for i, letter in enumerate(letters, start=1):

    print('computing for .. ' + str(i) + '/' + str(len(letters)) + ' ' + letter)

    data = all_records[all_records.phoneHash1_first_letter == letter]

    # Drop the helper prefix column before writing.
    data = data[['phoneHash1', 'day', 'daily_modal_location']]

    # Materialise this bucket as a pandas DataFrame on the driver.
    data = data.compute()

    data.to_csv(output_dir + letter + '.csv', index = False)
