In [1]:
from dask.distributed import Client, LocalCluster
# Start a local, multi-process Dask cluster and connect a client to it.
# Every worker process gets its own thread pool and memory limit; the
# diagnostics dashboard is served on the address given below.
cluster = LocalCluster(
    n_workers=16,
    threads_per_worker=16,
    processes=True,
    memory_limit='32GB',
    local_directory="/path/to/dask-worker-space",
    dashboard_address='localhost:7910',
)
client = Client(cluster)

import dask.dataframe as ddf
import numpy as np
import pandas as pd

import shutil
import gc
gc.enable()

import multiprocessing as mp

import time
import datetime as dt
from datetime import timedelta  
from datetime import date
from datetime import datetime

import os

import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Load tower data.
# The table is built in a single chain so that no column is ever assigned on a
# filtered slice of another frame (the pattern that triggers pandas'
# SettingWithCopyWarning).
towers = (
    pd.read_csv('/path/to/towers_with_group_id.csv',
                dtype={'antennaId': str,
                       'tower_group_id': 'int32',
                       'district_id': 'float64'})
    .rename(columns={'antennaId': 'antenna_id'})
    # district_id is read as float64 so that missing values can be dropped
    # before the column is converted to integers.
    .dropna(subset=['district_id'])
    .astype({'district_id': 'int64'})
    .loc[:, ['antenna_id', 'tower_group_id', 'district_id']]
)
towers.head()

Unnamed: 0,antenna_id,tower_group_id,district_id
0,412203110054163,0,2401
1,412203110054221,1,2401
2,412203110054222,1,2401
3,412203110054223,1,2401
4,412203110054224,1,2401


In [3]:
def compute_daily_modal_location(file_path, output_dir, year, month, spatial_granularity):
    """Compute daily modal locations for one month of CDR records and save them as CSV.

    The CDR file is read with Dask, restricted to the requested month and to
    records with a known antenna, and inner-joined with the module-level
    ``towers`` frame on ``antenna_id``. The most frequent location is then
    taken per (phoneHash1, date, hour), and the most frequent of those hourly
    locations per (phoneHash1, date). The result is written to
    ``output_dir + str(year) + '/' + str(month) + '.csv'`` with the columns
    ``phoneHash1``, ``day`` and ``daily_modal_location``.

    Parameters
    ----------
    file_path : str
        CSV file with the raw CDR records; the columns ``phoneHash1``,
        ``datetime`` and ``antenna_id`` are read.
    output_dir : str
        Output root. It is concatenated as-is with the year, so it is expected
        to end with a path separator.
    year : str or int
        Used in the log messages and as the output sub-folder name.
    month : str or int
        Month number; records of other months are dropped. Also used as the
        output file name.
    spatial_granularity : str
        ``'tower_level'`` (uses ``tower_group_id``) or ``'district_level'``
        (uses ``district_id``).

    Raises
    ------
    ValueError
        If ``spatial_granularity`` is not one of the supported values.
    """
    # Imported here because the notebook's import cell does not provide Counter;
    # the nested helper below picks it up through its closure.
    from collections import Counter

    # Resolve the location column first so that a bad argument fails immediately
    # instead of deep inside the Dask computation.
    location_columns = {'tower_level': 'tower_group_id',
                        'district_level': 'district_id'}
    if spatial_granularity not in location_columns:
        raise ValueError("spatial_granularity must be one of "
                         + str(sorted(location_columns))
                         + ", got " + repr(spatial_granularity))
    location_column = location_columns[spatial_granularity]

    def isInEvening(d):
        # Helper for the optional (currently disabled) hour filter below.
        return (d.hour <= 7) or (d.hour >= 19)

    def get_modal_location(data, location_column):
        # get list of all locations
        all_locations = data[location_column].tolist()
        # pick the most frequent location (tie breaker - the one counted first)
        return Counter(all_locations).most_common(1)[0][0]

    print(str(datetime.now()) + ' computing - ' + str(year) + '-' + str(month))

    # NOTE(review): `date_parser` is reported as deprecated in recent pandas
    # releases - confirm the installed version before switching to `date_format`.
    cdr = ddf.read_csv(file_path,
                       dtype = {"phoneHash1": object,
                                "numtype1": object,
                                "ctrycode1":'int16',
                                "phoneHash2": object,
                                "numtype2": object,
                                "ctrycode2": 'int16',
                                "interaction": object,
                                "year": object,
                                "month": object,
                                "date": object,
                                "call_duration": 'float64',
                                "call_cost": 'float64',
                                "antenna_id": object,
                                "charged_duration": 'float64',
                                "product_id": object,
                                "f_type": object,
                                "f_subtype": object,
                                "pay_type": object,
                                "subcos_id": object},
                       parse_dates = ['datetime'],
                       date_parser = (lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')),
                       usecols = ['phoneHash1', 'datetime', 'antenna_id'])

    # get date, month and hour of the day
    cdr['date'] = cdr.datetime.dt.date
    cdr['month'] = cdr.datetime.dt.month
    cdr['hour'] = cdr.datetime.dt.hour

    # keep cdr only for required month
    cdr = cdr[cdr['month'] == int(month)]

    # remove unknown location values (missing ids and the placeholder codes)
    cdr = cdr.dropna(subset = ['antenna_id'])
    cdr = cdr[~cdr.antenna_id.isin(["-99", "-999", "-9999"])]

    # keep cdr only for evening hours if required
    # cdr = cdr[cdr.datetime.apply(lambda d: (isInEvening(d)), meta=(None, 'bool'))]

    # Merge raw cdr data with tower data (module-level `towers` frame).
    cdr = cdr.merge(towers, how='inner', on='antenna_id')

    ###### compute hourly modal locations #####
    # `meta` describes the result to Dask: an object Series indexed by the
    # three grouping keys.
    hourly_meta = pd.Series(name='hourly_modal_location',
                            dtype=object,
                            index=pd.MultiIndex(levels=[[], [], []],
                                                codes=[[], [], []],
                                                names=['phoneHash1', 'date', 'hour']))
    hourly = cdr.groupby(['phoneHash1', 'date', 'hour']).apply(
        lambda data: get_modal_location(data, location_column),
        meta=hourly_meta)

    # Materialise the hourly result as a pandas object; the daily step runs in pandas.
    hourly = hourly.compute()
    hourly = hourly.reset_index()

    ###### compute daily modal locations #####
    daily = hourly.groupby(['phoneHash1', 'date']).apply(lambda data: get_modal_location(data, 'hourly_modal_location'))
    daily = daily.reset_index()
    daily.columns = ['phoneHash1', 'day', 'daily_modal_location']

    output_path = output_dir + str(year) + '/' + str(month) + '.csv'
    # Create the per-year folder if needed so that to_csv cannot fail on a missing directory.
    output_folder = os.path.dirname(output_path)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    print(str(datetime.now()) + ' saving to ' + output_path)
    daily.to_csv(output_path, index = False)
                

In [4]:
# required spatial level of modal locations
# could be tower_level or district_level
location_spatial_level = 'district_level'

# data directories
raw_cdr_dir = '/path/to/data/cdr/'
# The output folder is derived from the spatial level so that the two settings
# cannot drift apart (e.g. tower-level results written to the district folder).
output_dir = '/path/to/data/daily_modal_location/' + location_spatial_level + '/'

In [None]:
# Process the voice records of 2013-2016: every file found under <year>/rec/.
for year in ('2013', '2014', '2015', '2016'):
    voice_data_dir = raw_cdr_dir + year + "/rec/"

    for file_name in os.listdir(voice_data_dir):
        # The two characters at positions -6 and -5 of the file name are
        # parsed as the month number and zero-padded to two digits.
        month = "{0:0=2d}".format(int(file_name[-6:-4]))
        file_path = voice_data_dir + file_name

        compute_daily_modal_location(file_path, output_dir, year, month,
                                     location_spatial_level)


In [None]:
# Process the voice records of 2017, which sit in two sub-folders that are
# handled one after the other.
for year in ('2017',):
    for part_dir in ("/rec_part1/", "/rec_part2/"):
        voice_data_dir = raw_cdr_dir + year + part_dir

        for file_name in os.listdir(voice_data_dir):
            # The two characters at positions -6 and -5 of the file name are
            # parsed as the month number and zero-padded to two digits.
            month = "{0:0=2d}".format(int(file_name[-6:-4]))
            file_path = voice_data_dir + file_name

            compute_daily_modal_location(file_path, output_dir, year, month,
                                         location_spatial_level)
        

In [None]:
# Process the voice records of 2018: every file found under 2018/rec/.
for year in ('2018',):
    voice_data_dir = raw_cdr_dir + year + "/rec/"

    for file_name in os.listdir(voice_data_dir):
        # Here the month is parsed from the characters at positions 4 and 5
        # of the file name and zero-padded to two digits.
        month = "{0:0=2d}".format(int(file_name[4:6]))
        file_path = voice_data_dir + file_name

        compute_daily_modal_location(file_path, output_dir, year, month,
                                     location_spatial_level)


In [None]:
# Process the voice records of 2019-2020: every file found under <year>/rec/.
for year in ('2019', '2020'):
    voice_data_dir = raw_cdr_dir + year + "/rec/"

    for file_name in os.listdir(voice_data_dir):
        # The two characters at positions -6 and -5 of the file name are
        # parsed as the month number and zero-padded to two digits.
        month = "{0:0=2d}".format(int(file_name[-6:-4]))
        file_path = voice_data_dir + file_name

        compute_daily_modal_location(file_path, output_dir, year, month,
                                     location_spatial_level)
        