In [1]:
import random
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Relative path of the folder that the generator cells in this notebook read
# their inputs from (e.g. regions.geojson) and write their CSV outputs to.
synthetic_data_folder = 'synthetic_data'

In [3]:
# Generate table of towers
#
# Samples `num_towers` locations uniformly at random inside the first region
# polygon of regions.geojson and writes them to towers.csv with the columns
# latitude, longitude and tower_id ('t0', 't1', ...).
np.random.seed(100)
num_towers = 100

shapefile = gpd.read_file(synthetic_data_folder + '/regions.geojson')
shapefile = shapefile.dissolve(by='OBJECTID')
# dissolve() re-indexes the frame by OBJECTID, so pick the first region by
# position. A label lookup with [0] only works if some OBJECTID equals 0.
polygon = shapefile['geometry'].iloc[0]

# Rejection sampling: draw candidates uniformly in the polygon's bounding box
# and keep only the ones that actually fall inside the polygon.
minx, miny, maxx, maxy = polygon.bounds
lats, lons = [], []
while len(lats) < num_towers:
    point = Point(np.random.uniform(minx, maxx), np.random.uniform(miny, maxy))
    if polygon.contains(point):
        lats.append(point.y)  # y coordinate is stored as latitude
        lons.append(point.x)  # x coordinate is stored as longitude

points = pd.DataFrame({'latitude': lats, 'longitude': lons})
points['tower_id'] = ['t' + str(i) for i in range(len(points))]
points.to_csv(synthetic_data_folder + '/towers.csv', index=False)

In [4]:
# Generate table of antennas
#
# Every tower in towers.csv receives between 1 and 5 antennas. A further
# `num_extra_antennas` antennas are appended with a missing (NaN) tower_id.
# The combined table (antenna_id, tower_id) is written to antennas.csv.
np.random.seed(200)
num_extra_antennas = 10

towers = pd.read_csv(synthetic_data_folder + '/towers.csv')
towerids = towers['tower_id'].values

# One entry per antenna holding the id of its tower; the antenna count is
# drawn once per tower, in tower order.
tower_of_antenna = [tid for tid in towerids for _ in range(np.random.randint(1, 6))]
antennas = pd.DataFrame({'tower_id': tower_of_antenna})
antennas['antenna_id'] = ['a' + str(n) for n in range(len(antennas))]
antennas = antennas[['antenna_id', 'tower_id']]

# Antennas without a tower continue the numbering after the last tower antenna.
max_antenna = int(antennas['antenna_id'].iloc[-1][1:])
first_extra = max_antenna + 1
extra_ids = ['a' + str(n) for n in range(first_extra, first_extra + num_extra_antennas)]
antennas_notower = pd.DataFrame(
    {'antenna_id': extra_ids, 'tower_id': [np.nan] * len(extra_ids)},
    dtype=object,
)

antennas = pd.concat([antennas, antennas_notower])
antennas.to_csv(synthetic_data_folder + '/antennas.csv', index=False)

In [5]:
# Generate table of subscribers
#
# Builds `num_subscribers` random identifiers, each `strlen` ASCII letters
# (lower and upper case), and writes them to subscribers.csv in a single
# subscriber_id column.
num_subscribers = 1000
strlen = 10
random.seed(300)

alphabet = string.ascii_lowercase + string.ascii_uppercase
subscriber_ids = [
    ''.join(random.choice(alphabet) for _ in range(strlen))
    for _ in range(num_subscribers)
]
subs = pd.DataFrame({'subscriber_id': subscriber_ids})
subs.to_csv(synthetic_data_folder + '/subscribers.csv', index=False)

In [6]:
# Generate call/SMS table
#
# Draws `num_txns` random transactions between two distinct subscribers with a
# timestamp in [start, end). Each side is assigned a random antenna, which is
# blanked out (NaN) with probability 1/10. Voice rows get a duration of 0-299;
# SMS rows have no duration. The table is written to cdr.csv sorted by timestamp.
np.random.seed(400)
num_txns = 100000

subs = pd.read_csv(synthetic_data_folder + '/subscribers.csv')['subscriber_id'].values.flatten()
antenna_dict = pd.read_csv(synthetic_data_folder + '/antennas.csv')
antennas = antenna_dict['antenna_id'].values.flatten()
towers = pd.read_csv(synthetic_data_folder + '/towers.csv')  # not used further in this cell

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = ((end - start).days)*24*60*60

txns = []
while len(txns) < num_txns:
    caller = np.random.choice(subs)
    recipient = np.random.choice(subs)
    caller_antenna = np.random.choice(antennas)
    recipient_antenna = np.random.choice(antennas)
    international = np.random.choice(['domestic', 'international', 'other'], p=[.98, .01, .01])
    # Roughly one in ten records is missing the antenna on either side.
    if np.random.randint(0, 10) == 0:
        caller_antenna = np.nan
    if np.random.randint(0, 10) == 0:
        recipient_antenna = np.nan
    timedif = np.random.randint(0, seconds)
    timestamp = start + pd.Timedelta(seconds=timedif)
    # Self-calls are discarded; the loop keeps drawing until num_txns rows exist.
    if caller != recipient:
        txns.append([caller, recipient, caller_antenna, recipient_antenna, timestamp, international])

txns = pd.DataFrame(
    txns,
    columns=['caller_id', 'recipient_id', 'caller_antenna', 'recipient_antenna', 'timestamp', 'international'],
)
txns['interaction'] = np.random.choice(['voice', 'sms'], size=len(txns))
txns['duration'] = np.random.randint(0, 300, size=len(txns))
# Only voice calls have a duration. Vectorized replacement for a row-wise
# apply: non-voice rows become NaN and the column is upcast to float.
txns['duration'] = txns['duration'].where(txns['interaction'] == 'voice')
txns = txns.sort_values('timestamp', ascending=True)
txns = txns[['interaction', 'caller_id', 'recipient_id', 'timestamp', 'duration', 'caller_antenna',
             'recipient_antenna', 'international']]

txns.to_csv(synthetic_data_folder + '/cdr.csv', index=False)

In [7]:
# Preview the first rows of the call/SMS table (already sorted by timestamp).
txns.head()

Unnamed: 0,interaction,caller_id,recipient_id,timestamp,duration,caller_antenna,recipient_antenna,international
76053,voice,ptudOnjdRU,hZvsVsiGbO,2020-01-01 00:00:42,253.0,a101,a54,domestic
55660,sms,UGDojtMoWq,lpAchcvhBO,2020-01-01 00:02:04,,a44,a110,domestic
49363,sms,sQGOjnDvvt,PGiqQSpzov,2020-01-01 00:02:12,,a145,a96,domestic
76511,voice,oPbUuvHBgi,IdYycyWmjl,2020-01-01 00:02:23,96.0,a84,a36,domestic
90162,sms,pSwWTbHbug,sRLqqorjkH,2020-01-01 00:03:05,,a261,a268,domestic


In [8]:
# Generate mobile data usage table
#
# A random `percent_use_data` share of subscribers are data users. For them,
# `num_data_txns` usage records are sampled with replacement, each with a
# normally distributed volume (mean 100, sd 20) and a random timestamp in
# [start, end). The result is written to mobiledata.csv sorted by timestamp.
np.random.seed(500)
num_data_txns = 10000
percent_use_data = 0.6

all_subs = pd.read_csv(synthetic_data_folder + '/subscribers.csv')
num_data_users = int(len(all_subs) * percent_use_data)
# Shuffle, then keep the leading slice as the data-using population.
subs = all_subs.sample(frac=1, replace=False).iloc[:num_data_users]

mobiledata = subs.sample(num_data_txns, replace=True)
mobiledata['volume'] = np.random.normal(100, 20, num_data_txns)

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = ((end - start).days)*24*60*60

# One random offset (in seconds from `start`) per record.
offsets = [np.random.randint(0, seconds) for _ in range(num_data_txns)]
timestamps = [start + pd.Timedelta(seconds=offset) for offset in offsets]
mobiledata['timestamp'] = timestamps
mobiledata = mobiledata.sort_values(by='timestamp')

mobiledata.to_csv(synthetic_data_folder + '/mobiledata.csv', index=False)

In [9]:
# Preview the first rows of the mobile data table (already sorted by timestamp).
mobiledata.head()

Unnamed: 0,subscriber_id,volume,timestamp
163,MJKWEawMBC,91.386518,2020-01-01 00:01:15
211,yofaYDcyWJ,118.898352,2020-01-01 00:02:27
507,XstqtfmwLR,67.682133,2020-01-01 00:16:33
105,ZxDgrIqzha,65.525068,2020-01-01 00:23:37
713,oPbUuvHBgi,99.461383,2020-01-01 00:41:44


In [10]:
# Generate top-ups table
#
# A random `percent_use_topups` share of subscribers make top-ups. For them,
# `num_topup_txns` records are sampled with replacement, each with an integer
# amount in [0, 100) and a random timestamp in [start, end). The result is
# written to topups.csv sorted by timestamp.
np.random.seed(600)
num_topup_txns = 10000
percent_use_topups = 0.8

subs = pd.read_csv(synthetic_data_folder + '/subscribers.csv')
subs = subs.sample(frac=1, replace=False)[:int(len(subs)*percent_use_topups)]
topups = subs.sample(num_topup_txns, replace=True)
topups['amount'] = np.random.randint(0, 100, num_topup_txns)

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = ((end - start).days)*24*60*60
# Size the timestamp list by this cell's own record count. The previous code
# used the mobile-data cell's count, which made this cell depend on that cell
# having run and on both counts being equal.
timestamps = [start + pd.Timedelta(seconds=np.random.randint(0, seconds)) for _ in range(num_topup_txns)]
topups['timestamp'] = timestamps
topups = topups.sort_values('timestamp', ascending=True)

topups.to_csv(synthetic_data_folder + '/topups.csv', index=False)

In [11]:
# Preview the first rows of the top-ups table (already sorted by timestamp).
topups.head()

Unnamed: 0,subscriber_id,amount,timestamp
257,WzwHpoldPp,96,2020-01-01 00:02:47
426,xkThzuCDAY,73,2020-01-01 00:04:33
747,OtFfOxcGMu,98,2020-01-01 00:08:36
74,pjjlDwunYH,7,2020-01-01 00:14:37
684,RBDklSsXkt,76,2020-01-01 00:24:36


In [26]:
# Generate mobile money table
#
# A random `percent_use_mm` share of subscribers use mobile money. Draws
# `num_mm_txns` transactions whose type follows `txn_probs`. Cash-in and
# cash-out rows have no recipient; all other types get a random recipient
# different from the sender. Amounts and starting balances are normally
# distributed. The result is written to mobilemoney.csv sorted by timestamp.
np.random.seed(700)
num_mm_txns = 10000
percent_use_mm = 0.7
txn_types = ['cashin', 'cashout', 'p2p', 'billpay', 'other']
txn_probs = [.2, .2, .4, .1, .1]

subs = pd.read_csv(synthetic_data_folder + '/subscribers.csv')
subs = subs.sample(frac=1, replace=False)[:int(len(subs)*percent_use_mm)]
subs = subs['subscriber_id'].values.flatten()

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = ((end - start).days)*24*60*60

mm_txns = []
while len(mm_txns) < num_mm_txns:
    txn_type = np.random.choice(txn_types, p=txn_probs)
    sender = np.random.choice(subs)
    if txn_type in ['p2p', 'billpay', 'other']:
        recipient = np.random.choice(subs)
    else:
        recipient = np.nan
    timedif = np.random.randint(0, seconds)
    timestamp = start + pd.Timedelta(seconds=timedif)
    # Drops self-transfers. A NaN recipient never equals the sender, so
    # cash-in and cash-out rows always pass this check.
    if sender != recipient:
        mm_txns.append([txn_type, sender, recipient, timestamp])

mm_txns = pd.DataFrame(mm_txns, columns = ['txn_type', 'sender', 'recipient', 'timestamp'])
# The amount is stored without a sign flip for any type; the balance columns
# below encode the direction of the transaction.
mm_txns['amount'] = np.random.normal(50, 10, num_mm_txns)

# Sender side: a cash-in increases the balance, every other type decreases it.
mm_txns['sender_balance_before'] = np.random.normal(200, 40, num_mm_txns)
is_cashin = mm_txns['txn_type'] == 'cashin'
mm_txns['sender_balance_after'] = (
    (mm_txns['sender_balance_before'] + mm_txns['amount'])
    .where(is_cashin, mm_txns['sender_balance_before'] - mm_txns['amount'])
)

# Recipient side: balances are drawn for every row (keeping the random stream
# unchanged) and then blanked for cash-in and cash-out rows.
mm_txns['recipient_balance_before'] = np.random.normal(200, 40, num_mm_txns)
has_no_recipient = is_cashin | (mm_txns['txn_type'] == 'cashout')
mm_txns['recipient_balance_before'] = mm_txns['recipient_balance_before'].where(~has_no_recipient)
mm_txns['recipient_balance_after'] = mm_txns['recipient_balance_before'] + mm_txns['amount']
mm_txns = mm_txns.sort_values('timestamp', ascending=True)

mm_txns.to_csv(synthetic_data_folder + '/mobilemoney.csv', index=False)

In [27]:
# Preview the first 50 rows of the mobile money table (already sorted by timestamp).
mm_txns.head(50)

Unnamed: 0,txn_type,sender,recipient,timestamp,amount,sender_balance_before,sender_balance_after,recipient_balance_before,recipient_balance_after
8557,p2p,NFKHIoqbNt,bdvjmfSTsU,2020-01-01 00:15:51,55.163313,324.218747,269.055434,112.544987,167.7083
9904,cashout,yURSuqrHbG,,2020-01-01 00:18:59,81.860999,185.576198,103.715199,,
5188,cashin,IvOrlIzYfz,,2020-01-01 00:28:17,29.225047,98.638294,127.863341,,
6900,p2p,cyedGKnkBl,hTYedXuVwS,2020-01-01 00:28:29,45.561912,248.718378,203.156466,212.016381,257.578293
2245,cashout,vAwnWzAdpz,,2020-01-01 00:31:03,55.040771,150.886443,95.845672,,
9574,cashout,SFVkVBNAYV,,2020-01-01 00:46:50,36.49293,238.312792,201.819863,,
2606,other,HNQbVKqVtS,HmXwlwGPuT,2020-01-01 00:48:48,48.763501,163.332699,114.569198,147.233665,195.997166
6702,p2p,XnMzZAZGDz,HojSyqGASE,2020-01-01 00:57:57,64.593197,191.028815,126.435617,167.967608,232.560805
1135,cashout,CCjaSRuFZy,,2020-01-01 01:07:44,60.645694,183.500458,122.854765,,
2151,cashin,KbnRaCwoPO,,2020-01-01 01:09:32,32.571618,141.744339,174.315958,,
