In [1]:
import random
import os
import string
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Base folder for the generated files: the notebook's current working directory.
synthetic_data_folder = Path.cwd()

In [None]:
# Generate table of towers: place `num_towers` random points inside one polygon
# of regions.geojson and save them to towers.csv.
np.random.seed(100)
num_towers = 100

shapefile = gpd.read_file(synthetic_data_folder / 'regions.geojson').dissolve(by='OBJECTID')
# NOTE(review): `[0]` is a lookup on the dissolved frame's index (presumably the
# OBJECTID values) -- confirm it selects the intended polygon.
polygon = shapefile['geometry'][0]

minx, miny, maxx, maxy = polygon.bounds

# Rejection sampling: draw candidates from the bounding box and keep only those
# that fall inside the polygon. x is drawn before y for every candidate.
lats, lons = [], []
while len(lats) < num_towers:
    cand_x = np.random.uniform(minx, maxx)
    cand_y = np.random.uniform(miny, maxy)
    point = Point(cand_x, cand_y)
    if not polygon.contains(point):
        continue
    lats.append(point.y)
    lons.append(point.x)

points = pd.DataFrame({'latitude': lats, 'longitude': lons})
# Tower IDs are 't0', 't1', ... in generation order.
points['tower_id'] = ['t' + str(i) for i in range(len(points))]
points.to_csv(synthetic_data_folder / 'towers.csv', index=False)

In [None]:
# Generate table of antennas: every tower gets between 1 and 5 antennas, plus
# `num_extra_antennas` antennas that are not attached to any tower.
np.random.seed(200)
num_extra_antennas = 10

towers = pd.read_csv(synthetic_data_folder / 'towers.csv')
towerids = towers['tower_id'].values

# One draw per tower, in tower order (the upper bound of randint is exclusive).
antennas_per_tower = [np.random.randint(1, 6) for _ in towerids]
tower_of_antenna = np.repeat(towerids, antennas_per_tower)
antennas = pd.DataFrame({
    'antenna_id': ['a' + str(i) for i in range(len(tower_of_antenna))],
    'tower_id': tower_of_antenna,
})

# Antennas without a tower continue the numbering after the last attached antenna.
max_antenna = int(antennas['antenna_id'].values[-1][1:])
first_free = max_antenna + 1
antennas_notower = pd.DataFrame(
    {
        'antenna_id': ['a' + str(i) for i in range(first_free, first_free + num_extra_antennas)],
        'tower_id': [np.nan] * num_extra_antennas,
    },
    dtype=object,
)

antennas = pd.concat([antennas, antennas_notower])
antennas.to_csv(synthetic_data_folder / 'antennas.csv', index=False)

In [6]:
# Generate table of subscribers: random phone numbers with `number_length`
# digits, stored as strings.
num_subscribers = 1000
random.seed(300)
number_length = 7
min_numeric_value = 10**(number_length - 1)  # smallest value with `number_length` digits
max_numeric_value = 10**(number_length)      # exclusive upper bound for randrange

# NOTE(review): numbers are drawn independently, so duplicates are possible.
subs = pd.DataFrame({
    'subscriber_id': [
        str(random.randrange(min_numeric_value, max_numeric_value))
        for _ in range(num_subscribers)
    ]
})
subs.to_csv(synthetic_data_folder / 'subscribers.csv', index=False)

In [None]:
# This block generates already-anonymized phone numbers.
# It is switched off by the `if False` guard; when enabled it overwrites
# subscribers.csv with random `strlen`-letter IDs.
if False:
    # Generate table of subscribers
    num_subscribers = 1000
    strlen = 10
    random.seed(300)

    alphabet = string.ascii_lowercase + string.ascii_uppercase
    subs = pd.DataFrame({
        'subscriber_id': [
            ''.join(random.choice(alphabet) for _ in range(strlen))
            for _ in range(num_subscribers)
        ]
    })
    subs.to_csv(synthetic_data_folder / 'subscribers.csv', index=False)

In [7]:
# Generate call/SMS table: `num_txns` random calls and texts between two
# different subscribers, written to cdr.csv in timestamp order.
np.random.seed(400)
num_txns = 100000

subs = pd.read_csv(synthetic_data_folder / 'subscribers.csv')['subscriber_id'].values.flatten()
antenna_dict = pd.read_csv(synthetic_data_folder / 'antennas.csv')
antennas = antenna_dict['antenna_id'].values.flatten()
towers = pd.read_csv(synthetic_data_folder / 'towers.csv')

# Timestamps are whole-second offsets from `start`, drawn from [0, seconds).
start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = (end - start).days * 24 * 60 * 60

txns = []
while len(txns) < num_txns:
    # The order of the random draws fixes the output for a given seed; every draw
    # happens before the self-call check, so rejected rows consume draws too.
    caller = np.random.choice(subs)
    recipient = np.random.choice(subs)
    caller_antenna = np.random.choice(antennas)
    recipient_antenna = np.random.choice(antennas)
    international = np.random.choice(['domestic', 'international', 'other'], p=[.98, .01, .01])
    # Each side's antenna is blanked when its own randint(0, 10) draw is 0.
    caller_antenna = np.nan if np.random.randint(0, 10) == 0 else caller_antenna
    recipient_antenna = np.nan if np.random.randint(0, 10) == 0 else recipient_antenna
    timestamp = start + pd.Timedelta(seconds=np.random.randint(0, seconds))
    if caller != recipient:
        txns.append([caller, recipient, caller_antenna, recipient_antenna, timestamp, international])

txns = pd.DataFrame(
    txns,
    columns=['caller_id', 'recipient_id', 'caller_antenna', 'recipient_antenna', 'timestamp', 'international'],
)
txns['txn_type'] = np.random.choice(['call', 'text'], size=len(txns))
txns['duration'] = np.random.randint(0, 300, size=len(txns))
# Only calls carry a duration; rows of any other type get NaN.
txns['duration'] = txns['duration'].where(txns['txn_type'] == 'call')

output_columns = ['txn_type', 'caller_id', 'recipient_id', 'timestamp', 'duration', 'caller_antenna',
                  'recipient_antenna', 'international']
txns = txns.sort_values('timestamp', ascending=True)[output_columns]

txns.to_csv(synthetic_data_folder / 'cdr.csv', index=False)

In [8]:
# Show the first rows of the call/SMS table.
txns.head()

Unnamed: 0,txn_type,caller_id,recipient_id,timestamp,duration,caller_antenna,recipient_antenna,international
76053,call,4295331,8728182,2020-01-01 00:00:42,253.0,a101,a54,domestic
55660,text,8908693,3785862,2020-01-01 00:02:04,,a44,a110,domestic
49363,text,2582376,3283017,2020-01-01 00:02:12,,a145,a96,domestic
76511,call,9251012,6707474,2020-01-01 00:02:23,96.0,a84,a36,domestic
90162,text,1275856,8935488,2020-01-01 00:03:05,,a261,a268,domestic


In [15]:
# Generate mobile data usage table: a random `percent_use_data` share of the
# subscribers produces `num_data_txns` usage records, written to mobiledata.csv.
np.random.seed(500)
num_data_txns = 10000
percent_use_data = 0.6

subs = pd.read_csv(synthetic_data_folder / 'subscribers.csv')
# Shuffle, then keep the leading share of rows as the data users.
num_data_users = int(len(subs) * percent_use_data)
subs = subs.sample(frac=1, replace=False).iloc[:num_data_users]

# Each record belongs to a data user drawn with replacement.
mobiledata = subs.sample(num_data_txns, replace=True)
mobiledata = mobiledata.rename(columns={'subscriber_id': 'caller_id'})
mobiledata['volume'] = np.random.normal(100, 20, num_data_txns)

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = (end - start).days * 24 * 60 * 60
# One whole-second offset from `start` per record.
offsets = [np.random.randint(0, seconds) for _ in range(num_data_txns)]
timestamps = [start + pd.Timedelta(seconds=offset) for offset in offsets]
mobiledata['timestamp'] = timestamps
mobiledata = mobiledata.sort_values('timestamp', ascending=True)

mobiledata.to_csv(synthetic_data_folder / 'mobiledata.csv', index=False)

In [10]:
# Show the first rows of the mobile data table.
mobiledata.head()

Unnamed: 0,subscriber_id,volume,timestamp
163,8231353,91.386518,2020-01-01 00:01:15
211,6675814,118.898352,2020-01-01 00:02:27
507,1666393,67.682133,2020-01-01 00:16:33
105,8690863,65.525068,2020-01-01 00:23:37
713,9251012,99.461383,2020-01-01 00:41:44


In [17]:
# Generate recharges table: a random `percent_use_recharges` share of the
# subscribers produces `num_recharge_txns` recharge records, written to recharges.csv.
np.random.seed(600)
num_recharge_txns = 10000
percent_use_recharges = 0.8

subs = pd.read_csv(synthetic_data_folder / 'subscribers.csv').rename(columns={'subscriber_id': 'caller_id'})
# Shuffle, then keep the leading share of rows as the recharging subscribers.
subs = subs.sample(frac=1, replace=False)[:int(len(subs)*percent_use_recharges)]
# Each record belongs to one of those subscribers, drawn with replacement.
recharges = subs.sample(num_recharge_txns, replace=True)
recharges['amount'] = np.random.randint(0, 100, num_recharge_txns)

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = ((end - start).days)*24*60*60
# One timestamp per recharge row. The list is sized by num_recharge_txns (not the
# mobile-data cell's num_data_txns), so this cell runs on its own and stays
# correct if the two counts ever differ.
timestamps = [start + pd.Timedelta(seconds=np.random.randint(0, seconds)) for _ in range(num_recharge_txns)]
recharges['timestamp'] = timestamps
recharges = recharges.sort_values('timestamp', ascending=True)

recharges.to_csv(synthetic_data_folder / 'recharges.csv', index=False)

In [18]:
# Show the first rows of the recharges table.
recharges.head()

Unnamed: 0,caller_id,amount,timestamp
257,2962629,96,2020-01-01 00:02:47
426,8450707,73,2020-01-01 00:04:33
747,7797892,98,2020-01-01 00:08:36
74,2474502,7,2020-01-01 00:14:37
684,1215791,76,2020-01-01 00:24:36


In [41]:
# Generate mobile money table: a random `percent_use_mm` share of the subscribers
# produces `num_mm_txns` transactions, written to mobilemoney.csv.
np.random.seed(700)
num_mm_txns = 10000
percent_use_mm = 0.7
txn_types = ['cashin', 'cashout', 'p2p', 'billpay', 'other']
txn_probs = [.2, .2, .4, .1, .1]

# IDs are read as strings here.
subs = pd.read_csv(synthetic_data_folder / 'subscribers.csv', dtype={'subscriber_id': str})
num_mm_users = int(len(subs) * percent_use_mm)
subs = subs.sample(frac=1, replace=False).iloc[:num_mm_users]
subs = subs['subscriber_id'].values.flatten()

start = pd.to_datetime('2020-01-01 00:00:00')
end = pd.to_datetime('2020-03-01 00:00:00')
seconds = (end - start).days * 24 * 60 * 60

mm_txns = []
while len(mm_txns) < num_mm_txns:
    txn_type = np.random.choice(txn_types, p=txn_probs)
    caller = np.random.choice(subs)
    # Only p2p, billpay and other transactions have a counterparty.
    recipient = np.random.choice(subs) if txn_type in ('p2p', 'billpay', 'other') else np.nan
    timestamp = start + pd.Timedelta(seconds=np.random.randint(0, seconds))
    # Transfers to oneself are discarded and redrawn.
    if caller != recipient:
        mm_txns.append([txn_type, caller, recipient, timestamp])

mm_txns = pd.DataFrame(mm_txns, columns=['txn_type', 'caller_id', 'recipient_id', 'timestamp'])
mm_txns['amount'] = np.random.normal(50, 10, num_mm_txns)
# Amounts keep their sign for every transaction type; the sign flip for cashout
# below is left disabled.
#mm_txns['amount'] = mm_txns\
#    .apply(lambda row: -row['amount'] if row['txn_type'] == 'cashout' 
#           else row['amount'], axis=1)
mm_txns['sender_balance_before'] = np.random.normal(200, 40, num_mm_txns)
# A cash-in raises the sender's balance; every other type lowers it.
is_cashin = mm_txns['txn_type'] == 'cashin'
mm_txns['sender_balance_after'] = np.where(
    is_cashin,
    mm_txns['sender_balance_before'] + mm_txns['amount'],
    mm_txns['sender_balance_before'] - mm_txns['amount'],
)
# Recipient balances exist only for transactions that have a recipient.
recipient_balance = np.random.normal(200, 40, num_mm_txns)
no_recipient = mm_txns['txn_type'].isin(['cashin', 'cashout'])
mm_txns['recipient_balance_before'] = np.where(no_recipient, np.nan, recipient_balance)
mm_txns['recipient_balance_after'] = mm_txns['recipient_balance_before'] + mm_txns['amount']
mm_txns = mm_txns.sort_values('timestamp', ascending=True)

mm_txns.to_csv(synthetic_data_folder / 'mobilemoney.csv', index=False)

In [32]:
# Generate labels (using the features dataset which will be read in by the ML module).
# The label is a polynomial of three feature columns plus integer noise; 50 rows are
# then drawn with replacement and given a random weight.
np.random.seed(1)
feats = pd.read_csv('../synthetic_data/features.csv')

signal = (
    feats['number_of_interactions_alldir_allweek_allday_call']**2
    + feats['active_days_weekend_day']**3
    + feats['location_count(regions)']**5
)
noise = np.random.randint(0, 50000, len(feats))
feats['label'] = signal + noise

# The row sample has its own fixed random_state.
feats = feats[['name', 'label']].sample(50, replace=True, random_state=2)
feats['weight'] = np.random.rand(len(feats)) * 100
feats.to_csv('../synthetic_data/labels.csv', index=False)


In [32]:
# Generate home locations: every subscriber gets a random antenna, tower, region and
# prefecture. Each attribute is sampled independently (with replacement) from the
# distinct values of its source column, so the four values of one subscriber are
# not tied to each other.
# No seed is set in this cell; the draws continue from NumPy's global random state
# (pandas' sample falls back to it when random_state is not given).
subscribers = pd.read_csv('../synthetic_data/subscribers.csv')
antennas = pd.read_csv('../synthetic_data/antennas.csv')
regions = gpd.read_file('../synthetic_data/regions.geojson')
prefectures = gpd.read_file('../synthetic_data/prefectures.geojson')

# NOTE(review): drop_duplicates keeps a missing value as one candidate, so if
# antennas.csv contains antennas without a tower, some subscribers get an empty
# tower_id -- confirm this is intended.
location_sources = [
    ('antenna_id', antennas['antenna_id']),
    ('tower_id', antennas['tower_id']),
    ('regions', regions['region']),
    ('prefectures', prefectures['region']),
]
for column, source in location_sources:
    candidates = source.drop_duplicates()
    subscribers[column] = candidates.sample(len(subscribers), replace=True).values

subscribers.to_csv('../synthetic_data/home_locations.csv', index=False)

### Survey data

In [None]:
# Generate a synthetic survey: one row per respondent with an ID, a phone number,
# a weight, a consumption value, and 10 binary, 10 continuous and 10 categorical
# answers. About 1% of the answer and consumption cells are blanked out.
np.random.seed(100)
# Phone numbers are drawn with the stdlib `random` module, which np.random.seed
# does not affect. Seed it as well so this cell gives the same phone numbers no
# matter which cells ran before it.
random.seed(100)
obs = 1000

# Unique IDs
uids = np.arange(obs)

# Phone numbers: random strings of `strlen` ASCII letters
strlen = 10
alphabet = string.ascii_lowercase + string.ascii_uppercase
subs = [''.join(random.choice(alphabet) for _ in range(strlen)) for _ in range(obs)]

# Consumption
consumption = np.random.rand(obs)*50

# Binary questions: uniform draws rounded to 0 or 1
binary_questions = 10
binary_data = pd.DataFrame([np.round(np.random.rand(obs)) for _ in range(binary_questions)]).T
binary_columns = ['bin' + str(i) for i in range(binary_questions)]
binary_data.columns = binary_columns

# Continuous questions: question k (1-based) is uniform on [0, k)
continuous_questions = 10
continuous_data = pd.DataFrame([np.random.rand(obs)*i for i in range(1, 1+continuous_questions)]).T
continuous_columns = ['con' + str(i) for i in range(continuous_questions)]
continuous_data.columns = continuous_columns

# Categorical questions: integer codes 0-9
categorical_questions = 10
categorical_data = pd.DataFrame([np.random.randint(low=0, high=10, size=obs)
                                 for _ in range(categorical_questions)]).T
categorical_columns = ['cat' + str(i) for i in range(categorical_questions)]
categorical_data.columns = categorical_columns

# Combine data
data = pd.concat([binary_data, continuous_data, categorical_data], axis=1)
data['unique_id'] = uids
data['consumption'] = consumption
data['weight'] = np.random.rand(obs)*10
data['phone_number'] = subs
data = data[['unique_id', 'phone_number', 'weight', 'consumption'] + binary_columns + continuous_columns +
            categorical_columns]

# Add some missing data: each cell is blanked when its own uniform draw is below 1%.
# The ID, phone number and weight columns are left complete.
for col in ['consumption'] + binary_columns + categorical_columns + continuous_columns:
    data[col] = data[col].apply(lambda x: np.nan if np.random.rand() < .01 else x)

# Save to file
data.to_csv('survey.csv', index=False)

In [None]:
# Show the first rows of the survey table.
data.head()