# Anonymizer

In [1]:
from cider.datastore import DataStore
from cider.anonymizer import Anonymizer
from pandas.api.types import is_numeric_dtype

import os
import sys

import pandas as pd

# Prevents python version mismatches between spark driver and executor:
# both environment variables are set to the interpreter running this notebook (sys.executable).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Set up the configuration file and load some simulated data, including featurization results if present, using the datastore.

In [2]:
# This path should point to your cider installation, where configs and data for this demo are located.
# In particular, it must contain the file `configs/config_anonymize.yml`, which is loaded below.
# NOTE(review): the original comment was cut off mid-sentence; confirm whether further requirements apply.
from pathlib import Path
cider_installation_directory = Path('../../cider')

# Build the datastore from the anonymization config, then an anonymizer that uses that datastore.
datastore = DataStore(config_file_path_string= cider_installation_directory / 'configs' / 'config_anonymize.yml')
anonymizer = Anonymizer(datastore=datastore)

# Path exposed by the anonymizer; the cells below read CSV files from its 'outputs' subfolder.
outputs_path = anonymizer.outputs_path

23/05/02 11:58:40 WARN Utils: Your hostname, Leos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.50.152 instead (on interface en0)
23/05/02 11:58:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/02 11:58:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

Loading CDR...


                                                                                

Loading recharges...
SUCCESS!
Loading mobile data...
Loading mobile money...


## Anonymize input data

We can anonymize the five main categories of input data which contain phone numbers: 

- CDR (calls and texts)
- Mobile money transactions
- Mobile data transactions
- Recharges
- Labels (e.g. from a ground-truth survey)

It's important to anonymize these using the same anonymization salt, to ensure that all of
a given subscriber's data is indexed by the same obfuscated string.

In [3]:
# Anonymize the CDR data, then read 'cdr.csv' back from the outputs folder and preview the first rows.
# NOTE(review): presumably anonymize_cdr() is what writes that file — confirm.
anonymizer.anonymize_cdr()

anonymized_cdr = pd.read_csv(outputs_path / 'outputs' / 'cdr.csv')
anonymized_cdr.head()

                                                                                

Unnamed: 0,txn_type,caller_id,recipient_id,timestamp,duration,caller_antenna,recipient_antenna,international
0,call,bLXgD9re7YmQAanW,M0G6yPrmRxlDRK7p,2020-01-01 00:00:42,253.0,a101,a54,domestic
1,text,G5vAyAYdVGxyadO9,jVa6z1KrX1lQdMvE,2020-01-01 00:02:04,,a44,a110,domestic
2,text,Lw7K8zWqd2kQEPAm,Np7OPQL5w1OyvVaG,2020-01-01 00:02:12,,a145,a96,domestic
3,call,VpAaQaEpX0MQe9qv,O8xoDrklJrrQEGLA,2020-01-01 00:02:23,96.0,a84,a36,domestic
4,text,EVpAaQaelY8Qe9qv,0vK4yVb9ja7y71OZ,2020-01-01 00:03:05,,a261,a268,domestic


In [4]:
# Anonymize the mobile money data, then read 'mobilemoney.csv' back from the outputs folder and preview it.
# NOTE(review): presumably anonymize_mobilemoney() is what writes that file — confirm.
anonymizer.anonymize_mobilemoney()

anonymized_mobilemoney = pd.read_csv(outputs_path / 'outputs' / 'mobilemoney.csv')
anonymized_mobilemoney.head()

                                                                                

Unnamed: 0,txn_type,caller_id,recipient_id,timestamp,amount,sender_balance_before,sender_balance_after,recipient_balance_before,recipient_balance_after
0,p2p,gG5vAyA6pkxzadO9,bLXgD9o5xPayAanW,2020-01-01 00:15:51,55.163315,324.21875,269.05542,112.54498,167.7083
1,cashout,ZgErQY4EbL2DxP6V,,2020-01-01 00:18:59,81.861,185.5762,103.7152,,
2,cashin,MvZqyx7kRlxQen12,,2020-01-01 00:28:17,29.225048,98.63829,127.86334,,
3,p2p,o0vK4yVX4YMD71OZ,9eALxDXjrWBydlpO,2020-01-01 00:28:29,45.561913,248.71838,203.15646,212.01639,257.57828
4,cashout,76YPy0l8OAMyJpN3,,2020-01-01 00:31:03,55.04077,150.88644,95.84567,,


In [5]:
# Anonymize the mobile data usage records, then read 'mobiledata.csv' back from the outputs folder and preview it.
# NOTE(review): presumably anonymize_mobiledata() is what writes that file — confirm.
anonymizer.anonymize_mobiledata()

anonymized_mobiledata = pd.read_csv(outputs_path / 'outputs' / 'mobiledata.csv')
anonymized_mobiledata.head()

Unnamed: 0,caller_id,volume,timestamp
0,BWvLyeoJmaODdYGA,91.38652,2020-01-01 00:01:15
1,en2JQGr5A24D0pYA,118.89835,2020-01-01 00:02:27
2,gG5vAyAkN8VDadO9,67.68214,2020-01-01 00:16:33
3,LKlVDmdbE4Vzvq1N,65.52507,2020-01-01 00:23:37
4,VpAaQaEpX0MQe9qv,99.46138,2020-01-01 00:41:44


In [6]:
# Anonymize the recharges data, then read 'recharges.csv' back from the outputs folder and preview it.
# NOTE(review): presumably anonymize_recharges() is what writes that file — confirm.
anonymizer.anonymize_recharges()

anonymized_recharges = pd.read_csv(outputs_path / 'outputs' / 'recharges.csv')
anonymized_recharges.head()

Unnamed: 0,caller_id,amount,timestamp
0,EBj0nDOBBPJzK2OR,96.0,2020-01-01 00:02:47
1,1Z7rQbwXr45ybpGg,73.0,2020-01-01 00:04:33
2,BkM4yRlYw11DRXve,98.0,2020-01-01 00:08:36
3,b7jV6QK6PZqD1YKw,7.0,2020-01-01 00:14:37
4,AqnGeQjRBqvQXWRj,76.0,2020-01-01 00:24:36


In [7]:
# Anonymize the labels data, then read 'labels.csv' back from the outputs folder and preview it.
# NOTE(review): presumably anonymize_labels() is what writes that file — confirm.
anonymizer.anonymize_labels()

anonymized_labels = pd.read_csv(outputs_path / 'outputs' / 'labels.csv')
anonymized_labels.head()

Unnamed: 0,name,label,weight
0,1Z7rQb4n838zbpGg,22023,3.711266
1,G5vAyA473wvQadO9,20709,69.385541
2,en2JQGr0ox8D0pYA,18707,67.035003
3,jNa6Q45Advlyxw83,20152,43.047178
4,G5vAyA4vo96QadO9,22256,76.778898


In [8]:
# Load each input file directly from the paths configured under path.input_data.file_paths in the datastore config.
# NOTE(review): none of these frames is used in the cells visible here — confirm whether this cell is still needed.
raw_cdr_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.cdr)
raw_recharges_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.recharges)
raw_mobiledata_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.mobiledata)
raw_mobilemoney_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.mobilemoney)
raw_labels_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.labels)

## Anonymize features

We can also anonymize featurized data. This is not necessary (nor will it work) if the features are computed using already-anonymized data.

In [9]:
# Anonymize the featurized data, then read 'features.csv' back from the outputs folder and preview it.
# NOTE(review): presumably anonymize_features() is what writes that file — confirm.
anonymizer.anonymize_features()
anonymized_features = pd.read_csv(outputs_path / 'outputs' / 'features.csv')
anonymized_features.head()

                                                                                

Unnamed: 0,name,active_days_allweek_allday,active_days_allweek_day,active_days_allweek_night,active_days_weekday_allday,active_days_weekday_day,active_days_weekday_night,active_days_weekend_allday,active_days_weekend_day,active_days_weekend_night,...,mobilemoney_outgoing_p2p_amount_min,mobilemoney_outgoing_p2p_amount_max,mobilemoney_outgoing_p2p_balance_before_mean,mobilemoney_outgoing_p2p_balance_before_min,mobilemoney_outgoing_p2p_balance_before_max,mobilemoney_outgoing_p2p_balance_after_mean,mobilemoney_outgoing_p2p_balance_after_min,mobilemoney_outgoing_p2p_balance_after_max,mobilemoney_outgoing_p2p_txns,mobilemoney_outgoing_p2p_contacts
0,x1Z7rQbM5O8DbpGg,54,46,46,38,33,32,16,13,14,...,30.222473,70.80371,206.221286,180.20338,239.67809,158.092635,109.39968,185.19756,6.0,6.0
1,7bLXgD9l64mzAanW,55,45,48,39,34,32,16,11,16,...,36.72406,65.197014,174.832893,94.9552,260.03677,125.46837,58.23114,209.75879,4.0,4.0
2,MLxA7zdWjO1ybVY1,52,43,42,36,30,29,16,13,13,...,25.40323,69.63084,197.661448,143.49661,251.89214,151.198061,101.53663,226.4889,9.0,9.0
3,b7MgqDw5jkXyjo6W,56,46,42,40,33,28,16,13,14,...,35.615395,63.297916,205.654711,158.98051,267.4122,152.560454,101.69055,214.87619,7.0,7.0
4,AqnGeQjNrYBQXWRj,53,42,42,38,32,28,15,10,14,...,43.85584,65.40527,197.622427,157.64932,236.83188,147.132861,111.95184,192.97603,5.0,5.0


## Use a custom format checker

**THIS STEP DOES NOT WORK! The checker assumes phone numbers complying with the format options described below, but Cider has a bug: in feature tables, it interprets phone numbers as integers, dropping leading +'s and 0's. Once that bug is fixed, it's necessary to re-generate synthetic data, run the featurization notebook on those updated labels, and copy its output to synthetic_datasets/features.csv, so that this step, and the ML notebook, both work.**

You may want to check the format of numbers as you anonymize them. Because the anonymizer uses a hash function, similar numbers will *not* result in similar anonymized strings. For example, the numbers 1234567, 01234567, and 880 1234567 will all result in completely different anonymized strings.

The Anonymizer object accepts a format checker when it's constructed. The checker will be evaluated on inputs as strings, and should return `True` if the format is acceptable and `False` if not.

Anonymization will fail if any number fails this check. So we encourage you to clean your data prior to passing it through this step. 

This demo is based on a simplified version of Bangladesh's [mobile phone number format](https://en.wikipedia.org/wiki/Telephone_numbers_in_Bangladesh). We're assuming the following relatively-simple format options, with nothing else permitted:

- From within Bangladesh: `0 <3-digit operator prefix> <subscriber number>`
- From outside Bangladesh: `+880 <3-digit operator prefix> <subscriber number>`

And where the operator prefix must start with the string '01'. 

There are some considerations not taken into account by this small example:

- It's important to be consistent here. For example, if a number is represented differently based on where the call is dialed from, it's necessary to normalize it: perhaps by stripping the country code and also the domestic prefix (here '0'), and by stripping the operator prefix. The anonymizer does not intelligently map two representations of the same number to the same string.
- Cider expects information about whether calls/texts are domestic or international to be provided. So it's likely users will already have checked number prefixes, and perhaps removed them.

In [10]:
def format_checker(raw):
    """Return True if `raw` matches the simplified phone number format of this demo.

    Accepted forms (nothing else is permitted):

    - domestic:      '0'    + 3-digit operator prefix + subscriber number
    - international: '+880' + 3-digit operator prefix + subscriber number

    The operator prefix must start with '01', and everything after the
    leading '0' / '+880' must consist of ASCII digits only.

    Parameters
    ----------
    raw : str
        The identifier to check, as a string.

    Returns
    -------
    bool
        True if the format is acceptable, False otherwise.
    """
    if raw.startswith('+'):
        # Drop exactly one leading '+'. str.strip('+') would also remove repeated
        # and trailing '+' characters, so '++880...' or '+880...+' would be accepted.
        without_plus = raw[1:]
        if not without_plus.startswith('880'):
            return False
        national_part = without_plus[3:]
    else:
        if not raw.startswith('0'):
            return False
        national_part = raw[1:]

    # Operator prefix (3 digits, starting with '01') followed by at least one
    # subscriber digit; reject any non-digit characters.
    return (
        national_part.startswith('01')
        and len(national_part) > 3
        and national_part.isascii()
        and national_part.isdigit()
    )

# Second anonymizer on the same datastore, this time with the custom format checker attached.
anonymizer_with_check = Anonymizer(datastore=datastore, format_checker=format_checker)

Loading CDR...
Loading recharges...
SUCCESS!
Loading mobile data...
Loading mobile money...


In [11]:
# With the current demo data this call raises a ValueError (see the traceback below):
# identifiers such as 1728786 are rejected by the format checker.
anonymizer_with_check.anonymize_cdr()

23/05/02 11:59:18 ERROR Executor: Exception in task 1.0 in stage 48.0 (TID 44)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 142, in <lambda>
    lambda raw: Anonymizer._check_identifier_format_and_hash(raw, encoder, format_checker), StringType()
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 197, in _check_identifier_format_and_hash
    raise ValueError(f'Bad input to anonymization: {raw_string} rejected by provided format format_checker.')
ValueError: Bad input to anonymization: 1728786 rejected by provided format format_checker.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePy

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 142, in <lambda>
    lambda raw: Anonymizer._check_identifier_format_and_hash(raw, encoder, format_checker), StringType()
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 197, in _check_identifier_format_and_hash
    raise ValueError(f'Bad input to anonymization: {raw_string} rejected by provided format format_checker.')
ValueError: Bad input to anonymization: 1728786 rejected by provided format format_checker.
