# Anonymizer

In [24]:
from cider.datastore import DataStore
from cider.anonymizer import Anonymizer
from pandas.api.types import is_numeric_dtype

import os
import sys

import pandas as pd

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

Set up the configuration file and load some simulated data, including featurization results if present, using the datastore.

In [2]:
# This path should point to your cider installation, where configs and data for this demo are located.
from pathlib import Path
cider_installation_directory = Path('../../cider')

datastore = DataStore(config_file_path_string= cider_installation_directory / 'configs' / 'colab_walkthrough' / 'config_anonymize.yml')
anonymizer = Anonymizer(datastore=datastore)

outputs_path = anonymizer.outputs_path

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/17 11:56:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/17 11:56:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

Loading CDR...


                                                                                

Loading recharges...
SUCCESS!
Loading mobile data...
Loading mobile money...


## Anonymize input data

We can anonymize the five main categories of input data which contain phone numbers: 

- CDR (calls and texts)
- Mobile money transactions
- Mobile data transactions
- Recharges
- Labels (e.g. from a ground-truth survey)

It's important to anonymize these using the same anonymization salt, to ensure that all of
a given subscriber's data is indexed by the same obfuscated string.

In [3]:
anonymizer.anonymize_cdr()

anonymized_cdr = pd.read_csv(outputs_path / 'outputs' / 'cdr.csv')
# anonymized_cdr.head()

                                                                                

In [4]:
anonymizer.anonymize_mobilemoney()

anonymized_mobilemoney = pd.read_csv(outputs_path / 'outputs' / 'mobilemoney.csv')
# anonymized_mobilemoney.head()

                                                                                

In [5]:
anonymizer.anonymize_mobiledata()

anonymized_mobiledata = pd.read_csv(outputs_path / 'outputs' / 'mobiledata.csv')
# anonymized_mobiledata.head()

In [6]:
anonymizer.anonymize_recharges()

anonymized_recharges = pd.read_csv(outputs_path / 'outputs' / 'recharges.csv')
# anonymized_recharges.head()

In [7]:
anonymizer.anonymize_labels()

anonymized_labels = pd.read_csv(outputs_path / 'outputs' / 'labels.csv')
# anonymized_labels.head()

In [56]:
raw_cdr_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.cdr)
raw_recharges_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.recharges)
raw_mobiledata_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.mobiledata)
raw_mobilemoney_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.mobilemoney)
raw_labels_file = pd.read_csv(datastore.cfg.path.input_data.file_paths.labels)

## Anonymize features

We can also anonymize featurized data. This is not necessary (nor will it work) if the features are computed using already-anonymized data.

In [16]:
anonymizer.anonymize_features()
anonymized_features = pd.read_csv(outputs_path / 'outputs' / 'features.csv')
anonymized_features.name_anonymized.head()

                                                                                

0    x1Z7rQbM5O8DbpGg
1    x1Z7rQbM5O8DbpGg
2    7bLXgD9l64mzAanW
3    7bLXgD9l64mzAanW
4    MLxA7zdWjO1ybVY1
Name: name_anonymized, dtype: object

## Use a custom format checker

You may want to check the format of numbers as you anonymize them. Because the anonymizer uses a hash function, similar numbers will *not* result in similar anonymized strings. For example, the numbers 1234567, 01234567, and 880 1234567 will all result in completely different anonymized strings.

The Anonymizer object accepts a format checker when it's constructed. The checker will be evaluated on inputs as strings, and should return `True` if the format is acceptable and `False` if not.

Anonymization will fail if any number fails this check. So we encourage you to clean your data prior to passing it through this step. 

In [20]:
# A silly example: Some numbers will start with 1, so we'll see validation errors here.
def format_checker(raw):
    if raw.startswith('1'):
        return False
    
    return True

anonymizer_with_check = Anonymizer(datastore=datastore, format_checker=format_checker)

running
Loading CDR...
Loading recharges...
SUCCESS!
Loading mobile data...
Loading mobile money...


In [21]:
anonymizer_with_check.anonymize_cdr()

23/04/12 10:54:23 ERROR Executor: Exception in task 0.0 in stage 106.0 (TID 106)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 132, in <lambda>
    new_column = udf(
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 180, in _check_identifier_format_and_hash
    raise ValueError(f'Bad input to anonymization: {raw_string} rejected by provided format format_checker.')
ValueError: Bad input to anonymization: 1275856 rejected by provided format format_checker.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.In

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 132, in <lambda>
    new_column = udf(
  File "/Users/leo/Documents/gpl/cider/cider/anonymizer.py", line 180, in _check_identifier_format_and_hash
    raise ValueError(f'Bad input to anonymization: {raw_string} rejected by provided format format_checker.')
ValueError: Bad input to anonymization: 1275856 rejected by provided format format_checker.
