# DATA SYNTHESIS DHM

This Jupyter Notebook details the step-by-step process of synthesizing a dataset of Dutch Homicides using the Synthetic Data Vault.

In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import sdv
import graphviz
import glob
import os
import matplotlib.pyplot as plt

## 1. Upload Data

In [None]:
# The original dataset is loaded from a CSV file here (later cells refer to it as `case`) - code redacted for privacy reasons

## 2. Inspect Data


In [None]:
# Preview the first three rows of the case table.
case.head(3)

## 3. Clean Data

Takes the data and preprocesses it into the correct input format for the synthetic-data-generating model.

Includes:

- splitting individuals into victims and perpetrators
- selecting necessary columns 
- setting missing values
- setting correct data types
- encoding categories to correct (higher-order) categories
- ?

In [None]:
    # Mapping NRVIC (presumably the number of victims per case) to categories:
    #   1 -> 'Single', 2 or more -> 'Multiple', anything else -> NaN.
    #
    # The result is built as an object column holding real NaN values.
    # Mixing string labels with np.nan inside np.select would either turn the
    # missing marker into the literal string 'nan' (and unmatched rows into
    # '0', the np.select default) or fail to find a common dtype, so the later
    # "replace NaN by 'Missing'" step would never see these rows.
    nrvic = pd.to_numeric(case['NRVIC'], errors='coerce')
    nrvic_cat = pd.Series(np.nan, index=case.index, dtype=object)
    nrvic_cat[nrvic == 1] = 'Single'
    nrvic_cat[nrvic >= 2] = 'Multiple'
    case['NRVIC_cat'] = nrvic_cat

In [None]:
# Inclusive (lower, upper) age ranges per category label.
# Victims: ages 100-149 fall in no range and therefore end up as missing;
# the 'Missing' range catches sentinel codes that were not already listed
# in missing_values.
_VICTIM_AGE_CATEGORIES = {'<=6':(0,6),'7to17':(7,17),'18to24':(18,24),'25to39':(25,39),
                          '40to64':(40,64),'>=65':(65,99), 'Missing':(150,999)}
_PERPETRATOR_AGE_CATEGORIES = {'<=6':(0,6),'7to17':(7,17),'18to24':(18,24),'25to39':(25,39),
                               '40to64':(40,64),'>=65':(65,150)}


def _bin_ages(ages, age_categories):
    """
    Maps every age to the label of the inclusive range that contains it.

    ages: column of ages (converted to float before binning)
    age_categories: dictionary of label -> (lower, upper), both bounds inclusive
    returns: column of labels; NaN where the age is missing or lies in no range
             (e.g. a non-integer age such as 6.5 that falls between two ranges)
    """
    ages = ages.astype(float)
    # Resolve each distinct age once instead of once per row.
    lookup = {}
    for age in ages.dropna().unique():
        for category, (lower, upper) in age_categories.items():
            if lower <= age <= upper:
                lookup[age] = category
    return ages.map(lookup)


def preprocess_case(case, req_cols, missing_values, recoding):
    """
    Performs required preprocessing on the case table.

    The input table is not modified; all work happens on a copy.

    case: table of cases; must contain the columns 'AGE_vic' and 'AGE_perp',
          every column named in recoding and every column in req_cols
    req_cols: list of columns to keep in the case table
    missing_values: list of potential missing values
    recoding: dictionary denoting recoding of categories for different columns
              (column name -> {original label: new label}); labels without an
              entry become missing
    returns: preprocessed case table containing only req_cols, with every
             value stored as a string and missing values written as 'Missing'
    """
    # Work on a copy: the recoding below assigns columns, which would otherwise
    # alter the caller's table and make a second run map already-recoded labels.
    case = case.copy()

    #recode categoricals
    for column, mapping in recoding.items():
        case[column] = case[column].map(mapping)

    #set missing values as NaN
    case = case.replace(missing_values, np.nan)

    #mapping ages to categories
    case['AGE_vic'] = _bin_ages(case['AGE_vic'], _VICTIM_AGE_CATEGORIES)
    case['AGE_perp'] = _bin_ages(case['AGE_perp'], _PERPETRATOR_AGE_CATEGORIES)

    #selecting necessary columns
    case = case[req_cols]

    #set correct data types
    case = case.replace(np.nan, 'Missing')
    case = case.astype(str)

    return case
        
    

In [None]:
# Columns handed to preprocess_case as the set to keep in the cleaned table.
req_cols = [
    'CITY',
    'TIME',
    'NRVIC_cat',
    'NRPERP',
    'CRIMESCENE',
    'MODUS',
    'TYPEHOM',
    'RELAT',
    'PROCESS',
    'TYPE',
    'PRINCIPAL',
    'GENDER_vic',
    'GENDER_perp',
    'AGE_vic',
    'AGE_perp',
    'BIRTHCOUNTRY_vic',
    'BIRTHCOUNTRY_perp',
]

# Raw labels and sentinel codes that preprocess_case turns into NaN.
missing_values = [
    ' ',
    'Unknown',
    "unknown",
    999,
    -999,
    '999',
    '-999',
    'Unknown Perpetrator',
    'Perpetrator Unknown',
    'Perpetrator unknown',
    "Unknown, but under 15 years",
    "Unknown, but over 15 years",
]

# Left empty; not referenced by the other cells visible in this notebook.
numerical_columns = []
# Recoding tables, one per column: {raw label: (higher-order) category}.
# preprocess_case applies each table with Series.map, so the keys must match
# the raw labels exactly and any label without an entry becomes missing.
recoding = {
    'CRIMESCENE': {
        'Private home of the victim': 'Private home',
        'Private home of the perpetrator': 'Private home',
        'Private home of the victim and perpetrator': 'Private home',
        'Private home, resident unknown': 'Private home',
        'Private home of other person (not vict. or perp.)': 'Private home',
        'Shop, restaurant or other place of entertainment and amusement': 'Shop, restaurant or other place of entertainment and amusement',
        'Other': 'Other',
        'Institution, Dormitory': 'Other',
        # NOTE(review): 'model' may be a typo for 'motel' - confirm against the raw labels
        'Hotel or model': 'Other',
        'Workplace': 'Other',
        'Street, road, public transportation or other public place': 'Street, road, public transportation or other public place',
        'Inside private vehicle': 'Other',
        'Park, forest or recreational area': 'Park, forest or recreational area',
    },

    'RELAT': {
        'The perpetrator and victim are slightly known to each other (not friends)': 'The perpetrator and victim are slightly known to each other (not friends)',
        'Perpetrator and victim do not know each other': 'Perpetrator and victim do not know each other',
        'Wife': 'Partner',
        'Girlfriend': 'Partner',
        'Friend or longtime acquaintance': 'Friend or longtime acquaintance',
        'Child': '(Step)Child',
        'Ex-girlfriend': 'Ex-Partner(Wife, Husband, Girlfriend, Boyfriend)',
        'Mother': '(Step)Parent',
        'Housemate or flatmate (previous or present)': 'Other',
        'Other relative': 'Other relative',
        'Neighbour': 'Other',
        'Husband': 'Partner',
        'Sibling': 'Other relative',
        'Ex-wife': 'Ex-Partner(Wife, Husband, Girlfriend, Boyfriend)',
        'Boyfriend': 'Partner',
        'Father': '(Step)Parent',
        'Co-worker (previous or present)': 'Other',
        'New acquitance (met in the last 24 hours)': 'The perpetrator and victim are slightly known to each other (not friends)',
        'Ex-boyfriend': 'Ex-Partner(Wife, Husband, Girlfriend, Boyfriend)',
        'Stepchild': '(Step)Child',
        'Stepfather': '(Step)Parent',
        'Patient (previous or present)': 'Other',
        'Prostitute (previous or present)': 'Other',
        'Grandparent or great grandparent': 'Other relative',
        'Purchaser of sexual services (previous or present)': 'Other',
        'Classmate (previous or present)': 'Other',
        # NOTE(review): the two partner/ex-partner labels below are mapped to
        # 'Other relative' rather than a partner category - confirm this is intended
        'Partner or ex-partner (marital or engagement status unknown)': 'Other relative',
        'Partner or ex-partner of the same sex; males (marital or engagement status unknown)': 'Other relative',
        'Mistaken identity': 'Other',
        'Ex-husband': 'Ex-Partner(Wife, Husband, Girlfriend, Boyfriend)',
        'Therapist (previous or present)': 'Other',
    },

    'TYPEHOM': {
        'Criminal milieu': 'Criminal milieu',
        'Partner killing': 'Partner killing',
        'Other in non-criminal milieu': 'Other in non-criminal milieu',
        'Other familial killing': 'Other familial killing',
        'Robbery killing: private home': 'Robbery killing',
        'Robbery killing: street robbery': 'Robbery killing',
        'Robbery killing: commercial business': 'Robbery killing',
        'Killing by mentally disturbed person (non-family)': 'Killing by mentally disturbed person (non-family)',
        'Other': 'Other',
        'Child killing (in family)': 'Child killing (family and non-family)',
        'Nightlife violence': 'Nightlife violence',
        'Infanticide': 'Child killing (family and non-family)',
        'Sexual': 'Other',
        'Killing by children (non-family)': 'Child killing (family and non-family)',
        'Child killing by adult (non-family)': 'Child killing (family and non-family)',
    },

    'MODUS': {
        'Firearm': 'Firearm',
        'Knife or sharp object/weapon': 'Knife or other sharp object/weapon',
        'Hanging/Strangulation/Suffocation': 'Hanging/Strangulation/Suffocation',
        'Hitting, kicking or other similar physical violence without weapon,': 'Hitting, kicking or other similar physical violence without weapon,',
        'Blunt object': 'Blunt object',
        'Motor vehicle': 'Motor vehicle',
        'Smoke or fire': 'Smoke or fire',
        'Poisoning': 'Other',
        'Other': 'Other',
        'Push or shove': 'Hitting, kicking or other similar physical violence without weapon,',
        'Axe': 'Knife or other sharp object/weapon',
        'Drowning': 'Other',
        'Bomb or explosive': 'Other',
        'Exposure to corrosive or hot substances': 'Other',
    },

    'PROCESS': {
        'Homicide remains unsolved': 'Unsolved',
        'Suspect is identified but not arrested': 'Solved',
        'Suspect has been arrested': 'Solved',
        'Suspect has been prosecuted': 'Solved',
        'Suspect has been sentenced': 'Solved',
        'Suspect has been sanctioned': 'Solved',
        'Suspect is deceased': 'Solved',
    },

    # '1' -> 'Single', '2' .. '20' -> 'Multiple'.
    # NOTE(review): the keys are strings, so this assumes the raw NRPERP column
    # holds strings; numeric values would not match any key - confirm.
    'NRPERP': {
        '1': 'Single',
        **{str(count): 'Multiple' for count in range(2, 21)},
        'Missing': 'Missing',
    },
# recoding of Dutch cities into urban/rural based on CBS categorization - redacted for privacy reasons
# recoding of birth countries of victims and perpetrators into Netherlands, other European, other non-European - redacted for privacy reasons
}
# Clean the raw case table; from here on `case` refers to the cleaned table.
case = preprocess_case(
    case=case,
    req_cols=req_cols,
    missing_values=missing_values,
    recoding=recoding,
)

## 6. Create Meta-Data for Real Data

### 6.1 Meta-Data for public version

In [None]:
from sdv.metadata import SingleTableMetadata

# Start from empty metadata and let SDV detect the column types
# from the cleaned case table.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(case)
# Draw the detected metadata (summarized table details) and write the
# picture to my_metadata.png.
metadata.visualize(
    show_table_details='summarized',
    output_filepath='my_metadata.png'
)


In [None]:
### Correcting meta-data


# Override the auto-detected type: CITY is set to the 'categorical' sdtype.
metadata.update_columns(
    column_names=['CITY'],
    sdtype='categorical')

In [None]:
# Show the metadata again, now including the manual correction.
print('Corrected meta data:\n')
metadata.visualize()

In [None]:
# Store the corrected metadata in s_metadata.json.
metadata.save_to_json('s_metadata.json')

## 7. Synthesis

### 7.1 Synthesis for public version

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

# Gaussian copula synthesizer configured with the corrected metadata.
synthesizer = GaussianCopulaSynthesizer(metadata,
                            locales=['nl_NL'])


# --- Constraint 1: victim age category vs. type of homicide ----------------
# The constraint logic itself lives in constraints_age.py.

# relevant column names for the constraint
# note that ordering matters; age should be first and type second, else we have to modify the constraint file
constraint_cols = ["AGE_vic","TYPEHOM"]

# filepath of the constraint file
constraint_filepath = "constraints_age.py"

synthesizer.load_custom_constraint_classes(
    filepath=constraint_filepath, class_names=["kindermoordConstraintClass"]
)

# constraint definition for the table
kindermoordConstraint = {
    "constraint_class": "kindermoordConstraintClass",
    "constraint_parameters": {
        "column_names": constraint_cols,
        # 'extra_parameter' simply stores the relevant categories for use inside the constraint;
        # this way we don't have to hardcode categories inside the constraint file.
        # The labels must be spelled exactly as they appear in the cleaned data.
        "extra_parameter": {"cat1": ["<=6",'7to17','Missing'], "cat2": ["Child killing (family and non-family)"]},
    },
}

# --- Constraint 2: victim-perpetrator relation vs. type of homicide ---------
# The constraint logic itself lives in constraints_iph.py.
constraint_iph_cols = ["RELAT","TYPEHOM"]

# filepath of the constraint file
constraint_filepath = "constraints_iph.py"

synthesizer.load_custom_constraint_classes(
    filepath=constraint_filepath, class_names=["IPHConstraintClass"]
)

# constraint definition for the table
IPHConstraint = {
    "constraint_class": "IPHConstraintClass",
    "constraint_parameters": {
        "column_names": constraint_iph_cols,
        # 'extra_parameter' simply stores the relevant categories for use inside the constraint;
        # this way we don't have to hardcode categories inside the constraint file.
        # 'Partner killing' is written exactly as the recoded TYPEHOM label
        # (lower-case 'k'); a differently-cased label would not equal any value in the data.
        "extra_parameter": {"cat1": ['Partner','Ex-Partner(Wife, Husband, Girlfriend, Boyfriend)'], "cat2": ['Partner killing']},
    },
}
synthesizer.add_constraints([IPHConstraint, kindermoordConstraint])

# with both constraints registered, fit the model on the cleaned data and
# sample synthetic rows that respect them
synthesizer.fit(case)
# NOTE(review): 1364 is presumably the number of rows in the real data - confirm
synthetic = synthesizer.sample(num_rows=1364)

## 8. Inspect Synthetic Data

### 8.1 Synthetic Data for public use

In [None]:
# Preview the first five synthetic rows.
synthetic.head(5)

## 9. Evaluation of Utility of Synthetic Data

### 9.1 Utility of synthetic data for public use

In [None]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot

# Both reports compare the cleaned real table with the synthetic table,
# using the same metadata the synthesizer was built with.

# 1. perform basic validity checks
diagnostic = run_diagnostic(case, synthetic, metadata)

# 2. measure the statistical similarity
quality_report = evaluate_quality(case, synthetic, metadata)



In [None]:
# Detailed results behind the 'Column Shapes' property of the quality report.
quality_report.get_details(property_name='Column Shapes')

#### 9.1.1 Univariate comparisons

In [None]:
# 3. plot the data
# Real versus synthetic data for a single column (here NRVIC_cat).
fig = get_column_plot(
    real_data=case,
    synthetic_data=synthetic,
    metadata=metadata,
    column_name='NRVIC_cat'
)
    
fig.show()

# This analysis has been repeated for all variables; only one example is shown here.

## 10. Evaluation of Privacy

### 10.1 Privacy of synthetic data for public use

In [None]:
from sdmetrics.single_table import CategoricalKNN,CategoricalRF
# Silence UserWarnings, which have no impact on the output.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Privacy metric: key_fields are the columns treated as known,
# sensitive_fields the columns to be guessed from them.
# NOTE(review): per the SDMetrics docs a higher score should mean better privacy - confirm.
CategoricalKNN.compute(
    real_data=case,
    synthetic_data=synthetic,
    key_fields=['GENDER_vic','CRIMESCENE'],
    sensitive_fields=['RELAT']
)

# This metric has been repeated for all variable combinations; only one example is shown here.

## 11. Export synthetic data

In [None]:
# The synthetic data is exported to a CSV file here - code redacted for privacy reasons