## Sanction Screening Tool

## Index

* [Imports](#Imports)
* [Auxiliar Functions](#Auxiliar-Functions)
* [Variation Categories](#Variation-Categories)

## Imports

In [262]:
import pandas as pd
import numpy as np
import string
from itertools import permutations
import random

[Back to index](#Index)

## Auxiliar Functions

In [279]:
def get_vowel_index(name):
    vowels = ["a","e","i","o","u","A","E","I","O","U"]
    return [i for i,j in enumerate(name) if j in vowels]

def get_vowel_lowercase_index(name):
    vowels = ["a","e","i","o","u"]
    return [i for i,j in enumerate(name) if j in vowels]

def get_vowel_uppercase_index(name):
    vowels = ["A","E","I","O","U"]
    return [i for i,j in enumerate(name) if j in vowels]

def get_consonant_lowercase_index(name):
    consonants = [x for x in string.ascii_lowercase if x not in "aeiou"]
    return [i for i,j in enumerate(name) if j in consonants]

def swap_letter(c, i, j):
    c = list(c)
    c[i], c[j] = c[j], c[i]
    return ''.join(c)

def swap_element(c, i, j):
    c = list(c)
    c[i], c[j] = c[j], c[i]
    return ' '.join(c)

[Back to index](#Index)

## Variation Categories
* [Initials](#Initials)
* [Character Extension](#Character-Extension)
* [Character Replacement](#Character-Replacement)
* [Character Reduction](#Character-Reduction)
* [Transposition](#Transposition)
* [Double Characters](#Double-Characters)
* [Spaces](#Spaces)
* [Name Order](#Name-Order)
* [Titles](#Titles)
* [Missing Name Component](#Missing-Name-Component)
* [Also Known As](#Also-Known-As)

### Initials

* firstname_initials_single: BUITRAGO DE HERRERA, Luz Mery ->  L. Mery Buitrago de Herrera
* firstname_initials: BUITRAGO DE HERRERA, Luz Mery ->  L. M. Buitrago de Herrera
* initials_lastname: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitrago de H.
* lastname_initials_clear_first_name: BUITRAGO DE HERRERA, Luz Mery -> B. de Herrera

In [3]:
def firstname_initials_single(name):
    '''Returns first element as initial if more than one element in name'''
    name_elements = name.split(" ")
    if len(name_elements)>=2:
        output = name_elements[0][0] + ". "
        return output + " ".join(name_elements[1:])
    else:
        return name
    
def firstname_initials(name):
    '''Returns first and second element as initials if more than two elements in name,
        otherwise first element as initial '''
    name_elements = name.split(" ")
    if len(name_elements)>=3:
        output = name_elements[0][0] + "." + name_elements[1][0] + ". "
        return output + " ".join(name_elements[2:])
    else:
        return firstname_initials_single(name)

def initials_lastname(name):
    '''Returns last element as initial if more than one element in name'''
    name_elements = name.split(" ")
    if len(name_elements)>=2:
        output = name_elements[-1][0] + "."
        return " ".join(name_elements[:-1]) + " " + output
    else:
        return firstname_initials_single(name)

def lastname_initials_clear_first_name(name):
    '''xxx'''
    name_elements = name.split(" ")
    if len(name_elements)>=2:
        output = name_elements[-1][0] + "."
        return " ".join(name_elements[1:-1]) + " " + output
    else:
        return firstname_initials_single(name)

lastname_initials_clear_first_name

In [4]:
name = "Luz Mery Buitrago de Herrera"
print("firstname_initials_single: " + firstname_initials_single(name))
print("firstname_initials: " + firstname_initials(name))
print("initials_lastname: " + initials_lastname(name))
print("lastname_initials_clear_first_name: " + lastname_initials_clear_first_name(name))

firstname_initials_single: L. Mery Buitrago de Herrera
firstname_initials: L.M. Buitrago de Herrera
initials_lastname: Luz Mery Buitrago de H.
lastname_initials_clear_first_name: Mery Buitrago de H.


[Back to Variation Categories](#Variation-Categories)

### Character Extension

* add_spc_in_letters: BUITRAGO DE HERRERA, Luz Mery -> Lu z Mery Buitrago de Herrera
* add_prefix:
    * add_al-: QADHAFI, Muammar  -> Muammar al-Qadhafi
    * add_Al_and_space: QADHAFI, Muammar  -> Muammar Al Qadhafi
    * add_El_and_space: QADHAFI, Muammar  -> Muammar El Qadhafi
* add_vowel: QADHAFI, Muammar  -> Mouammar El Qadhafi
* add_h_before_vowel: BUITRAGO DE HERRERA, Luz Mery -> Luz Mhery Buitrago de Herreras
* add_consonant:
    * add_consonant_final_s: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitrago de Herreras
    * add_consonant_final_r: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitragor de Herrera
    * add_consonant_final_z: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitrago de Herreraz
    * add_consonant_uncommon: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitragozz de Herrera
    * add_consonant_double: BUITRAGO DE HERRERA, Luz Mery -> Luz Mery Buitragopz de Herrera
* add_special_character:
    * add_special_character_apostrophe: QADHAFI, Muammar  -> Mu'Ammar El Qadhafi
    * add_special_character_accent: LINARES REYES, Ricardo Jose -> Ricardo José Linares Reyes
    * add_special_character_uncommon: LINARES REYES, Ricardo Jose -> Ricardo Jos! Linares Reyes

In [5]:
def add_spc_in_letters(name):
    '''Randomly adds a space in the middle of name element'''
    name_elements = name.split(" ")
    cne = np.random.choice(name_elements) # chosen name element
    cli = np.random.choice([x+1 for x in range(len(cne)-1)]) # chosen letter index
    return name.replace(cne,cne[:cli] + " " + cne[cli:])

def add_al_and_hyphen(name):
    '''Adds 'al-'' to the last element of a name'''
    name_elements = name.split(" ")
    return " ".join(name_elements[:-1]) + " al-" + name_elements[-1]

def add_Al_and_space(name):
    '''Adds 'Al-'' to the last element of a name'''
    name_elements = name.split(" ")
    return " ".join(name_elements[:-1]) + " Al-" + name_elements[-1]

def add_El_and_space(name):
    '''Adds 'El-'' to the last element of a name'''
    name_elements = name.split(" ")
    return " ".join(name_elements[:-1]) + " El-" + name_elements[-1]

def add_vowel(name):
    '''Adds a lowecase vowel to a randomly chosen element of the name'''
    vowels = ["a","e","i","o","u"]
    name_elements = name.split(" ")
    cv = np.random.choice(vowels) # chosen vowel
    cne = np.random.choice(name_elements) # chosen name element
    cli = np.random.choice([x+1 for x in range(len(cne)-1)]) # chosen letter index
    return name.replace(cne,cne[:cli] + cv + cne[cli:])

def add_h_before_vowel(name):
    '''Adds an h before a vowel to a randomly chosen element of the name'''
    vlcindx = get_vowel_lowercase_index(name) # vowels lower case index
    rcvi    = np.random.choice(vlcindx)# randomly chosen vowel index
    return name[:rcvi] + "h" +  name[rcvi:]

def add_consonant_final_s(name):
    '''Adds an s after a randomly chosen element of the name'''
    name_elements = name.split(" ")
    cne = np.random.choice(name_elements) # chosen name element
    return name.replace(cne,cne + "s")

def add_consonant_final_r(name):
    '''Adds an s after a randomly chosen element of the name'''
    name_elements = name.split(" ")
    cne = np.random.choice(name_elements) # chosen name element
    return name.replace(cne,cne + "r")

def add_consonant_final_z(name):
    '''Adds an s after a randomly chosen element of the name'''
    name_elements = name.split(" ")
    cne = np.random.choice(name_elements) # chosen name element
    return name.replace(cne,cne + "z")

def add_consonant_uncommon(name):
    '''Adds a non r,s,z consonant in a randomly chosen element of the name'''
    consonants = [x for x in string.ascii_lowercase if x not in "aeioursz"]
    name_elements = name.split(" ")
    cc = np.random.choice(consonants)
    cne = np.random.choice(name_elements) # chosen name element
    cli = np.random.choice([x+1 for x in range(len(cne)-1)]) # chosen letter index
    return name.replace(cne,cne[:cli] + cc + cne[cli:])

def add_consonant_double(name):
    '''Adds a double consonant in a randomly chosen element of the name'''
    consonants = [x for x in string.ascii_lowercase if x not in "aeiou"]
    cindx      = get_consonant_lowercase_index(name) # consonants index
    ccindx     = np.random.choice(cindx) # chosen consonant index
    cdc         = name[ccindx] #chosen doubled consonant
    return name[:ccindx] + cdc +name[ccindx:]

def add_special_character_apostrophe(name):
    '''Adds an apostrophe in a randomly chosen element of the name'''
    name_elements = name.split(" ")
    cne = np.random.choice(name_elements) # chosen name element
    ci = np.random.choice([x+1 for x in range(len(cne))]) #chosen index
    return name.replace(cne,cne[:ci] + "'" + cne[ci:])

def add_special_character_accent(name):
    '''Adds an accent to a vowel from a randomly chosen element of the name'''
    accents_dict = {"a":"á","e":"é","i":"í","o":"ó","u":"ú"}
    name_elements = name.split(" ")
    vlcindx = get_vowel_lowercase_index(name) # vowels lower case index
    rcvi    = np.random.choice(vlcindx)# randomly chosen vowel index
    return name[:rcvi] +accents_dict[name[rcvi]] + name[rcvi+1:]

def add_special_character_uncommon(name):
    '''Adds a punctuation simbol to a randomly chosen element of the name'''
    simbol = np.random.choice([x for x in string.punctuation if x not in ["'"]])
    ci = np.random.choice(range(len(name))) # chosen index
    return name[:ci] + simbol + name[ci:]

In [6]:
name = "Luz Mery Buitrago de Herrera"
print("add_spc_in_letters: " + add_spc_in_letters(name))
print("add_al_and_hyphen: " + add_al_and_hyphen(name))
print("add_Al_and_space: " + add_Al_and_space(name))
print("add_El_and_space: " + add_El_and_space(name))
print("add_vowel: " + add_vowel(name))
print("add_h_before_vowel: " + add_h_before_vowel(name))
print("add_consonant_final_s: " + add_consonant_final_s(name))
print("add_consonant_final_r: " + add_consonant_final_r(name))
print("add_consonant_final_z: " + add_consonant_final_z(name))
print("add_consonant_uncommon: " + add_consonant_uncommon(name))
print("add_consonant_double: " + add_consonant_double(name))
print("add_special_character_apostrophe: " + add_special_character_apostrophe(name))
print("add_special_character_accent: " + add_special_character_accent(name))

add_spc_in_letters: Lu z Mery Buitrago de Herrera
add_al_and_hyphen: Luz Mery Buitrago de al-Herrera
add_Al_and_space: Luz Mery Buitrago de Al-Herrera
add_El_and_space: Luz Mery Buitrago de El-Herrera
add_vowel: Luz Mery Buitrago die Herrera
add_h_before_vowel: Luz Mery Buitrago de Herrhera
add_consonant_final_s: Luz Mery Buitrago des Herrera
add_consonant_final_r: Luz Meryr Buitrago de Herrera
add_consonant_final_z: Luz Mery Buitragoz de Herrera
add_consonant_uncommon: Luz Meryy Buitrago de Herrera
add_consonant_double: Luz Mery Buitrrago de Herrera
add_special_character_apostrophe: Luz Mery B'uitrago de Herrera
add_special_character_accent: Luz Mery Buitrágo de Herrera


[Back to Variation Categories](#Variation-Categories)

### Character Replacement

* phonetic_replacement: s$\rightarrow$z, z$\rightarrow$s, s$\rightarrow$c, c$\rightarrow$s, c$\rightarrow$z, z$\rightarrow$c, u$\rightarrow$o, o$\rightarrow$u, ua$\rightarrow$o, o$\rightarrow$ua, ph$\rightarrow$f, f$\rightarrow$ph, y$\rightarrow$i ,i$\rightarrow$y, d$\rightarrow$th, t$\rightarrow$d, v$\rightarrow$w, w$\rightarrow$v, k$\rightarrow$q, q$\rightarrow$k, v$\rightarrow$b, b$\rightarrow$v, a$\rightarrow$e, e$\rightarrow$a, x$\rightarrow$j, j$\rightarrow$x, v$\rightarrow$f, f$\rightarrow$v, c$\rightarrow$k, k$\rightarrow$c 

* typographic_error: t$\rightarrow$y, y$\rightarrow$t, o$\rightarrow$p, p$\rightarrow$o, d$\rightarrow$e, e$\rightarrow$d, f$\rightarrow$d, d$\rightarrow$f, r$\rightarrow$f, f$\rightarrow$r, e$\rightarrow$r, r$\rightarrow$e

* alphabet_alternative: n$\rightarrow$ñ, ñ$\rightarrow$n

* character_replacement_uncommon: random character replacement not consider in previous cases

 

In [8]:
phonetic_replacements = [("s","z") , ("z","s") , ("s","c"), ("c","s") , ("c","z"),
                        ("z","c") , ("u","o") , ("o","u"), ("ua","o"), ("o","ua"),
                        ("ph","f"), ("f","ph"), ("y","i"), ("i","y") , ("d","th"), 
                        ("t","d") , ("v","w") , ("w","v"), ("k","q") , ("q","k"), 
                        ("v","b") , ("b","v") , ("a","e"), ("e","a") , ("x","j"), 
                        ("j","x") , ("v","f") , ("f","v"), ("c","k") , ("k","c")]      

typographic_errors = [("t","y"), ("y","t"), ("o","p"), ("p","o"), 
                          ("d","e"), ("e","d"), ("f","d"), ("d","f"),
                          ("r","f"), ("f","r"), ("e","r"), ("r","e"),
                          ("r","t"), ("t","r")]

alphabet_alternatives = [("n","ñ"), ("ñ","n")]

already_used_permutations = phonetic_replacements + typographic_errors + alphabet_alternatives
potential_comb = set(list(permutations(string.ascii_lowercase,2)))
uncommon_replacements = potential_comb.difference(already_used_permutations)

def phonetic_replacement(name):
    '''Selects a letter at random and replace it with a similar phonetic letter'''
        
    # Check if potential replacements, otherwise return original name
    pr = set([x[0] for x in phonetic_replacements]) #potential replacements
    if len([x for x in name if x in pr])==0:
        return name
    
    # Choose a character that it is a candidate phonetic replacement
    rci = np.random.choice([i for i,j in enumerate(name)
                            if j.lower() in pr]) #randomnly chosen index   
    
    # Find potential character replacements
    ptr = [x[1] for x in phonetic_replacements
                       if x[0]==name[rci].lower()] #potential to replace

    # Replace the character to a random phonetic replacement
    if name[rci].islower():
        
        return name[:rci] + np.random.choice(ptr) + name[rci+1:]
    else:
        return name[:rci] + np.random.choice(ptr).upper() + name[rci+1:]

def typographic_error(name):
    '''Selects a letter at random and replace it with a potential typographic error'''
    
    # Check if potential replacements, otherwise return original name
    pr = set([x[0] for x in typographic_errors]) #potential replacements
    if len([x for x in name if x in pr])==0:
        return name
    
    # Choose a character that it is a candidate typographic_error
    rci = np.random.choice([i for i,j in enumerate(name)
                            if j.lower() in pr]) #randomnly chosen index   

    # Find potential character replacements
    ptr = [x[1] for x in typographic_errors
                       if x[0]==name[rci].lower()] #potential to replace

    # Replace the character to a random typographic error
    if name[rci].islower():
        return name[:rci] + np.random.choice(ptr) + name[rci+1:]
    else:
        return name[:rci] + np.random.choice(ptr).upper() + name[rci+1:]
    
def alphabet_alternative(name):
    '''Selects a letter at random and replace it with a potential typographic error'''

    # Check if potential replacements, otherwise return original name
    pr = set([x[0] for x in alphabet_alternatives]) #potential replacements
    if len([x for x in name if x in pr])==0:
        return name
        
    # Choose a character that it is a candidate alphabet_alternative
    rci = np.random.choice([i for i,j in enumerate(name)
                            if j.lower() in pr]) #randomnly chosen index
    
    # Find potential character replacements
    ptr = [x[1] for x in alphabet_alternatives
                       if x[0]==name[rci].lower()] #potential to replace
    
    # Replace the character to a random alphabet alternative
    if name[rci].islower():
        return name[:rci] + np.random.choice(ptr) + name[rci+1:]
    else:
        return name[:rci] + np.random.choice(ptr).upper() + name[rci+1:]
    
def character_replacement_uncommon(name):
        # Check if potential replacements, otherwise return original name
    pr = set([x[0] for x in uncommon_replacements]) #potential replacements
    if len([x for x in name if x in pr])==0:
        return name
        
    # Choose a character that it is a candidate alphabet_alternative
    rci = np.random.choice([i for i,j in enumerate(name)
                            if j.lower() in pr]) #randomnly chosen index
    
    # Find potential character replacements
    ptr = [x[1] for x in uncommon_replacements
                       if x[0]==name[rci].lower()] #potential to replace
    
    # Replace the character to a random uncommon replacements
    if name[rci].islower():
        return name[:rci] + np.random.choice(ptr) + name[rci+1:]
    else:
        return name[:rci] + np.random.choice(ptr).upper() + name[rci+1:]

In [9]:
print("phonetic_replacement: "+ phonetic_replacement(name))
print("typographic_error: "   + typographic_error(name))
print("alphabet_alternative: "+ alphabet_alternative(name))
print("character_replacement_uncommon: "+character_replacement_uncommon(name))

phonetic_replacement: Loz Mery Buitrago de Herrera
typographic_error: Luz Mery Buiyrago de Herrera
alphabet_alternative: Luz Mery Buitrago de Herrera
character_replacement_uncommon: Luz Mery Buitrago de Hsrrera



[Back to Variation Categories](#Variation-Categories)

### Character Reduction

* phonetic_miss:
    * remove_h: FADLALLAH, Shaykh Muhammad Husayn -> Saykh Muhammad Husayn Fadlallah
    * remove_c_before_k: CHINAMASA, Patrick -> Patrik Chinamasa
    * remove_e_before_s: ESPARRAGOZA MORENO, Juan Jose -> Juan Jose Sparragoza Moreno
    * remove_final_s: AL-MUHAMMAD, Khamis Sirhan -> Khami Sirhan al-Muhammad
    * remove_final_z: MENDEZ SALAZAR, John Jairo -> John Jairo Mende Salazar
    * remove_final_d: SIEIRO DE NORIEGA, Felicidad -> Felicidad Sieiro de Noriega
* character_reduction_uncommon

In [10]:
def remove_h(name):
    ''' If name has h remove one at random, otherwise return same name '''
    if "h" in name: 
        rci = np.random.choice([i for i,j in enumerate(name)
                                if j.lower() == "h"]) #randomnly chosen index
        return name[0:rci:] + name[rci+1::]
    else:
        return name
    
def remove_c_before_k(name):
    ''' If c before k, it removes c, otherwise return same name '''
    if "ck" in name:
        return name.replace("ck","k")
    if "Ck" in name:
        return name.replace("Ck","K")
    else:
        return name

def remove_e_before_s(name):
    ''' If c before k, it removes c, otherwise return same name '''
    if "es" in name:
        return name.replace("es","s")
    if "Es" in name:
        return name.replace("Es","S")
    else:
        return name
    
def remove_final_s(name):
    '''Remove "s" if at the end of a name element'''
    name_elements = name.split(" ")
    ees = [x for x in name_elements if x[-1]=="s"] #elements ending in "s"
    if ees:
        etrs = np.random.choice(ees) #element to remove "s"
        return name.replace(etrs,etrs[:-1])
    else:
        return name
    
def remove_final_z(name):
    '''Remove "z" if at the end of a name element'''
    name_elements = name.split(" ")
    ees = [x for x in name_elements if x[-1]=="z"] #elements ending in "z"
    if ees:
        etrs = np.random.choice(ees) #element to remove "z"
        return name.replace(etrs,etrs[:-1])
    else:
        return name
    
def remove_final_d(name):
    '''Remove "d" if at the end of a name element'''
    name_elements = name.split(" ")
    ees = [x for x in name_elements if x[-1]=="d"] #elements ending in "d"
    if ees:
        etrs = np.random.choice(ees) #element to remove "d"
        return name.replace(etrs,etrs[:-1])
    else:
        return name
    
def character_reduction_uncommon(name):
    '''Remove a character at random'''
    ccitr = [i for i,j in enumerate(name) if j!=" "] #candidate character index to remove
    citr  = np.random.choice(ccitr) #character index to remove
    return name[0:citr:] + name[citr+1::]

In [11]:
name = "Mariah Chickenz Espains Chad"
print("remove_h: "           + remove_h(name))
print("remove_c_before_k: "  + remove_c_before_k(name))
print("remove_e_before_s: "  + remove_e_before_s(name))
print("remove_final_s: "     + remove_final_s(name))
print("remove_final_z: "     + remove_final_z(name))
print("remove_final_d: "     + remove_final_d(name))
print("character_reduction_uncommon: "     + character_reduction_uncommon(name))

remove_h: Maria Chickenz Espains Chad
remove_c_before_k: Mariah Chikenz Espains Chad
remove_e_before_s: Mariah Chickenz Spains Chad
remove_final_s: Mariah Chickenz Espain Chad
remove_final_z: Mariah Chicken Espains Chad
remove_final_d: Mariah Chickenz Espains Cha
character_reduction_uncommon: Mariah Chickenz spains Chad



[Back to Variation Categories](#Variation-Categories)

### Transposition

* phonetic_transposition: (ed$\rightarrow$de, de$\rightarrow$ed, ei$\rightarrow$ie, ie$\rightarrow$ei, ae$\rightarrow$ea, ea$\rightarrow$ae)
* transposition_uncommon:

In [280]:
alphabet = list(string.ascii_lowercase)

phonetic_transpositions = [("ed","de"),("de","ed"),
                           ("ei","ie"),("ie","ei"),
                           ("ae","ea"),("ea","ae"),
                           ("th","ht"),("au","ua"),
                           ("re","er"),("er","re"),
                           ("rt","tr"),("is","si"),
                           ("sm","ms"),("se","es"),
                           ("oe","eo"),("or","ro"),
                           ("ro","or"),("sh","hs"),
                           ("hs","sh"),("ch","hc"),
                           ("hc","ch"),("eh","he"),
                           ("he","eh"),("tu","ut"),
                           ("ts","st"),("oa","ao"),]
potential_permutations = set(list(permutations(alphabet,2))) # n* (n-1)
potential_transpositions = set([(x[0]+x[1],x[1]+x[0]) for x in potential_permutations])
uncommon_transpositions = potential_transpositions.difference(phonetic_transpositions)

def phonetic_transposition(name):
    '''Perform common phonetic transpositions'''
    pt = [(i,j) for i,j in [x for x in phonetic_transpositions if x[0] in name]] #potential traspositions
    if pt:
        i = np.random.choice([i for i,j in enumerate(pt)])
        return name.replace(pt[i][0],pt[i][1])
    else:
        return name
    
def transposition_uncommon(name):
    '''Perform uncommon phonetic transpositions'''
    pt = [(i,j) for i,j in uncommon_transpositions
                        if i in name] #potential traspositions
    
    np.random.shuffle(pt) # shuffle potential transpositions 
    random_pt = pt.pop() # select the first one from the shuffled list

    pt_letters = list(random_pt[0])
    
    # Get letter positions
    first_letter = [pos for pos, char in enumerate(name) if char == pt_letters[0]]
    second_letter = [pos for pos, char in enumerate(name) if char == pt_letters[1]]

    let_pos = []
    for i in range(len(first_letter)):
        if first_letter[i]+1 in second_letter:
            let_pos.append(first_letter[i])

    np.random.shuffle(let_pos)
    let_pos = let_pos.pop()
    final_name = swap_letter(name, let_pos, let_pos+1)
    return final_name

In [281]:
name = "Mared Marea Mue"
print("phonetic_transposition: " + phonetic_transposition(name))
print("transposition_uncommon: " + transposition_uncommon(name))

phonetic_transposition: Mared Marae Mue
transposition_uncommon: Mraed Marea Mue


<font color="red"><b>It is currently transposing only vowels. We may want to have two functions. One to transpose vowels and other to transpose consonants.

[Back to Variation Categories](#Variation-Categories)

### Double Characters

* double_character_reduction: MENDEZ SALAZZAR, John Jairo -> John Jair Mendez Salazar
* double_character_insertion: AL-AWADI, Husein Qaid -> Hussein Qaid al-Waadi

In [14]:
def double_character_reduction(name):
    '''Finds double characters and removes one of the two characters'''
    double_characters_reps = [x+x for x in string.ascii_lowercase]
    pdcr = [dcr for dcr in double_characters_reps if dcr in name]
    if pdcr:
        ci = np.random.choice([i for i,j in enumerate(pdcr)])
        return name.replace(pdcr[ci], pdcr[ci][1])
    else:
        return name

def double_character_insertion(name):
    '''Takes any character at random and doubles it'''
    ci = np.random.choice([i for i,j in enumerate(name) if j!= " "])
    return name[:ci] + name[ci] + name[ci:]

In [15]:
name = "Mmaria Parrales Meeting"
print("double_character_reduction: " +  double_character_reduction(name))
print("double_character_insertion: " +  double_character_insertion(name))

double_character_reduction: Mmaria Parales Meeting
double_character_insertion: Mmaria Paarrales Meeting



[Back to Variation Categories](#Variation-Categories)

### Spaces

* space_reduction: MENDEZ SALAZAR, John Jairo -> John Jairo MendezSalazar
* space_insertion: MENDEZ SALAZAR, John Jairo -> John Jairo Salazar Men dez

In [16]:
def spaces_reduction(name):
    '''Deletes a space randomly in the string'''
    if len(name)>1:
        ci = np.random.choice([i for i,j in enumerate(name) if j== " "])
        return name[:ci] + name[ci+1:]
    else:
        return name

def spaces_insertion(name):
    '''Inserts a space randomly in the string'''
    ci = np.random.choice([i for i,j in enumerate(name) if j!= " "])
    return name[:ci] + " " + name[ci:]

In [17]:
name = "Mmaria Parrales Meeting"
print("spaces_reduction: " +  spaces_reduction(name))
print("spaces_insertion: " +  spaces_insertion(name))

spaces_reduction: MmariaParrales Meeting
spaces_insertion: Mmaria  Parrales Meeting



[Back to Variation Categories](#Variation-Categories)

### Name Order

<font color="red"><b>Pending transposition_name</font> 
* transposition_name: MENDEZ SALAZAR, John Jairo -> Salazar John Mendez Jairo 
* random_name_transposition: MENDEZ SALAZAR, John Jairo -> Salazar John Mendez Jairo 

In [329]:
def transposition_name(name):
    '''Transposes two elements of the name at random'''
    name_elements = name.split(" ")
    if len(name_elements)==1:
        return name
    elif len(name_elements)==2:
        return name_elements[1] + " " + name_elements[0]
    else:
        random_element_to_swap = random.randint(0,len(name_elements)-1)
        random_element_to_be_swapped = random.randint(0,len(name_elements)-1)
        index = list(range(0,len(name_elements)))
        if random_element_to_swap != random_element_to_be_swapped:
            return swap_element(name_elements, random_element_to_swap, random_element_to_be_swapped)
        else:
            index.pop(random_element_to_be_swapped)
            random_element_to_be_swapped = random.choice(index)
            return swap_element(name_elements, random_element_to_swap, random_element_to_be_swapped)
    
def random_name_transposition(name):
    '''Suffles name order at random'''
    name_elements_original = name.split(" ")
    name_elements_shuffle = name.split(" ")
    if len(name_elements_original)>=2:
        while name_elements_shuffle == name_elements_original:
            np.random.shuffle(name_elements_shuffle)
        return " ".join(name_elements_shuffle)
    else:
        return name

In [338]:
name = "Name1 Name2 Name3"
print("random_name_transposition: " + random_name_transposition(name))
print("random_name_transposition: " + transposition_name(name))

random_name_transposition: Name2 Name3 Name1
random_name_transposition: Name2 Name1 Name3



[Back to Variation Categories](#Variation-Categories)

[Back to index](#Index)

### Titles

* title_insertion: MENDEZ SALAZAR, John Jairo -> Mr. John Jairo Salazar Mendez 
* title_removal: MENDEZ SALAZAR, MR.John Jairo -> John Jairo Salazar Mendez 

In [20]:
titles = ["Mr. ","Mr ","Mrs. ","Mrs ","Mx. ", "Mx ", "Ms. ","Ms ","Miss ","Madam "]
def title_insertion(name):
    '''Adds title at the beginning of the name'''
    title = np.random.choice(titles)
    return title + name

def title_removal(name):
    '''Removes title if present'''
    if [x for x in titles if x in name]:
        for t in titles:
            name = name.replace(t,"")
    return name

In [21]:
name = "Javier Pérez González"
print("title_insertion: " +  title_insertion(name))
name = "Mr. Juan Pérez González"
print("title_insertion: " + title_removal(name))

title_insertion: Miss Javier Pérez González
title_insertion: Juan Pérez González



[Back to Variation Categories](#Variation-Categories)

### Missing Name Component

In [22]:
def name_component_deletion(name):
    '''Deletes one element at random if name has more than one element'''
    name_elements = name.split(" ")
    if len(name_elements)>1:
        element_to_delete = np.random.choice(name_elements)
        return " ".join([x for x in name.split(" ") if x != element_to_delete])
    else:
        return name

In [23]:
name = "Javier Pérez González"
name_component_deletion("Pepe")

'Pepe'


[Back to Variation Categories](#Variation-Categories)

### Also Known As

<font color="red"><b>Pending<br /></font> 
<font color="red"><b>TODO:</font> 
<li><font color="red"><b>Create a dictionary of AKAs. Use OFAC list for this.</font> </li>

In [341]:
data = pd.read_csv('https://www.treasury.gov/ofac/downloads/sdn.csv',
                   names=['ent_num', 'SDN_Name','SDN_Type','Program',
                          'Title','Call_Sign','Vess_type','Tonnage',
                          'GRT', 'Vess_flag', 'Vess_owner', 'Remarks'])
data.head()

Unnamed: 0,ent_num,SDN_Name,SDN_Type,Program,Title,Call_Sign,Vess_type,Tonnage,GRT,Vess_flag,Vess_owner,Remarks
0,36,AEROCARIBBEAN AIRLINES,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
1,173,"ANGLO-CARIBBEAN CO., LTD.",-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
2,306,BANCO NACIONAL DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,a.k.a. 'BNC'.
3,424,BOUTIQUE LA MAISON,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
4,475,CASA DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-


In [344]:
data['SDN_Type'].unique()

array(['-0- ', 'individual', 'vessel', 'aircraft', nan], dtype=object)

In [345]:
# Get only the individuals
ind_df = data.loc[data['SDN_Type'] == 'individual']

In [361]:
# Find the individuals that have alias names
aka_df = ind_df[ind_df['Remarks'].str.contains("a.k.a.")]

In [578]:
aka_df.head()

Unnamed: 0,ent_num,SDN_Name,SDN_Type,Program,Title,Call_Sign,Vess_type,Tonnage,GRT,Vess_flag,Vess_owner,Remarks
76,3754,"ABU MARZOOK, Mousa Mohammed",individual,SDGT,"Political Leader in Amman, Jordan and Damascus...",-0-,-0-,-0-,-0-,-0-,-0-,"DOB 09 Feb 1951; POB Gaza, Egypt; Passport 92/..."
77,4106,"HERRERA BUITRAGO, Helmer",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 24 Aug 1951; alt. DOB 05 Jul 1951; Cedula ...
78,4107,"RODRIGUEZ OREJUELA, Gilberto Jose",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 31 Jan 1939; Cedula No. 6068015 (Colombia)...
79,4108,"RODRIGUEZ OREJUELA, Miguel Angel",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 23 Nov 1943; alt. DOB 15 Jul 1943; Cedula ...
80,4109,"SANTACRUZ LONDONO, Jose",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 01 Oct 1943; Passport AB149814 (Colombia);...


In [568]:
def get_aliases(data):
    
    # Make a list of words
    words_list = data.split()
    
    # Get the index of the word "a.k.a."
    counter = []
    for i, j in enumerate(words_list):
        if j == 'a.k.a.':
            counter.append(i)
    
    counter.reverse()
    
    # Get the aliases
    aliases = []
    for i, j in enumerate(counter):
        if i == 0:
            aliases.append(words_list[j+1:])
        else:
            aliases.append(words_list[j+1:counter[i-1]])
    
    special_chars = ["'", '.', ',', ';', '"'] 

    for special_char in special_chars:
        for i, j in enumerate(aliases):
            for name in range(0, len(j)):
                aliases[i][name] = aliases[i][name].replace(special_char,'')

    aliases_clean = []
    for name in enumerate(aliases):
        aliases_clean.append(' '.join(name[1]))

    return aliases_clean

In [576]:
get_aliases(aka_df['Remarks'].iloc[29])

['FRENKI']

In [577]:
aka_df['Remarks'].iloc[29]

"DOB 01 Apr 1950; POB Belgrade, Serbia and Montenegro; ICTY indictee in Serb custody; a.k.a. 'FRENKI'."

In [None]:
# To do:
# Find a way to clean the aliases
# Apply the function to all the rows and create a new column with just the aliases
# Create a Dictionary

In [611]:
temp_df = aka_df.copy()
temp_df['Aliases'] = temp_df['Remarks'].apply(lambda x : get_aliases(x))
temp_df.head()

Unnamed: 0,ent_num,SDN_Name,SDN_Type,Program,Title,Call_Sign,Vess_type,Tonnage,GRT,Vess_flag,Vess_owner,Remarks,Aliases
76,3754,"ABU MARZOOK, Mousa Mohammed",individual,SDGT,"Political Leader in Amman, Jordan and Damascus...",-0-,-0-,-0-,-0-,-0-,-0-,"DOB 09 Feb 1951; POB Gaza, Egypt; Passport 92/...",[ABU-UMAR]
77,4106,"HERRERA BUITRAGO, Helmer",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 24 Aug 1951; alt. DOB 05 Jul 1951; Cedula ...,"[H7, PACHO]"
78,4107,"RODRIGUEZ OREJUELA, Gilberto Jose",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 31 Jan 1939; Cedula No. 6068015 (Colombia)...,"[LUCAS, THE CHESS PLAYER]"
79,4108,"RODRIGUEZ OREJUELA, Miguel Angel",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 23 Nov 1943; alt. DOB 15 Jul 1943; Cedula ...,"[DOCTOR MRO, MAURO, MIKE, MANOLO, MANUEL, PAT,..."
80,4109,"SANTACRUZ LONDONO, Jose",individual,SDNT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 01 Oct 1943; Passport AB149814 (Colombia);...,"[EL GORDO CHEPE, DON CHEPE, CHEPE]"


In [612]:
temp_df = temp_df[['SDN_Name', 'Aliases']]
temp_df.head()

Unnamed: 0,SDN_Name,Aliases
76,"ABU MARZOOK, Mousa Mohammed",[ABU-UMAR]
77,"HERRERA BUITRAGO, Helmer","[H7, PACHO]"
78,"RODRIGUEZ OREJUELA, Gilberto Jose","[LUCAS, THE CHESS PLAYER]"
79,"RODRIGUEZ OREJUELA, Miguel Angel","[DOCTOR MRO, MAURO, MIKE, MANOLO, MANUEL, PAT,..."
80,"SANTACRUZ LONDONO, Jose","[EL GORDO CHEPE, DON CHEPE, CHEPE]"


In [613]:
keys = temp_df['SDN_Name'].to_list()
values = temp_df['Aliases'].to_list()

In [614]:
dictionary = dict(zip(keys, values))

In [615]:
dictionary

{'ABU MARZOOK, Mousa Mohammed': ['ABU-UMAR'],
 'HERRERA BUITRAGO, Helmer': ['H7', 'PACHO'],
 'RODRIGUEZ OREJUELA, Gilberto Jose': ['LUCAS', 'THE CHESS PLAYER'],
 'RODRIGUEZ OREJUELA, Miguel Angel': ['DOCTOR MRO',
  'MAURO',
  'MIKE',
  'MANOLO',
  'MANUEL',
  'PAT',
  'PATTY',
  'PATRICIO',
  'PATRICIA',
  'EL SENOR'],
 'SANTACRUZ LONDONO, Jose': ['EL GORDO CHEPE', 'DON CHEPE', 'CHEPE'],
 'AL-MASRI, Abu Hafs': ['ABU HAFS', 'TAYSIR'],
 "MUSA, Rifa'i Ahmad Taha": ['ABU YASIR', 'ABD-AL-IZ'],
 'ABU ZUBAYDAH': ['TARIQ'],
 'ABDULLAH, Abdullah Ahmed': ['SALEH', 'ABU MARIAM'],
 'ALI, Ahmed Mohammed Hamed': ['SHUAIB',
  'AHMED THE EGYPTIAN',
  'AHMED HAMED',
  'ABU KHADIIJAH',
  'ABU ISLAM',
  'ABU FATIMA'],
 'AL-MUGHASSIL, Ahmad Ibrahim': ['ABU OMRAN'],
 'FADHIL, Mustafa Mohamed': ['KHALID', 'YUSSRR Abu', 'ANIS Abu', 'HUSSEIN'],
 'GHAILANI, Ahmed Khalfan': ['AHMED THE TANZANIAN',
  'FUPI',
  'FOOPIE',
  'BAKR Abu',
  'KHABAR Abu',
  'AHMED A'],
 'IZZ-AL-DIN, Hasan': ['SA-ID', 'GARBAYA AHMED'],


[Back to Variation Categories](#Variation-Categories)

[Back to index](#Index)