**This is the simplified Opensantions data in csv format**

In [1]:
#import packages
import requests
import pandas as pd
import numpy as np
import datetime
import os
import re

# Retrieval of the data

In [2]:
#test
url = "https://data.opensanctions.org/datasets/20231224/peps/targets.simple.csv"
response = requests.get(url)
print(response.status_code)

200


In [3]:
#create urls for all relevant dates
date_list = pd.date_range(start='20210801',end='20231231',freq='D').strftime('%Y%m%d')
date_list

Index(['20210801', '20210802', '20210803', '20210804', '20210805', '20210806',
       '20210807', '20210808', '20210809', '20210810',
       ...
       '20231222', '20231223', '20231224', '20231225', '20231226', '20231227',
       '20231228', '20231229', '20231230', '20231231'],
      dtype='object', length=883)

In [4]:
#get all the urls for all the dates we need the data for
#make each day an individual list so we can compare and match them
websites = []

for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/peps/targets.simple.csv'
    websites.append(test)
print(websites)

['https://data.opensanctions.org/datasets/20210801/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210802/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210803/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210804/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210805/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210806/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210807/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210808/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210809/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210810/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210811/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210812/peps/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210813/peps/targets.simple.csv', 'https://da

#first check for missing days
for site in websites:
    try:
        response = requests.get(site)
        response.raise_for_status()  # Raises an HTTPError for bad response
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {site}: {e}")
        continue

In [5]:
#now loop over the batches to get all the data and append the dataframes to one another
#this gives us a list of dataframes
#the loop must contain a date identifyer as a new column so each dataframe is marked with the retrieval date that is then needed to match the different days against each other
peps_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = pd.read_csv(site, low_memory=False)
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.group(1)
        data['date_stamp'] = date #set an individual date stamp for each dataframe matching the url date
        #ata =  data.set_index('date_stamp') #set as an index
    peps_list.append(data)
res = pd.concat(peps_list)  # concatenate list of dataframes

In [6]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,last_change
0,coe-1-aiken,Person,Frank AIKEN,,,ie,,,,,,Politically Exposed Persons (PEPs),2021-07-27 13:29:13,2021-08-20 20:39:54,20210820,
1,coe-100-kristensen,Person,Thorkil KRISTENSEN,,,dk,,,,,,Politically Exposed Persons (PEPs),2021-07-27 13:29:13,2021-08-20 20:39:54,20210820,
2,coe-1001-helgadottir,Person,Ragnhildur V. HELGADOTTIR,,,is,,,,,,Politically Exposed Persons (PEPs),2021-07-27 13:29:13,2021-08-20 20:39:54,20210820,
3,coe-1002-hellige,Person,Walther HELLIGE,,,de,,,,,,Politically Exposed Persons (PEPs),2021-07-27 13:29:13,2021-08-20 20:39:54,20210820,
4,coe-1003-irving,Person,Sydney IRVING,,,gb,,,,,,Politically Exposed Persons (PEPs),2021-07-27 13:29:13,2021-08-20 20:39:54,20210820,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358491,us-dos-william-h-moser,Person,William H. Moser,,,us,,,,,,US State Department Senior Officials,2023-12-31T04:20:09,2023-10-18T10:49:15,20231231,2023-10-18T10:49:15
358492,us-dos-william-hybl,Person,William Hybl,,,us,,,,,,US State Department Senior Officials,2023-12-31T04:20:09,2023-10-18T09:19:21,20231231,2023-10-18T09:19:21
358493,us-dos-yolanda-robinson,Person,Yolanda Robinson,,,us,,,,,,US State Department Senior Officials,2023-12-31T04:20:09,2023-11-30T04:16:01,20231231,2023-11-30T04:16:01
358494,us-dos-yuen-hao-huang,Person,Yuen-Hao Huang,,,us,,,,,,US State Department Senior Officials,2023-12-31T04:20:09,2023-10-18T09:19:21,20231231,2023-10-18T09:19:21


In [None]:
import zipfile

def compress_csv(input_csv, output_zip):
    with zipfile.ZipFile(output_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(input_csv)
input_csv = 'res.csv'
output_zip = 'peps_csv.zip'
compress_csv(input_csv, output_zip)

# alternatively, get the names as txt file

In [17]:
import requests
import re


websites = []

# Construct list of URLs based on date_list
for date in date_list:
    test = f'https://data.opensanctions.org/datasets/{date}/peps/names.txt'
    websites.append(test)

peps_data = []  # List to store the text data and date identifier

# Regex pattern to extract date from the URL
date_pattern = r'/datasets/(\d{8})/'

# Iterate over each website URL
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    
    data = response.text
    
    # Extract date from the URL
    match = re.search(date_pattern, site)
    if match:
        date_identifier = match.group(1)  # Extract the date from the URL
        peps_data.append({'date': date_identifier, 'text': data})  # Append data along with the date identifier

print(peps_data)  # Output the list of text data along with the corresponding date identifier


MemoryError: 

In [15]:
#url_txt = "https://data.opensanctions.org/datasets/20240411/peps/names.txt"
websites = []
data = []
date = []
for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/peps/names.txt'
    websites.append(test)
data = []
peps_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = response.text
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.Group(1)
        date.append(data)
        #data_full = dict(zip(date,data)) #set an individual date stamp for each dataframe matching the url date
        #data =  data.set_index('date_stamp') #set as an index
    peps_list.append(data)
peps_list

AttributeError: 're.Match' object has no attribute 'Group'

In [None]:
def compress_file(input_file, output_zip):
    with zipfile.ZipFile(output_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(input_file)

# Example usage:
input_file = 'peps_list.txt'
output_zip = 'peps_text.zip'
compress_file(input_file, output_zip)


# Match the dates against each other

In [6]:
#now match the different dates against each other and return an indicator per row for a new listing or delisting
# Sort the DataFrame based on date_stamp and id
res.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res.duplicated(subset=['id'], keep='first')
deletions = ~res.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Set the values of new_entry, deletion, and unchanged columns
res['new_entry'] = new_entries
res['deletion'] = deletions
res['unchanged'] = unchanged_rows

# Reset index for the final result
res.reset_index(drop=True, inplace=True)

In [7]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,last_change,new_entry,deletion,unchanged
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,,True,False,False
1,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-01 03:02:43,20211001,,False,False,True
2,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-02 03:03:02,20211002,,False,False,True
3,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - SDN List - Program - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-03 08:46:36,20211003,,False,False,True
4,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - Block - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-04 03:02:58,20211004,,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9487479,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle, Colombia...",16202349,SDN List - Block - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-09-24 03:02:55,20210925,,False,False,True
9487480,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Program - SDN List - Block - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-26 14:52:11,20210926,,False,False,True
9487481,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Block - SDN List - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-27 09:09:30,20210927,,False,False,True
9487482,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Program - Block - SDN List - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-28 03:03:19,20210928,,False,False,True


In [8]:
#test thematching function
test = res.query('new_entry == True')
test

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,last_change,new_entry,deletion,unchanged
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,,True,False,False
5,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,Michael Kuajian;Michael Kuajien Duer Mayok,1979-01-01,ke;ss,Nairobi;South Sudan,,Program - Block - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-28 03:03:19,20210928,,True,False,False
827,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09-24,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-09-18 12:23:43,20210918,,True,False,False
844,NK-22jYwFUGYJxd6bKdMnxb22,Organization,NEGIN PARTO KHAVAR,ERTEBATE EGHTESSADE MONIR;NEGIN PARTO;NEGIN PA...,,ir,"Tehran, Fatmi Gharabi Street, between Sindokht...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-09-18 12:23:43,20210918,,True,False,False
856,NK-22oMG6jqPQknWaMjzTn4hK,Company,Limited Liability Company Garantiya,Garantiya OOO,,ru,"bulvar Tverskoi, d. 15 str. 2, Moscow",5067746901426;7703610362,Block - SDN List - Program - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2023-02-26 18:35:08,2023-02-24 18:37:06,20230226,,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9487281,ofac-sdn-9993,Person,Yolanda Sofia CANO ALZATE,,1957-04-25,co,"c/o GAVIOTAS LTDA., Colombia",31399608,SDN List - Block - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-08-20 20:39:54,20210820,,True,False,False
9487320,ofac-sdn-9994,Person,Davinson GOMEZ OCAMPO,GORDO,1960-07-10,co,"Calle 16 No. 1-58, Cartago, Valle, Colombia;c/...",2470433,SDN List - Block - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-08-20 20:39:54,20210820,,True,False,False
9487361,ofac-sdn-9995,Person,Carlos Arturo PATINO RESTREPO,PATE MURO;PATEMURO,1964-04-27,co,"Calle 20 No. 6-30, Ofc. 1304, Pereira, Risaral...",9991679,SDN List - Block - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-08-20 20:39:54,20210820,,True,False,False
9487402,ofac-sdn-9996,Person,Luis Alfonso GOMEZ BUSTAMANTE,,1953-11-01,co,"Calle 1B No. 1-26, Barrio El Prado, Cartago, V...",4451571,SDN List - Block - Program - Unknown - 2006-10-25,,,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-08-20 20:39:54,20210820,,True,False,False


In [11]:
# Add a 'month' column
res['month'] = pd.to_datetime(res['date_stamp']).dt.to_period('M')
#get a count of all listings per month
res['listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res['listing_count'] = res.groupby('month')['id'].transform('nunique')
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,dataset,last_seen,first_seen,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,,True,False,False,2021-09,18450
1,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-01 03:02:43,20211001,,False,False,True,2021-10,9410
2,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-02 03:03:02,20211002,,False,False,True,2021-10,9410
3,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - SDN List - Program - Executive Order 1...,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-03 08:46:36,20211003,,False,False,True,2021-10,9410
4,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - Block - SDN List - Executive Order 1...,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-04 03:02:58,20211004,,False,True,False,2021-10,9410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9487479,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle, Colombia...",16202349,SDN List - Block - Program - Unknown - 2006-10-25,,...,US OFAC Specially Designated Nationals (SDN) List,2021-07-26 11:55:45,2021-09-24 03:02:55,20210925,,False,False,True,2021-09,18450
9487480,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Program - SDN List - Block - Unknown - 2006-10-25,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-26 14:52:11,20210926,,False,False,True,2021-09,18450
9487481,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Block - SDN List - Program - Unknown - 2006-10-25,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-27 09:09:30,20210927,,False,False,True,2021-09,18450
9487482,ofac-sdn-9999,Person,Jose Aldemar RENDON RAMIREZ,MECHAS,1950-07-24,co,"Carrera 13 No. 18-50, Cartago, Valle;Carrera 2...",16202349,Program - Block - SDN List - Unknown - 2006-10-25,,...,US OFAC Specially Designated Nationals (SDN) List,2021-09-26 14:52:11,2021-09-28 03:03:19,20210928,,False,False,True,2021-09,18450


# Deduplicate

In [12]:
res_deduplicated = res.copy()
res_deduplicated.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res_deduplicated.duplicated(subset=['id'], keep='first')
deletions = ~res_deduplicated.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Create new columns for first seen and last seen dates
res_deduplicated['listing_date'] = res_deduplicated['date_stamp'].where(new_entries)
res_deduplicated['delisting_date'] = res_deduplicated['date_stamp'].where(deletions)

# Deduplicate the entries (keep the first occurrence for each entity)
res_deduplicated = res_deduplicated.drop_duplicates(subset=['id'], keep='first')

# Reset index for the final result
res_deduplicated.reset_index(drop=True, inplace=True)

# Print or further analyze the deduplicated DataFrame 'res_deduplicated'
print(res_deduplicated)

                              id        schema  \
0      NK-226JTDehVMSXex35EbKPmi  Organization   
1      NK-22HtK7WrxZ2sU3rmhz6PuZ        Person   
2      NK-22iRdVwQHXiYMCAASM93qH  Organization   
3      NK-22jYwFUGYJxd6bKdMnxb22  Organization   
4      NK-22oMG6jqPQknWaMjzTn4hK       Company   
...                          ...           ...   
29648              ofac-sdn-9993        Person   
29649              ofac-sdn-9994        Person   
29650              ofac-sdn-9995        Person   
29651              ofac-sdn-9996        Person   
29652              ofac-sdn-9999        Person   

                                      name  \
0                          BENA PROPERTIES   
1                          Michael Kuajien   
2                          AL RASHID TRUST   
3                       NEGIN PARTO KHAVAR   
4      Limited Liability Company Garantiya   
...                                    ...   
29648            Yolanda Sofia CANO ALZATE   
29649                Davinson G

In [13]:
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,first_seen,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,...,2021-09-30 11:39:21,20210930,,True,False,False,2021-09,18450,20210930,
1,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,Michael Kuajian;Michael Kuajien Duer Mayok,1979-01-01,ke;ss,Nairobi;South Sudan,,Program - Block - SDN List - Executive Order 1...,,...,2021-09-28 03:03:19,20210928,,True,False,False,2021-09,18450,20210928,
2,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09-24,,...,2021-09-18 12:23:43,20210918,,True,False,False,2021-09,18450,20210918,
3,NK-22jYwFUGYJxd6bKdMnxb22,Organization,NEGIN PARTO KHAVAR,ERTEBATE EGHTESSADE MONIR;NEGIN PARTO;NEGIN PA...,,ir,"Tehran, Fatmi Gharabi Street, between Sindokht...",,Block - Program - SDN List - Executive Order 1...,,...,2021-09-18 12:23:43,20210918,,True,False,False,2021-09,18450,20210918,
4,NK-22oMG6jqPQknWaMjzTn4hK,Company,Limited Liability Company Garantiya,Garantiya OOO,,ru,"bulvar Tverskoi, d. 15 str. 2, Moscow",5067746901426;7703610362,Block - SDN List - Program - Executive Order 1...,,...,2023-02-24 18:37:06,20230226,,True,False,False,2023-02,12137,20230226,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29648,ofac-sdn-9993,Person,Yolanda Sofia CANO ALZATE,,1957-04-25,co,"c/o GAVIOTAS LTDA., Colombia",31399608,SDN List - Block - Program - Unknown - 2006-10-25,,...,2021-08-20 20:39:54,20210820,,True,False,False,2021-08,9124,20210820,
29649,ofac-sdn-9994,Person,Davinson GOMEZ OCAMPO,GORDO,1960-07-10,co,"Calle 16 No. 1-58, Cartago, Valle, Colombia;c/...",2470433,SDN List - Block - Program - Unknown - 2006-10-25,,...,2021-08-20 20:39:54,20210820,,True,False,False,2021-08,9124,20210820,
29650,ofac-sdn-9995,Person,Carlos Arturo PATINO RESTREPO,PATE MURO;PATEMURO,1964-04-27,co,"Calle 20 No. 6-30, Ofc. 1304, Pereira, Risaral...",9991679,SDN List - Block - Program - Unknown - 2006-10-25,,...,2021-08-20 20:39:54,20210820,,True,False,False,2021-08,9124,20210820,
29651,ofac-sdn-9996,Person,Luis Alfonso GOMEZ BUSTAMANTE,,1953-11-01,co,"Calle 1B No. 1-26, Barrio El Prado, Cartago, V...",4451571,SDN List - Block - Program - Unknown - 2006-10-25,,...,2021-08-20 20:39:54,20210820,,True,False,False,2021-08,9124,20210820,


# Descriptives and clean up

In [14]:
#the countries are coded as iso2 codes -> transform to iso 3
#keep both countries? could search for "ru" over regex functions
print(res_deduplicated.countries.unique())
print(res_deduplicated.schema.unique())
print(res_deduplicated.listing_date.unique())

['sy' 'ke;ss' 'pk' 'ir' 'ru' 'cd;rw' 'de' 'by' 'kp' 'co' 'ae' 'mx' 'be'
 'il;md;ru' 'iq' 'gr' 'gb;ir' nan 'zw' 'ch;fr' 'cn' 've' 'tw' 'li'
 'co;es;pa' 'cf' 'ru;ua' 'cy' 'ao' 'gb' 'mt' 'lb;ps' 'vc' 'gm' 'ni'
 'mx;us' 'ch;co' 'ir;sy' 'lb' 'cz;ru' 'ch' 'my' 'jp' 'ye' 'tn' 'us'
 'cy;gb;gh;gr;sy' 'sa' 'km' 'ae;ru;uz' 'eg' 'pk;sa' 'sk;ve' 'au' 'iq;sy'
 'tr' 'fr;ml' 'af;ir;pk' 'mh' 'mm' 'lb;ng;sl' 'cn;hk' 'sg' 'cd;kp' 'cr'
 'jo' 'ar' 'fr' 'lv;tr' 'ca' 'rs' 'nl' 'cn;kp' 'it;tn' 'bg' 'it' 'sv' 'at'
 'pa' 'lv;ru' 'uz' 'ae;in' 'sd' 'cy;it;mc' 'aw;co;ve'
 'ar;bg;ca;de;es;it;kp;mx;nl;pl;ru' 'gb;sd' 'iq;tr' 'id' 'er;et' 'bi'
 'co;ve' 'ae;sy' 'ht' 'af;pk' 'fi;ru' 'ec' 'ly' 'ph' 'pa;sl;tr' 'ae;lr'
 'za' 'ng' 'de;jo' 'af' 'es' 'de;iq' 'cd' 'bh' 'iq;ir' 'er;ke' 'tj' 'ke'
 'gb;ru' 'il;it;ru' 'gb;sy' 'gt;pa;sv' 'co;lb' 'ch;iq' 'bj;cf' 'ps'
 'ae;kg' 'vn' 'ch;it' 'ir;ve' 'cn;vg' 'do' 'gt' 'ae;cn;hk' 'ch;it;mt' 'je'
 'vg' 'il' 'cn;gb' 'cz;sc' 'kh' 'th' 'si' 'kg' 'in' 'jo;qa' 'gb;jo'
 'ao;ar;bb;ca;cz;de;es;fr

In [15]:
#get missingness
is_null = res_deduplicated.isnull().sum()
display(is_null)

id                    0
schema                0
name                  0
aliases           11639
birth_date        14956
countries          4581
addresses          6185
identifiers       14588
sanctions            13
phones            29593
emails            29287
dataset               0
last_seen             0
first_seen            0
date_stamp            0
last_change       26834
new_entry             0
deletion              0
unchanged             0
month                 0
listing_count         0
listing_date          0
delisting_date    28795
dtype: int64

In [16]:
res_deduplicated.columns

Index(['id', 'schema', 'name', 'aliases', 'birth_date', 'countries',
       'addresses', 'identifiers', 'sanctions', 'phones', 'emails', 'dataset',
       'last_seen', 'first_seen', 'date_stamp', 'last_change', 'new_entry',
       'deletion', 'unchanged', 'month', 'listing_count', 'listing_date',
       'delisting_date'],
      dtype='object')

In [17]:
res_deduplicated['new_listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res_deduplicated['new_listing_count'] = res_deduplicated.groupby('month')['new_entry'].transform('nunique')
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date,new_listing_count
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,...,20210930,,True,False,False,2021-09,18450,20210930,,1
1,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,Michael Kuajian;Michael Kuajien Duer Mayok,1979-01-01,ke;ss,Nairobi;South Sudan,,Program - Block - SDN List - Executive Order 1...,,...,20210928,,True,False,False,2021-09,18450,20210928,,1
2,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09-24,,...,20210918,,True,False,False,2021-09,18450,20210918,,1
3,NK-22jYwFUGYJxd6bKdMnxb22,Organization,NEGIN PARTO KHAVAR,ERTEBATE EGHTESSADE MONIR;NEGIN PARTO;NEGIN PA...,,ir,"Tehran, Fatmi Gharabi Street, between Sindokht...",,Block - Program - SDN List - Executive Order 1...,,...,20210918,,True,False,False,2021-09,18450,20210918,,1
4,NK-22oMG6jqPQknWaMjzTn4hK,Company,Limited Liability Company Garantiya,Garantiya OOO,,ru,"bulvar Tverskoi, d. 15 str. 2, Moscow",5067746901426;7703610362,Block - SDN List - Program - Executive Order 1...,,...,20230226,,True,False,False,2023-02,12137,20230226,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29648,ofac-sdn-9993,Person,Yolanda Sofia CANO ALZATE,,1957-04-25,co,"c/o GAVIOTAS LTDA., Colombia",31399608,SDN List - Block - Program - Unknown - 2006-10-25,,...,20210820,,True,False,False,2021-08,9124,20210820,,1
29649,ofac-sdn-9994,Person,Davinson GOMEZ OCAMPO,GORDO,1960-07-10,co,"Calle 16 No. 1-58, Cartago, Valle, Colombia;c/...",2470433,SDN List - Block - Program - Unknown - 2006-10-25,,...,20210820,,True,False,False,2021-08,9124,20210820,,1
29650,ofac-sdn-9995,Person,Carlos Arturo PATINO RESTREPO,PATE MURO;PATEMURO,1964-04-27,co,"Calle 20 No. 6-30, Ofc. 1304, Pereira, Risaral...",9991679,SDN List - Block - Program - Unknown - 2006-10-25,,...,20210820,,True,False,False,2021-08,9124,20210820,,1
29651,ofac-sdn-9996,Person,Luis Alfonso GOMEZ BUSTAMANTE,,1953-11-01,co,"Calle 1B No. 1-26, Barrio El Prado, Cartago, V...",4451571,SDN List - Block - Program - Unknown - 2006-10-25,,...,20210820,,True,False,False,2021-08,9124,20210820,,1


In [18]:
#res_deduplicated = res_deduplicated.drop(['last_seen', 'first_seen','new_entry', 'deletion','unchanged'],axis = 1)

In [19]:
res_deduplicated.to_csv("us_ofac_sdn.csv")