**This is the simplified Opensantions data in csv format**

In [1]:
#import packages
import requests
import pandas as pd
import numpy as np
import datetime
import os
import re

# Retrieval of the data

In [2]:
#I can access the data via pandas very easily, output already in tabular form
#data = pd.read_csv('https://data.opensanctions.org/datasets/20240121/us_ofac_sdn/targets.simple.csv')
#data

In [3]:
#create urls for all relevant dates
date_list = pd.date_range(start='20210801',end='20211231',freq='D').strftime('%Y%m%d')
date_list

Index(['20210801', '20210802', '20210803', '20210804', '20210805', '20210806',
       '20210807', '20210808', '20210809', '20210810',
       ...
       '20211222', '20211223', '20211224', '20211225', '20211226', '20211227',
       '20211228', '20211229', '20211230', '20211231'],
      dtype='object', length=153)

In [4]:
#get all the urls for all the dates we need the data for
#make each day an individual list so we can compare and match them
websites = []

for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/default/targets.simple.csv'
    websites.append(test)
print(websites)

['https://data.opensanctions.org/datasets/20210801/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210802/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210803/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210804/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210805/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210806/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210807/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210808/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210809/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210810/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210811/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210812/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210813/d

#first check for missing days
for site in websites:
    try:
        response = requests.get(site)
        response.raise_for_status()  # Raises an HTTPError for bad response
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {site}: {e}")
        continue

In [5]:
#now loop over the batches to get all the data and append the dataframes to one another
#this gives us a list of dataframes
#the loop must contain a date identifyer as a new column so each dataframe is marked with the retrieval date that is then needed to match the different days against each other
entities_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = pd.read_csv(site, low_memory=False)
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.group(1)
        data['date_stamp'] = date #set an individual date stamp for each dataframe matching the url date
        #ata =  data.set_index('date_stamp') #set as an index
    entities_list.append(data)
res = pd.concat(entities_list)  # concatenate list of dataframes

In [6]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp
0,au-dfat-100,Person,Nazir Mohammad Abdul Basir,Nazar Mohammad;نظر محمد,,af,,,1988 (Taliban),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823
1,au-dfat-1001,LegalEntity,STATE TRADE ORGANIZATION FOR CAPITAL GOODS,,,,,,1518 (Iraq),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823
2,au-dfat-1002,LegalEntity,STATE TRADE ORGANIZATION FOR CONSUMER GOODS,,,,,,1518 (Iraq),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823
3,au-dfat-1006,LegalEntity,STATE TRADING ENTERPRISE FOR PRECISION INSTRUM...,STATE ESTABLISHMENT FOR PRECISION INSTRUMENTS,,,,,1518 (Iraq),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823
4,au-dfat-101,Person,Mohammad Eshaq Akhunzada,Mohammad Ishaq Akhund;محمد اسحاق آخوند زاده,,af,,,1988 (Taliban),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117917,wbdeb-631417,LegalEntity,INGENIEROS CONSULTORES Y ASESORES TÉCNICOS SOC...,,,hn,"COLONIA HATO DE ENMEDIO, SECTOR 8, BLOQUE 108,...",,Cross Debarment: IDB - 2021-12-23 - 2033-11-21,,,WorldBank Debarred Providers,2021-12-26 03:03:57,2021-12-31 03:03:35,20211231
117918,wbdeb-631419,LegalEntity,WALTERIO ROMERO VALLADARES,,,hn,"COLONIA HATO DE ENMEDIO, SECTOR 8,BLOQUE 108, ...",,Cross Debarment: IDB - 2021-12-23 - 2033-12-21,,,WorldBank Debarred Providers,2021-12-26 03:03:57,2021-12-31 03:03:35,20211231
117919,wbdeb-631420,LegalEntity,TRACTEBEL ENGINEERING S.A.,,,be,"BOULEVARD SIMON BOLIVAR, 34-36, 1000 BRUSSELS",,Cross Debarment: IDB - 2021-12-23 - 2025-09-28,,,WorldBank Debarred Providers,2021-12-26 03:03:57,2021-12-31 03:03:35,20211231
117920,wbdeb-68368,LegalEntity,RIBALCO INTERNATIONAL,,,gb,BERKSHIRE,,Procurement Guidelines 1.15(a)(i) & (ii) - 200...,,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-12-31 03:03:35,20211231


# Match the dates against each other

In [7]:
#now match the different dates against each other and return an indicator per row for a new listing or delisting
# Sort the DataFrame based on date_stamp and id
res.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res.duplicated(subset=['id'], keep='first')
deletions = ~res.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Set the values of new_entry, deletion, and unchanged columns
res['new_entry'] = new_entries
res['deletion'] = deletions
res['unchanged'] = unchanged_rows

# Reset index for the final result
res.reset_index(drop=True, inplace=True)

In [8]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,True,False,False
1,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-01 03:02:43,20211001,False,False,True
2,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-02 03:03:02,20211002,False,False,True
3,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - SDN List - Program - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) L...,2021-09-30 11:39:21,2021-10-03 08:46:36,20211003,False,False,True
4,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - Block - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) L...,2021-09-30 11:39:21,2021-10-04 03:02:58,20211004,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16176849,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,,,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-04 03:03:12,20211104,False,False,True
16176850,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,,,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-05 03:02:53,20211105,False,False,True
16176851,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,"Procurement Guidelines, 1.15(a)(ii) - 2011-11-...",,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-06 11:02:58,20211106,False,False,True
16176852,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,"Procurement Guidelines, 1.15(a)(ii) - 2011-11-...",,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-07 03:02:51,20211107,False,False,True


In [9]:
#test thematching function
test = res.query('new_entry == True')
test

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,True,False,False
5,NK-22FBSypYXKBCkX2JoWwgrk,Company,"GRUPO MECÁNICA DEL VUELO SISTEMAS, S.A.U.","Grupo Mecánica del Vuelo Sistemas, S.A.U.;Grup...",,es;vn,"ISAAC NEWTON 11, PARQUE TECNOLÓGICO DE MADRID,...",,CROSS-DEBARMENT: WBG - 2021-03-02 - 2024-09-01...,,,African Development Bank Debarred Entities;Int...,2021-07-26 11:55:45,2021-11-09 03:03:03,20211109,True,False,False
58,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,"KUAJIAN, Michael;KUAJIEN DUER MAYOK, Michael;K...",1979-01-01,ke;ss,Nairobi;South Sudan,,GLOMAG;Program - Block - SDN List - Executive ...,,,US OFAC Specially Designated Nationals (SDN) L...,2021-07-26 11:55:45,2021-09-28 03:03:19,20210928,True,False,False
153,NK-22baaNEkbgBeZokNLmpaKd,Person,DZHAMBULAT GALIMOV,Джанбулат Магомедтагирович Галимов,1989-01-05,ru,,,террорист - Письмо Федеральной службы по финан...,,,INTERPOL Red Notices;Kyrgyz National List,2021-07-26 11:55:45,2021-09-30 11:39:21,20210930,True,False,False
246,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09...,,,Due Diligence List,2021-07-26 11:55:45,2021-09-18 12:23:43,20210918,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16176412,wbdeb-83317,LegalEntity,SEYDOU IDANI,,,bf,Burkina Faso,,Consultant Guidelines 1.25(a)(i),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False
16176542,wbdeb-85420,LegalEntity,TIS CONSULTANTS COMPANY LIMITED,,,th,"151 TEAM BUILDING, MOO 12, NUANCHAN ROAD, KLON...",,Consultant Guidelines 1.25(a)(i),,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False
16176620,wbdeb-90379,LegalEntity,EASTERN BUILDERS AND ENGINEERS LIMITED,,,ug,"P.O. BOX 9062, KAMPALA, Uganda",,"""Procurement Guidelines, 1.14(a)(i); 1.14(a)(ii)""",,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False
16176698,wbdeb-96552,LegalEntity,KWAPLAH INTERNATIONAL TRADING CO. INC.,,,us,"425 S.W., MADISON AVENUE, SUITE U, CORVALLIS, ...",,"Procurement Guidelines 1.15(a)(i) & (ii), 1.14...",,,Due Diligence List,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False


In [10]:
# Add a 'month' column
res['month'] = pd.to_datetime(res['date_stamp']).dt.to_period('M')
#get a count of all listings per month
res['listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res['listing_count'] = res.groupby('month')['id'].transform('nunique')
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged,month,listing_count
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,True,False,False,2021-09,147560
1,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-01 03:02:43,20211001,False,False,True,2021-10,130093
2,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - SDN List - Block - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) List,2021-09-30 11:39:21,2021-10-02 03:03:02,20211002,False,False,True,2021-10,130093
3,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - SDN List - Program - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) L...,2021-09-30 11:39:21,2021-10-03 08:46:36,20211003,False,False,True,2021-10,130093
4,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Program - Block - SDN List - Executive Order 1...,,,US OFAC Specially Designated Nationals (SDN) L...,2021-09-30 11:39:21,2021-10-04 03:02:58,20211004,False,True,False,2021-10,130093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16176849,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,,,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-04 03:03:12,20211104,False,False,True,2021-11,134236
16176850,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,,,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-05 03:02:53,20211105,False,False,True,2021-11,134236
16176851,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,"Procurement Guidelines, 1.15(a)(ii) - 2011-11-...",,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-06 11:02:58,20211106,False,False,True,2021-11,134236
16176852,wbdeb-99814,LegalEntity,"CLOSED JOINT STOCK CO ""SHARORA""",,,tj,"KARIM DEVONA STREET 59, HISSAR RAYON, SHARORA ...",,"Procurement Guidelines, 1.15(a)(ii) - 2011-11-...",,,WorldBank Debarred Providers,2021-07-26 11:55:45,2021-11-07 03:02:51,20211107,False,False,True,2021-11,134236


# Deduplicate

In [11]:
res_deduplicated = res.copy()
res_deduplicated.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res_deduplicated.duplicated(subset=['id'], keep='first')
deletions = ~res_deduplicated.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Create new columns for first seen and last seen dates
res_deduplicated['listing_date'] = res_deduplicated['date_stamp'].where(new_entries)
res_deduplicated['delisting_date'] = res_deduplicated['date_stamp'].where(deletions)

# Deduplicate the entries (keep the first occurrence for each entity)
res_deduplicated = res_deduplicated.drop_duplicates(subset=['id'], keep='first')

# Reset index for the final result
res_deduplicated.reset_index(drop=True, inplace=True)

# Print or further analyze the deduplicated DataFrame 'res_deduplicated'
print(res_deduplicated)

                               id        schema  \
0       NK-226JTDehVMSXex35EbKPmi  Organization   
1       NK-22FBSypYXKBCkX2JoWwgrk       Company   
2       NK-22HtK7WrxZ2sU3rmhz6PuZ        Person   
3       NK-22baaNEkbgBeZokNLmpaKd        Person   
4       NK-22iRdVwQHXiYMCAASM93qH  Organization   
...                           ...           ...   
220885                wbdeb-83317   LegalEntity   
220886                wbdeb-85420   LegalEntity   
220887                wbdeb-90379   LegalEntity   
220888                wbdeb-96552   LegalEntity   
220889                wbdeb-99814   LegalEntity   

                                             name  \
0                                 BENA PROPERTIES   
1       GRUPO MECÁNICA DEL VUELO SISTEMAS, S.A.U.   
2                                 Michael Kuajien   
3                              DZHAMBULAT GALIMOV   
4                                 AL RASHID TRUST   
...                                           ...   
220885          

In [12]:
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,last_seen,first_seen,date_stamp,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,...,2021-09-30 11:39:21,2021-09-30 11:39:21,20210930,True,False,False,2021-09,147560,20210930,
1,NK-22FBSypYXKBCkX2JoWwgrk,Company,"GRUPO MECÁNICA DEL VUELO SISTEMAS, S.A.U.","Grupo Mecánica del Vuelo Sistemas, S.A.U.;Grup...",,es;vn,"ISAAC NEWTON 11, PARQUE TECNOLÓGICO DE MADRID,...",,CROSS-DEBARMENT: WBG - 2021-03-02 - 2024-09-01...,,...,2021-07-26 11:55:45,2021-11-09 03:03:03,20211109,True,False,False,2021-11,134236,20211109,
2,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,"KUAJIAN, Michael;KUAJIEN DUER MAYOK, Michael;K...",1979-01-01,ke;ss,Nairobi;South Sudan,,GLOMAG;Program - Block - SDN List - Executive ...,,...,2021-07-26 11:55:45,2021-09-28 03:03:19,20210928,True,False,False,2021-09,147560,20210928,
3,NK-22baaNEkbgBeZokNLmpaKd,Person,DZHAMBULAT GALIMOV,Джанбулат Магомедтагирович Галимов,1989-01-05,ru,,,террорист - Письмо Федеральной службы по финан...,,...,2021-07-26 11:55:45,2021-09-30 11:39:21,20210930,True,False,False,2021-09,147560,20210930,
4,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09...,,...,2021-07-26 11:55:45,2021-09-18 12:23:43,20210918,True,False,False,2021-09,147560,20210918,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220885,wbdeb-83317,LegalEntity,SEYDOU IDANI,,,bf,Burkina Faso,,Consultant Guidelines 1.25(a)(i),,...,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,
220886,wbdeb-85420,LegalEntity,TIS CONSULTANTS COMPANY LIMITED,,,th,"151 TEAM BUILDING, MOO 12, NUANCHAN ROAD, KLON...",,Consultant Guidelines 1.25(a)(i),,...,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,
220887,wbdeb-90379,LegalEntity,EASTERN BUILDERS AND ENGINEERS LIMITED,,,ug,"P.O. BOX 9062, KAMPALA, Uganda",,"""Procurement Guidelines, 1.14(a)(i); 1.14(a)(ii)""",,...,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,
220888,wbdeb-96552,LegalEntity,KWAPLAH INTERNATIONAL TRADING CO. INC.,,,us,"425 S.W., MADISON AVENUE, SUITE U, CORVALLIS, ...",,"Procurement Guidelines 1.15(a)(i) & (ii), 1.14...",,...,2021-07-26 11:55:45,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,


# Descriptives and clean up

In [13]:
#the countries are coded as iso2 codes -> transform to iso 3
#keep both countries? could search for "ru" over regex functions
print(res_deduplicated.countries.unique())
print(res_deduplicated.schema.unique())
print(res_deduplicated.listing_date.unique())

['sy' 'es;vn' 'ke;ss' ... 'be;us' 'ch;gb' 'na;sd']
['Organization' 'Company' 'Person' 'LegalEntity' 'Airplane' 'Vessel'
 'PublicBody']
['20210930' '20211109' '20210928' '20210918' '20211229' '20211004'
 '20210922' '20211113' '20211030' '20211220' '20211106' '20211228'
 '20211003' '20211102' '20210927' '20211117' '20211205' '20211019'
 '20211014' '20211005' '20211103' '20210924' '20211023' '20210923'
 '20211022' '20210929' '20211207' '20211216' '20211202' '20211231'
 '20211209' '20210926' '20211208' '20211224' '20211121' '20211118'
 '20211214' '20211112' '20211001' '20211115' '20211110' '20211215'
 '20211201' '20211123' '20211221' '20211230' '20211026' '20211227'
 '20211223' '20211222' '20211125' '20211217' '20211127' '20210823'
 '20211211' '20211028' '20211203' '20211116' '20211027' '20211111'
 '20211126' '20210914' '20210907' '20211128' '20211010' '20210831'
 '20211114' '20211226' '20211012' '20211204' '20210909' '20210916'
 '20211013' '20211104' '20211107' '20211031' '20211213' '2021

In [14]:
#get missingness
is_null = res_deduplicated.isnull().sum()
display(is_null)

id                     0
schema                 0
name                   0
aliases           106889
birth_date         83128
countries          20584
addresses         175777
identifiers       108337
sanctions         147674
phones            212980
emails            203898
dataset                0
last_seen              0
first_seen             0
date_stamp             0
new_entry              0
deletion               0
unchanged              0
month                  0
listing_count          0
listing_date           0
delisting_date    220166
dtype: int64

In [15]:
res_deduplicated.columns

Index(['id', 'schema', 'name', 'aliases', 'birth_date', 'countries',
       'addresses', 'identifiers', 'sanctions', 'phones', 'emails', 'dataset',
       'last_seen', 'first_seen', 'date_stamp', 'new_entry', 'deletion',
       'unchanged', 'month', 'listing_count', 'listing_date',
       'delisting_date'],
      dtype='object')

In [16]:
res_deduplicated['new_listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res_deduplicated['new_listing_count'] = res_deduplicated.groupby('month')['new_entry'].transform('nunique')
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,first_seen,date_stamp,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date,new_listing_count
0,NK-226JTDehVMSXex35EbKPmi,Organization,BENA PROPERTIES,BANNA PROPERTIES;BENA,,sy,"Cham Holding Building, Daraa Highway, Sahnaya ...",,Block - Program - SDN List - Executive Order 1...,,...,2021-09-30 11:39:21,20210930,True,False,False,2021-09,147560,20210930,,1
1,NK-22FBSypYXKBCkX2JoWwgrk,Company,"GRUPO MECÁNICA DEL VUELO SISTEMAS, S.A.U.","Grupo Mecánica del Vuelo Sistemas, S.A.U.;Grup...",,es;vn,"ISAAC NEWTON 11, PARQUE TECNOLÓGICO DE MADRID,...",,CROSS-DEBARMENT: WBG - 2021-03-02 - 2024-09-01...,,...,2021-11-09 03:03:03,20211109,True,False,False,2021-11,134236,20211109,,1
2,NK-22HtK7WrxZ2sU3rmhz6PuZ,Person,Michael Kuajien,"KUAJIAN, Michael;KUAJIEN DUER MAYOK, Michael;K...",1979-01-01,ke;ss,Nairobi;South Sudan,,GLOMAG;Program - Block - SDN List - Executive ...,,...,2021-09-28 03:03:19,20210928,True,False,False,2021-09,147560,20210928,,1
3,NK-22baaNEkbgBeZokNLmpaKd,Person,DZHAMBULAT GALIMOV,Джанбулат Магомедтагирович Галимов,1989-01-05,ru,,,террорист - Письмо Федеральной службы по финан...,,...,2021-09-30 11:39:21,20210930,True,False,False,2021-09,147560,20210930,,1
4,NK-22iRdVwQHXiYMCAASM93qH,Organization,AL RASHID TRUST,AL AMEEN TRUST;AL AMIN TRUST;AL AMIN WELFARE T...,,pk,"Jamia Masjid, Sulaiman Park, Begum Pura, Lahor...",,Block - Program - SDN List - Unknown - 2001-09...,,...,2021-09-18 12:23:43,20210918,True,False,False,2021-09,147560,20210918,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220885,wbdeb-83317,LegalEntity,SEYDOU IDANI,,,bf,Burkina Faso,,Consultant Guidelines 1.25(a)(i),,...,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,,1
220886,wbdeb-85420,LegalEntity,TIS CONSULTANTS COMPANY LIMITED,,,th,"151 TEAM BUILDING, MOO 12, NUANCHAN ROAD, KLON...",,Consultant Guidelines 1.25(a)(i),,...,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,,1
220887,wbdeb-90379,LegalEntity,EASTERN BUILDERS AND ENGINEERS LIMITED,,,ug,"P.O. BOX 9062, KAMPALA, Uganda",,"""Procurement Guidelines, 1.14(a)(i); 1.14(a)(ii)""",,...,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,,1
220888,wbdeb-96552,LegalEntity,KWAPLAH INTERNATIONAL TRADING CO. INC.,,,us,"425 S.W., MADISON AVENUE, SUITE U, CORVALLIS, ...",,"Procurement Guidelines 1.15(a)(i) & (ii), 1.14...",,...,2021-08-23 19:57:27,20210823,True,False,False,2021-08,133396,20210823,,1


In [17]:
#res_deduplicated = res_deduplicated.drop(['last_seen', 'first_seen','new_entry', 'deletion','unchanged'],axis = 1)

In [18]:
res_deduplicated.to_csv("default_batch1.csv")