**This is the simplified Opensantions data in csv format**

In [1]:
#import packages
import requests
import pandas as pd
import numpy as np
import datetime
import os
import re

# Retrieval of the data

In [2]:
#I can access the data via pandas very easily, output already in tabular form
#data = pd.read_csv('https://data.opensanctions.org/datasets/20240121/us_ofac_sdn/targets.simple.csv')
#data

In [3]:
#create urls for all relevant dates
date_list = pd.date_range(start='20230101',end='20231231',freq='D').strftime('%Y%m%d')
date_list

Index(['20230101', '20230102', '20230103', '20230104', '20230105', '20230106',
       '20230107', '20230108', '20230109', '20230110',
       ...
       '20231222', '20231223', '20231224', '20231225', '20231226', '20231227',
       '20231228', '20231229', '20231230', '20231231'],
      dtype='object', length=365)

In [4]:
#get all the urls for all the dates we need the data for
#make each day an individual list so we can compare and match them
websites = []

for i in date_list:
    test = 'https://data.opensanctions.org/datasets/'+(i)+'/default/targets.simple.csv'
    websites.append(test)
print(websites)

['https://data.opensanctions.org/datasets/20230101/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230102/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230103/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230104/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230105/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230106/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230107/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230108/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230109/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230110/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230111/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230112/default/targets.simple.csv', 'https://data.opensanctions.org/datasets/20230113/d

#first check for missing days
for site in websites:
    try:
        response = requests.get(site)
        response.raise_for_status()  # Raises an HTTPError for bad response
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {site}: {e}")
        continue

In [5]:
#now loop over the batches to get all the data and append the dataframes to one another
#this gives us a list of dataframes
#the loop must contain a date identifyer as a new column so each dataframe is marked with the retrieval date that is then needed to match the different days against each other
entities_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    response = requests.get(site)
    if response.status_code != 200:
        continue
    data = pd.read_csv(site, low_memory=False)
    match = re.search(date_pattern, site) #extract the date from the url
    if match:
        date = match.group(1)
        data['date_stamp'] = date #set an individual date stamp for each dataframe matching the url date
        #ata =  data.set_index('date_stamp') #set as an index
    entities_list.append(data)
res = pd.concat(entities_list)  # concatenate list of dataframes

In [6]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,first_seen,last_seen,date_stamp,last_change
0,acf-00040861bc3f593000830d987d09967ef3503ef1,Person,Kolyvanov Egor,Колыванов Егор,1980-11-15,,,,,,,ACF List of bribetakers and warmongers,2022-05-05 15:26:25,2023-01-01 18:12:56,20230101,
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Person,Shipov Sergei Yurievich,Шипов Сергей Юрьевич,1966-04-17,,,,,,,ACF List of bribetakers and warmongers,2022-07-09 18:14:08,2023-01-01 18:12:56,20230101,
2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,Person,Egorov Ivan Mikhailovich,Егоров Иван Михайлович,1961-01-21,,,,,,,ACF List of bribetakers and warmongers,2022-05-05 15:26:25,2023-01-01 18:12:56,20230101,
3,acf-002c208139012c8d93b6298358188d7cadafe648,Person,Goreslavsky Alexey Sergeyevich,Гореславский Алексей Сергеевич,1977-07-13,,,,,,,ACF List of bribetakers and warmongers,2022-09-20 01:05:26,2023-01-01 18:12:56,20230101,
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,Person,Samoilova Natalya Vladimirovna,Самойлова Наталья Владимировна,1987-06-24,,,,,,,ACF List of bribetakers and warmongers,2022-09-20 01:05:26,2023-01-01 18:12:56,20230101,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550942,wbdeb-811412,LegalEntity,MULTI-TECH CONSULT (PTY) LTD.,,,bw,"PLOT 20746, BLOCK 3, INDUSTRIAL, GABORONE, Bot...",,Fraudulent Practices - 2023 - 2027,,,WorldBank Debarred Providers,2023-12-06T15:03:01,2023-12-31T15:03:01,20231231,2023-12-06T15:03:01
550943,wbdeb-811413,LegalEntity,MR. PETER LAMBILEKI,,,bw,"PLOT 20746, BLOCK 3, INDUSTRIAL, GABORONE, Bot...",,Fraudulent Practices - 2023 - 2027,,,WorldBank Debarred Providers,2023-12-06T15:03:01,2023-12-31T15:03:01,20231231,2023-12-06T15:03:01
550944,wbdeb-816302,LegalEntity,LEI SITAO,,,cn,"NO.6 EPANG 1ST ROAD, WEIYANG DISTRICT, XI'AN C...",,Cross Debarment: ADB - 2020 - 2999-12-31,,,WorldBank Debarred Providers,2023-12-22T15:03:01,2023-12-31T15:03:01,20231231,2023-12-22T15:03:01
550945,wbdeb-83317,LegalEntity,SEYDOU IDANI,,,bf,,,Consultant Guidelines 1.25(a)(i) - 2004 - 2999...,,,WorldBank Debarred Providers,2023-04-20T12:18:15,2023-12-31T15:03:01,20231231,2023-04-20T12:18:15


# Match the dates against each other

In [7]:
#now match the different dates against each other and return an indicator per row for a new listing or delisting
# Sort the DataFrame based on date_stamp and id
res.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res.duplicated(subset=['id'], keep='first')
deletions = ~res.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Set the values of new_entry, deletion, and unchanged columns
res['new_entry'] = new_entries
res['deletion'] = deletions
res['unchanged'] = unchanged_rows

# Reset index for the final result
res.reset_index(drop=True, inplace=True)

In [8]:
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,first_seen,last_seen,date_stamp,last_change,new_entry,deletion,unchanged
0,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-13T18:30:01,20231213,2023-12-13T09:22:01,True,False,False
1,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Яв...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-14T18:30:01,20231214,2023-12-13T09:22:01,False,False,True
2,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-15T18:30:01,20231215,2023-12-13T09:22:01,False,False,True
3,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-16T18:30:01,20231216,2023-12-13T09:22:01,False,False,True
4,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Є ...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-17T18:30:01,20231217,2023-12-13T09:22:01,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128036606,zafic-93,Person,MEHREZ BEN MAHMOUD BEN SASSI AL-AMDOUNI,,1969-12-18,it;tn,Italy,"Passport, G737411","""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036607,zafic-94,Person,ABU UBAYDAH YUSUF AL-ANABI,,1969-02-07,dz,Algeria,,"""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036608,zafic-96,Person,MOHAMED BEN BELGACEM BEN ABDALLAH AL-AOUADI,,1974-12-11,tn,"50th Street, Number 23, Zehrouni, Tunis",,"""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036609,zafic-97,Person,IBRAHIM HASSAN TALI AL-ASIRI,,1982-04-18,sa;ye,Yemen,"National Identification Number, 1028745097, Pa...","""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False


In [9]:
#test thematching function
test = res.query('new_entry == True')
test

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,emails,dataset,first_seen,last_seen,date_stamp,last_change,new_entry,deletion,unchanged
0,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-13T18:30:01,20231213,2023-12-13T09:22:01,True,False,False
19,NK-224TRezPqwzhQZ37exWxtX,Person,SANAVBARI NIKITENKO,,1992-06-28,ru,,,,,,INTERPOL Red Notices,2023-05-06T06:10:15,2023-05-31T20:36:56,20230531,,True,False,False
234,NK-226GXBdQ5p6NjgrTpTQNVW,Company,"Открытое акционерное общество ""Электростальски...","""Open Joint-Stock Company """"Elektrostal Chemic...",,ru,"144001, Russian, Federation, Moscow region, м....",1025007108390;5053002307;505301001,"""ВАТ """"ЕХМЗ ім. М.Д. Зелінського"""" розробляє т...",,,"Russian National Settlement Depository (NSD, I...",2022-01-01T00:00:00,2023-12-08T18:30:01,20231208,2023-12-08T12:30:01,True,False,False
258,NK-228ZdYZVXaZBSBgVwapnks,Company,"Private enterprise ""Master-SG""","""ПП """"МАГІСТАР-СГ"""""";""ПРИВАТНЕ ПІДПРИЄМСТВО """"...",,ua,"79034, Lviv region, Lviv city, str. Navrotskoh...",42206417,"""1) блокування активів – тимчасове обмеження п...",,,Ukraine Consolidated State Registry;Ukraine NA...,2022-04-06 08:09:32,2023-01-01 18:14:35,20230101,,True,False,False
622,NK-228jBYSTdUSvbZvsKsiHh6,Organization,"Joint-stock company ""Electroagregat""","""Акционерное общество """"Электроагрегат"""""";""Акц...",,ru,"305022, Russian Federation, Kursk region, Kurs...",1024600965531;4631005223,192/2023 - valid - 2023-04-01 - 2033-04-01;act...,,,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-04-02 12:17:08,2023-04-12 18:15:02,20230412,,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128036606,zafic-93,Person,MEHREZ BEN MAHMOUD BEN SASSI AL-AMDOUNI,,1969-12-18,it;tn,Italy,"Passport, G737411","""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036607,zafic-94,Person,ABU UBAYDAH YUSUF AL-ANABI,,1969-02-07,dz,Algeria,,"""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036608,zafic-96,Person,MOHAMED BEN BELGACEM BEN ABDALLAH AL-AOUADI,,1974-12-11,tn,"50th Street, Number 23, Zehrouni, Tunis",,"""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False
128036609,zafic-97,Person,IBRAHIM HASSAN TALI AL-ASIRI,,1982-04-18,sa;ye,Yemen,"National Identification Number, 1028745097, Pa...","""""",,,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False


In [10]:
# Add a 'month' column
res['month'] = pd.to_datetime(res['date_stamp']).dt.to_period('M')
#get a count of all listings per month
res['listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res['listing_count'] = res.groupby('month')['id'].transform('nunique')
res

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,dataset,first_seen,last_seen,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count
0,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,...,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-13T18:30:01,20231213,2023-12-13T09:22:01,True,False,False,2023-12,563325
1,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Яв...",,...,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-14T18:30:01,20231214,2023-12-13T09:22:01,False,False,True,2023-12,563325
2,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,...,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-15T18:30:01,20231215,2023-12-13T09:22:01,False,False,True,2023-12,563325
3,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,...,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-16T18:30:01,20231216,2023-12-13T09:22:01,False,False,True,2023-12,563325
4,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Є ...",,...,Ukraine NABC Sanctions Tracker;Ukraine Nationa...,2023-12-08T12:30:01,2023-12-17T18:30:01,20231217,2023-12-13T09:22:01,False,False,True,2023-12,563325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128036606,zafic-93,Person,MEHREZ BEN MAHMOUD BEN SASSI AL-AMDOUNI,,1969-12-18,it;tn,Italy,"Passport, G737411","""""",,...,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302
128036607,zafic-94,Person,ABU UBAYDAH YUSUF AL-ANABI,,1969-02-07,dz,Algeria,,"""""",,...,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302
128036608,zafic-96,Person,MOHAMED BEN BELGACEM BEN ABDALLAH AL-AOUADI,,1974-12-11,tn,"50th Street, Number 23, Zehrouni, Tunis",,"""""",,...,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302
128036609,zafic-97,Person,IBRAHIM HASSAN TALI AL-ASIRI,,1982-04-18,sa;ye,Yemen,"National Identification Number, 1028745097, Pa...","""""",,...,South African Targeted Financial Sanctions,2023-02-06 12:11:45,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302


# Deduplicate

In [11]:
res_deduplicated = res.copy()
res_deduplicated.sort_values(by=['id', 'date_stamp'], inplace=True)

# Identify new entries and deletions based on the index and columns you want to compare
new_entries = ~res_deduplicated.duplicated(subset=['id'], keep='first')
deletions = ~res_deduplicated.duplicated(subset=['id'], keep='last')

# Identify unchanged rows by checking for duplicates based on the id and date_stamp
unchanged_rows = ~new_entries & ~deletions

# Create new columns for first seen and last seen dates
res_deduplicated['listing_date'] = res_deduplicated['date_stamp'].where(new_entries)
res_deduplicated['delisting_date'] = res_deduplicated['date_stamp'].where(deletions)

# Deduplicate the entries (keep the first occurrence for each entity)
res_deduplicated = res_deduplicated.drop_duplicates(subset=['id'], keep='first')

# Reset index for the final result
res_deduplicated.reset_index(drop=True, inplace=True)

# Print or further analyze the deduplicated DataFrame 'res_deduplicated'
print(res_deduplicated)

                               id        schema  \
0       NK-223yQP6hRaMuiALDCJ6xbY  Organization   
1       NK-224TRezPqwzhQZ37exWxtX        Person   
2       NK-226GXBdQ5p6NjgrTpTQNVW       Company   
3       NK-228ZdYZVXaZBSBgVwapnks       Company   
4       NK-228jBYSTdUSvbZvsKsiHh6  Organization   
...                           ...           ...   
662837                   zafic-93        Person   
662838                   zafic-94        Person   
662839                   zafic-96        Person   
662840                   zafic-97        Person   
662841                   zafic-98        Person   

                                                     name  \
0              Limited Liability Company "Zelinsky Group"   
1                                     SANAVBARI NIKITENKO   
2       Открытое акционерное общество "Электростальски...   
3                          Private enterprise "Master-SG"   
4                    Joint-stock company "Electroagregat"   
...                  

In [12]:
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,last_seen,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date
0,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,...,2023-12-13T18:30:01,20231213,2023-12-13T09:22:01,True,False,False,2023-12,563325,20231213,
1,NK-224TRezPqwzhQZ37exWxtX,Person,SANAVBARI NIKITENKO,,1992-06-28,ru,,,,,...,2023-05-31T20:36:56,20230531,,True,False,False,2023-05,373925,20230531,
2,NK-226GXBdQ5p6NjgrTpTQNVW,Company,"Открытое акционерное общество ""Электростальски...","""Open Joint-Stock Company """"Elektrostal Chemic...",,ru,"144001, Russian, Federation, Moscow region, м....",1025007108390;5053002307;505301001,"""ВАТ """"ЕХМЗ ім. М.Д. Зелінського"""" розробляє т...",,...,2023-12-08T18:30:01,20231208,2023-12-08T12:30:01,True,False,False,2023-12,563325,20231208,
3,NK-228ZdYZVXaZBSBgVwapnks,Company,"Private enterprise ""Master-SG""","""ПП """"МАГІСТАР-СГ"""""";""ПРИВАТНЕ ПІДПРИЄМСТВО """"...",,ua,"79034, Lviv region, Lviv city, str. Navrotskoh...",42206417,"""1) блокування активів – тимчасове обмеження п...",,...,2023-01-01 18:14:35,20230101,,True,False,False,2023-01,306805,20230101,
4,NK-228jBYSTdUSvbZvsKsiHh6,Organization,"Joint-stock company ""Electroagregat""","""Акционерное общество """"Электроагрегат"""""";""Акц...",,ru,"305022, Russian Federation, Kursk region, Kurs...",1024600965531;4631005223,192/2023 - valid - 2023-04-01 - 2033-04-01;act...,,...,2023-04-12 18:15:02,20230412,,True,False,False,2023-04,277039,20230412,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662837,zafic-93,Person,MEHREZ BEN MAHMOUD BEN SASSI AL-AMDOUNI,,1969-12-18,it;tn,Italy,"Passport, G737411","""""",,...,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302,20230206,20230206
662838,zafic-94,Person,ABU UBAYDAH YUSUF AL-ANABI,,1969-02-07,dz,Algeria,,"""""",,...,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302,20230206,20230206
662839,zafic-96,Person,MOHAMED BEN BELGACEM BEN ABDALLAH AL-AOUADI,,1974-12-11,tn,"50th Street, Number 23, Zehrouni, Tunis",,"""""",,...,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302,20230206,20230206
662840,zafic-97,Person,IBRAHIM HASSAN TALI AL-ASIRI,,1982-04-18,sa;ye,Yemen,"National Identification Number, 1028745097, Pa...","""""",,...,2023-02-06 18:23:11,20230206,,True,True,False,2023-02,281302,20230206,20230206


# Descriptives and clean up

In [13]:
#the countries are coded as iso2 codes -> transform to iso 3
#keep both countries? could search for "ru" over regex functions
print(res_deduplicated.countries.unique())
print(res_deduplicated.schema.unique())
print(res_deduplicated.listing_date.unique())

['ru' 'ua' 'es;vn' ... 'es;no' 'um' 'sa;us']
['Organization' 'Person' 'Company' 'LegalEntity' 'Airplane' 'Vessel'
 'PublicBody' 'Security' 'CryptoWallet']
['20231213' '20230531' '20231208' '20230101' '20230412' '20230115'
 '20230226' '20230323' '20230721' '20230526' '20230912' '20230605'
 '20230703' '20230201' '20230314' '20230530' '20231206' '20230320'
 '20230628' '20230830' '20230722' '20230916' '20231121' '20231027'
 '20230515' '20231106' '20231223' '20231214' '20230608' '20230319'
 '20230427' '20230522' '20230921' '20230915' '20230418' '20231019'
 '20230712' '20230904' '20230302' '20230501' '20230502' '20230205'
 '20230807' '20230523' '20230504' '20230216' '20230817' '20230715'
 '20230518' '20230223' '20231020' '20230713' '20230215' '20231120'
 '20230309' '20230120' '20230331' '20230705' '20230520' '20230228'
 '20230406' '20231026' '20231212' '20230318' '20230403' '20231123'
 '20230422' '20230704' '20230124' '20230413' '20230102' '20231128'
 '20230301' '20230710' '20230423' '202304

In [14]:
#get missingness
is_null = res_deduplicated.isnull().sum()
display(is_null)

id                     0
schema                 0
name                   0
aliases           424924
birth_date        423980
countries          47191
addresses         509073
identifiers       266132
sanctions         476261
phones            656171
emails            651815
dataset                0
first_seen             1
last_seen              1
date_stamp             0
last_change       450254
new_entry              0
deletion               0
unchanged              0
month                  0
listing_count          0
listing_date           0
delisting_date    653282
dtype: int64

In [15]:
res_deduplicated.columns

Index(['id', 'schema', 'name', 'aliases', 'birth_date', 'countries',
       'addresses', 'identifiers', 'sanctions', 'phones', 'emails', 'dataset',
       'first_seen', 'last_seen', 'date_stamp', 'last_change', 'new_entry',
       'deletion', 'unchanged', 'month', 'listing_count', 'listing_date',
       'delisting_date'],
      dtype='object')

In [16]:
res_deduplicated['new_listing_count'] = 0  # Initialize the 'listing_count' column with zeros
res_deduplicated['new_listing_count'] = res_deduplicated.groupby('month')['new_entry'].transform('nunique')
res_deduplicated

Unnamed: 0,id,schema,name,aliases,birth_date,countries,addresses,identifiers,sanctions,phones,...,date_stamp,last_change,new_entry,deletion,unchanged,month,listing_count,listing_date,delisting_date,new_listing_count
0,NK-223yQP6hRaMuiALDCJ6xbY,Organization,"Limited Liability Company ""Zelinsky Group""","""Общество с ограниченной ответственностью """"Зе...",,ru,"115054, Russian Federation, Moscow, 57 Dubinin...",1187746408761;7725491052,"813/2023 - valid - 2023-12-07 - 2033-12-07;""Th...",,...,20231213,2023-12-13T09:22:01,True,False,False,2023-12,563325,20231213,,1
1,NK-224TRezPqwzhQZ37exWxtX,Person,SANAVBARI NIKITENKO,,1992-06-28,ru,,,,,...,20230531,,True,False,False,2023-05,373925,20230531,,1
2,NK-226GXBdQ5p6NjgrTpTQNVW,Company,"Открытое акционерное общество ""Электростальски...","""Open Joint-Stock Company """"Elektrostal Chemic...",,ru,"144001, Russian, Federation, Moscow region, м....",1025007108390;5053002307;505301001,"""ВАТ """"ЕХМЗ ім. М.Д. Зелінського"""" розробляє т...",,...,20231208,2023-12-08T12:30:01,True,False,False,2023-12,563325,20231208,,1
3,NK-228ZdYZVXaZBSBgVwapnks,Company,"Private enterprise ""Master-SG""","""ПП """"МАГІСТАР-СГ"""""";""ПРИВАТНЕ ПІДПРИЄМСТВО """"...",,ua,"79034, Lviv region, Lviv city, str. Navrotskoh...",42206417,"""1) блокування активів – тимчасове обмеження п...",,...,20230101,,True,False,False,2023-01,306805,20230101,,1
4,NK-228jBYSTdUSvbZvsKsiHh6,Organization,"Joint-stock company ""Electroagregat""","""Акционерное общество """"Электроагрегат"""""";""Акц...",,ru,"305022, Russian Federation, Kursk region, Kurs...",1024600965531;4631005223,192/2023 - valid - 2023-04-01 - 2033-04-01;act...,,...,20230412,,True,False,False,2023-04,277039,20230412,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662837,zafic-93,Person,MEHREZ BEN MAHMOUD BEN SASSI AL-AMDOUNI,,1969-12-18,it;tn,Italy,"Passport, G737411","""""",,...,20230206,,True,True,False,2023-02,281302,20230206,20230206,1
662838,zafic-94,Person,ABU UBAYDAH YUSUF AL-ANABI,,1969-02-07,dz,Algeria,,"""""",,...,20230206,,True,True,False,2023-02,281302,20230206,20230206,1
662839,zafic-96,Person,MOHAMED BEN BELGACEM BEN ABDALLAH AL-AOUADI,,1974-12-11,tn,"50th Street, Number 23, Zehrouni, Tunis",,"""""",,...,20230206,,True,True,False,2023-02,281302,20230206,20230206,1
662840,zafic-97,Person,IBRAHIM HASSAN TALI AL-ASIRI,,1982-04-18,sa;ye,Yemen,"National Identification Number, 1028745097, Pa...","""""",,...,20230206,,True,True,False,2023-02,281302,20230206,20230206,1


In [17]:
#res_deduplicated = res_deduplicated.drop(['last_seen', 'first_seen','new_entry', 'deletion','unchanged'],axis = 1)

In [18]:
res_deduplicated.to_csv("default_batch3.csv")