**This is the simplified OpenSanctions data in CSV format**

In [1]:
#import packages
import datetime
import io
import os
import re

import numpy as np
import pandas as pd
import requests

# Retrieval of the data

In [2]:
#I can access the data via pandas very easily, output already in tabular form
#data = pd.read_csv('https://data.opensanctions.org/datasets/20240121/us_ofac_sdn/targets.simple.csv')
#data

In [3]:
# Build one 'YYYYMMDD' label per calendar day of the observation window;
# these labels are later spliced into the snapshot URLs.
daily_range = pd.date_range(start='20210820', end='20231231', freq='D')
date_list = daily_range.strftime('%Y%m%d')
date_list

Index(['20210820', '20210821', '20210822', '20210823', '20210824', '20210825',
       '20210826', '20210827', '20210828', '20210829',
       ...
       '20231222', '20231223', '20231224', '20231225', '20231226', '20231227',
       '20231228', '20231229', '20231230', '20231231'],
      dtype='object', length=864)

In [4]:
# Build the snapshot URL for every date label we need data for.
# There is one URL per day so the daily snapshots can later be compared and matched.
websites = [
    'https://data.opensanctions.org/datasets/' + day + '/us_ofac_cons/targets.simple.csv'
    for day in date_list
]
print(websites)

['https://data.opensanctions.org/datasets/20210820/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210821/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210822/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210823/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210824/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210825/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210826/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210827/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210828/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210829/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210830/us_ofac_cons/targets.simple.csv', 'https://data.opensanctions.org/datasets/20210831/us_ofac_cons/targets.simp

# First check for missing days: report every snapshot URL that cannot be retrieved.
# stream=True defers downloading the response body, so the CSV files are not fetched
# just to test availability; the context manager releases the connection again.
# The timeout keeps a single stalled connection from blocking the whole loop.
for site in websites:
    try:
        with requests.get(site, stream=True, timeout=30) as response:
            response.raise_for_status()  # raises an HTTPError for a 4xx/5xx response
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {site}: {e}")

In [None]:
# Download every daily snapshot, collect the results as a list of DataFrames and
# concatenate that list into one long DataFrame `res`.
# Each DataFrame gets a 'date_stamp' column holding the retrieval date taken from its URL;
# the stamp is needed later to match the different days against each other.
entities_list = []
date_pattern = r'/datasets/(\d{8})/'
for site in websites:
    try:
        response = requests.get(site, timeout=60)
    except requests.exceptions.RequestException as e:
        # report and skip: one failed request must not abort the remaining downloads
        print(f"Error accessing {site}: {e}")
        continue
    if response.status_code != 200:
        continue  # no usable snapshot for this day
    # Parse the body that was just downloaded instead of fetching the URL a second time.
    # The raw bytes are handed over so pandas applies its own default decoding.
    data = pd.read_csv(io.BytesIO(response.content))
    match = re.search(date_pattern, site)  # extract the date from the url
    if match:
        # individual date stamp for each dataframe, matching the url date
        data['date_stamp'] = match.group(1)
    entities_list.append(data)

if not entities_list:
    raise ValueError("No snapshot could be downloaded; there is nothing to concatenate")
res = pd.concat(entities_list)  # concatenate list of dataframes

# Match the dates against each other

In [None]:
# Inspect the concatenated daily snapshots
res

In [None]:
# Match the different dates against each other and give every row an indicator for a
# new listing, a delisting, or neither.

# Sort by id and date_stamp so the first/last row of each id is its first/last snapshot
res.sort_values(by=['id', 'date_stamp'], inplace=True)

# New entry: the first snapshot in which an id appears.
# NOTE(review): ids already present in the very first snapshot are flagged as new as well,
# because nothing earlier is available to compare against.
new_entries = ~res.duplicated(subset=['id'], keep='first')

# Deletion: the last snapshot in which an id appears, but only if the id is missing
# from later snapshots. An id that is still present in the most recent snapshot has not
# been delisted, so its last row must not be flagged.
latest_snapshot = res['date_stamp'].max()
last_seen_rows = ~res.duplicated(subset=['id'], keep='last')
deletions = last_seen_rows & (res['date_stamp'] != latest_snapshot)

# Unchanged: every row that is neither a new entry nor a deletion
unchanged_rows = ~new_entries & ~deletions

# Set the values of new_entry, deletion, and unchanged columns
res['new_entry'] = new_entries
res['deletion'] = deletions
res['unchanged'] = unchanged_rows

# Reset index for the final result
res.reset_index(drop=True, inplace=True)

In [None]:
# Inspect res again, now sorted by id and date_stamp and carrying the new_entry / deletion / unchanged flags
res

In [None]:
# Test the matching step: keep only the rows flagged as a new entry
test = res.loc[res['new_entry']]
test

# Deduplicate

In [None]:
# Collapse the long table to one row per entity, recording the date the entity was
# first seen (listing) and the date it was last seen before leaving the list (delisting).
res_deduplicated = res.copy()
res_deduplicated.sort_values(by=['id', 'date_stamp'], inplace=True)

# First and last snapshot date per id, broadcast to every row of that id.
# They have to be on every row: the deduplication below keeps only the FIRST row per id,
# so a value stored only on an id's last row would be thrown away with that row.
dates_by_id = res_deduplicated.groupby('id')['date_stamp']
first_seen_date = dates_by_id.transform('min')
last_seen_date = dates_by_id.transform('max')

# An id that is still present in the most recent snapshot has not been delisted,
# so its delisting_date stays empty.
latest_snapshot = res_deduplicated['date_stamp'].max()

# Create new columns for first seen and last seen dates
# NOTE(review): for ids already present in the first snapshot, listing_date is only the
# start of the observation window, not necessarily the true listing date.
res_deduplicated['listing_date'] = first_seen_date
res_deduplicated['delisting_date'] = last_seen_date.where(last_seen_date != latest_snapshot)

# Add a 'month' column (month of each row's date_stamp; after the deduplication below
# this is the month of the first appearance)
res_deduplicated['month'] = pd.to_datetime(res_deduplicated['date_stamp']).dt.to_period('M')

# Deduplicate the entries (keep the first occurrence for each entity)
res_deduplicated = res_deduplicated.drop_duplicates(subset=['id'], keep='first')

# Reset index for the final result
res_deduplicated.reset_index(drop=True, inplace=True)

# Print or further analyze the deduplicated DataFrame 'res_deduplicated'
print(res_deduplicated)

In [None]:
# Display the deduplicated table
res_deduplicated

# Descriptives and clean up

In [None]:
# The countries are coded as ISO2 codes -> transform to ISO3
# Keep both countries? Could search for "ru" via regex functions
# Print the distinct values of the columns of interest, one array per column
for column in ('countries', 'schema', 'listing_date'):
    print(res_deduplicated[column].unique())

In [None]:
# Collect the distinct values of the 'sanctions' column in a plain Python list.
# list(...) must be CALLED here: the subscript form list[...] does not convert anything,
# it only wraps the numpy array, whose printed form is abbreviated with "...".
# A Python list prints every element.
list_regimes = list(res_deduplicated.sanctions.unique())
print(list_regimes)

In [None]:
# Missingness: number of null entries in every column
is_null = res_deduplicated.isna().sum()
display(is_null)

In [None]:
# List the column names of the deduplicated table
res_deduplicated.columns

In [None]:
# Remove the first_seen / last_seen columns and the per-row new_entry / deletion /
# unchanged flags from the deduplicated table
columns_to_drop = ['last_seen', 'first_seen', 'new_entry', 'deletion', 'unchanged']
res_deduplicated = res_deduplicated.drop(columns=columns_to_drop)

In [None]:
# Write the deduplicated table to a CSV file in the current working directory
# (with the default settings the DataFrame index is written as the first column).
# NOTE(review): the file name says "us_bis" while the URLs built earlier in this notebook
# request the us_ofac_cons dataset — confirm that the name is intended.
res_deduplicated.to_csv("us_bis_batch5.csv")