#  US Consolidated Screening List

# 1. Import Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import urllib.request
import requests
import random
import time
from time import time as timer
import IPython.display
from tqdm.notebook import tqdm
# Disable the warnings
import warnings
warnings.filterwarnings('ignore')

# 2. Set Sources Name

* Correspondent Account or Payable-Through Account Sanctions (CAP),
* Denied Persons List (DPL),
* ITAR Debarred List (DTC),
* Entity List (EL),
* Foreign Sanctions Evaders List (FSE),
* Nonproliferation Sanctions (ISN),
* Palestinian Legislative Council List (PLC),
* Specially Designated Nationals List (SDN),
* Sectoral Sanctions Identifications List (SSI),
* Unverified List (UVL),
* The Military End User List(MEU),
* Sources Details
https://internationaltradeadministration.github.io/developerportal/consolidated-screening-list.html

In [2]:
# Set sources name
all_sources = ['CAP','CMIC','DPL','DTL','EL','FSE','ISN','MBS','MEU','PLC','SDN','SSI','UVL']

# 3. Call ITA API, clean retrieced data and save to local

In [3]:
def Call_ITA_API(sources):
    
    url = "https://data.trade.gov/consolidated_screening_list/v1/search?sources={sources}&offset=0".format(sources=sources)   
    payload={}       
    headers = {'subscription-key': '66a535dc3dd64b1cbc63eaf7b0c1736d'}
    response = requests.request("GET", url, headers=headers, data=payload)
    response = response.json()
    n_total = response['total']
    
    print("This source {name} contains {n} entries.".format(n=n_total,name=sources))
    print("-"*100)
    print("Source info: {source}".format(source=response['sources']))
    print("-"*100)    
    n_page = int(n_total/50)+1
   
    print('There are {n} pages in total.'.format(n=n_page))    
    df_sources = pd.DataFrame()    
    for page in tqdm(range(n_page)):
        time.sleep(2)      
        url = "https://data.trade.gov/consolidated_screening_list/v1/search?sources={sources}&size=50&offset={i}".format(sources=sources,i=page*50)
        payload={}       
        headers = {'subscription-key': '66a535dc3dd64b1cbc63eaf7b0c1736d'}
        response = requests.request("GET", url, headers=headers, data=payload)
        json_data= response.json()
        try:
            result =json_data['results']
        except:
            print('Error: DPL.keys has no results')            
        # iterate len of each result        
        for j in range(len(result)):
            df= pd.DataFrame(result[j].items()).T
            df.rename(columns=df.iloc[0],inplace=True)
            df = df[1:]
            df_sources = df_sources.append(df, ignore_index=True)
    return df_sources  


def Clean_API_Results(dataset):

    df_addresses = pd.DataFrame()      
    try:
        dataset= dataset.drop(['country'],axis=1)           
    except:
        print('Error: This list does not have a country field')
    df_temp = pd.DataFrame(columns=['address','city','state','postal_code','country'])
    for i in range(len(dataset)):
        if dataset.addresses[i]!=[]:
            df_address = pd.DataFrame.from_dict(dataset.addresses[i][0], orient='index').T
            df_temp = df_temp.append(df_address,ignore_index=True)
        else:
            df_temp = df_temp.append({'address':None, 'city':None, 'state':None,'postal_code':None,'country':None},ignore_index=True)

    dataset = pd.concat([dataset,df_temp],axis=1).drop(['addresses'],axis=1)
    df = dataset  
    df["ids"] = df["ids"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))       
    df["alt_names"] = df["alt_names"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))
    df["citizenships"] = df["citizenships"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))
    df["dates_of_birth"] = df["dates_of_birth"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))  
    df["nationalities"] = df["nationalities"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))
    df["places_of_birth"] = df["places_of_birth"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))
    df["programs"] = df["programs"].apply(lambda x: None if x == [] else str(x).replace("[",'').replace("]",''))
    
    return df


def main():    
    df_all_sources = pd.DataFrame()
    for i in all_sources:
        df_source = Call_ITA_API(sources=i)
        df_all_sources = df_all_sources.append(df_source,ignore_index=True)        
        df_all_sources.drop_duplicates(subset='id',inplace=True)  

    IPython.display.clear_output()     
    
    n_duplicates = len(df_all_sources.id.unique()) 
    print('{n} duplicates entries are dropped'.format(n=n_duplicates))
    # clean data
    df_all_cleaned = Clean_API_Results(dataset=df_all_sources)
    df_all_cleaned.to_csv('data/US_Consolidated_Screening_List.csv')
    print("-"*100)
    print('Dataset is saved to local directory, file name is US_Consolidated_Screening_List.csv')
      

# run main function
if __name__ == '__main__':
    start = timer()
    main()
    print("-"*96)
    print(f'Finished with {timer() - start:.2f} secs')

3435 duplicates entries are dropped
----------------------------------------------------------------------------------------------------
Dataset is saved to local directory, file name is US_Consolidated_Screening_List.csv
------------------------------------------------------------------------------------------------
Finished with 945.49 secs
