In [1]:
from newsapi import NewsApiClient
import pandas as pd
from pandas.io.json import json_normalize
import datetime as dt

In [2]:
from retrying import retry

In [73]:
comps = pd.read_csv('100-high-risk-Co.csv')

In [74]:
comps.head()

Unnamed: 0,Name1,Name2
0,Acer,Acer
1,Alibaba,Alibaba
2,AstraZeneca,AstraZeneca
3,Asus,Asus
4,Auchan-Retail,Auchan-Retail


In [75]:
SDN = pd.read_csv('SDN.csv', header = None, names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'])

In [76]:
SDN.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
0,36,AEROCARIBBEAN AIRLINES,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
1,173,"ANGLO-CARIBBEAN CO., LTD.",-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
2,306,BANCO NACIONAL DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,a.k.a. 'BNC'.
3,424,BOUTIQUE LA MAISON,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
4,475,CASA DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-


In [77]:
# only retain useful columns
SDN = SDN[['A','B','D']]
SDN.rename(columns = {'A':'index','B':'name','D':'program'},inplace = True)
SDN.head()

Unnamed: 0,index,name,program
0,36,AEROCARIBBEAN AIRLINES,CUBA
1,173,"ANGLO-CARIBBEAN CO., LTD.",CUBA
2,306,BANCO NACIONAL DE CUBA,CUBA
3,424,BOUTIQUE LA MAISON,CUBA
4,475,CASA DE CUBA,CUBA


In [8]:
# get unique country list
programs = SDN.program.unique().tolist()

In [9]:
# SDNs can be grouped by 179 countries
len(programs)

179

In [10]:
programs

['CUBA',
 'SDGT',
 'SDGT] [SYRIA',
 'IRAQ2',
 'SDNT',
 'IRAN] [SDGT] [IRGC] [IFSR',
 'IRAN] [IRAN-EO13902',
 'IRAN] [SDGT] [IFSR',
 'FTO] [SDGT',
 'FTO] [SDGT] [SYRIA',
 'SDNTK] [FTO] [SDGT',
 'SDNTK',
 'SDNTK] [SDGT',
 'SDNTK] [ILLICIT-DRUGS-EO',
 'ZIMBABWE',
 'BALKANS',
 'DRCONGO',
 'SYRIA] [IRAQ2',
 'SYRIA',
 'NPWMD] [DPRK2',
 'NPWMD',
 'NPWMD] [IFSR] [IRAN-CON-ARMS-EO',
 'NPWMD] [IFSR',
 'DARFUR',
 'SDGT] [NS-PLC',
 'BELARUS',
 'FTO] [SDGT] [NPWMD] [IRGC] [IFSR] [IRAN-HR] [HRIT-IR] [ELECTION-EO13848',
 'SDGT] [NPWMD] [IFSR] [IRAN-CON-ARMS-EO',
 'SDGT] [IFSR',
 'FTO] [SDGT] [SYRIA] [IRGC] [IFSR] [IRAN-HR] [ELECTION-EO13848',
 'SDGT] [NPWMD] [IRGC] [IFSR',
 'SDGT] [SYRIA] [NPWMD] [IRGC] [IFSR',
 'IRAQ3',
 'LEBANON',
 'SYRIA] [LEBANON',
 'SDGT] [IRAQ3] [IRGC',
 'FTO] [SDGT] [SOMALIA',
 'SYRIA] [HRIT-SY',
 'SDGT] [CAR',
 'SDGT] [IRAQ3',
 'SDGT] [IRAQ3] [IRGC] [IFSR',
 'NPWMD] [IFSR] [IRAN-TRA',
 'VENEZUELA',
 'IRAN] [SDGT] [IFSR] [IFCA',
 'IRAN',
 'SDNTK] [TCO] [ILLICIT-DRUGS-EO',
 'FT

In [11]:
newsapi = NewsApiClient(api_key='cannot shown here due to confidentiality')

In [84]:
comps_ls = comps['Name2']

In [87]:
# create list of companies in different sanctions program
dft = pd.DataFrame(SDN.groupby('program')['name'].nunique())
dft.to_csv('Aggregate_Program.csv')

we will only focus on entities in Russia - Ukraine program

In [88]:
SDN_R_U = SDN[SDN.program.str.contains('RUSSIA', na=False)|SDN.program.str.contains('Ukraine', na=False)].iloc[:,1]

In [89]:
#classes
class spco_news: 
    def __init__(self, name, news): 
        self.name = name 
        self.news = news

In [90]:
#functions
def write_to_local(df,iter_num):
    if iter_num == 0:
        df.to_csv('news.csv', index=False)
    else:
        df.to_csv('news.csv', mode='a', index=False, header=False)
    

def fetch_news(comp, sdn):
    key = str('"'+ comp + '"' + 'AND' + '"' + sdn + '"')
    data = newsapi.get_everything(q= key,
                                  language='en',
                                  sort_by='relevancy',
                                  page_size=100,
                                  page=1)
    if data['totalResults'] != 0:
        news_articles = pd.DataFrame(data['articles'])
        news_articles['pub_date'] = news_articles.apply(lambda x: dt.datetime.strptime(x['publishedAt'],"%Y-%m-%dT%H:%M:%SZ").date(), axis = 1)
        news = news_articles.loc[:,['content','pub_date']]
        news['SDN'] = sdn
        return news
    else:
        return pd.DataFrame()
    
def frame_news(ls,comp):
    df_output = pd.concat(ls,ignore_index = True)
    df_output['comp'] = comp
    return df_output
        

In [91]:
for num, comp in enumerate(comps_ls[:2]):
    
    news_list = list()
    
    for sdn in SDN_R_U:
        news = fetch_news(comp, sdn)
        if not news.empty:
            news_list.append(news)
    
    print(comp)
    if bool(news_list):
        df = frame_news(news_list,comp)
        write_to_local(df,num)

Acer
Alibaba


In [92]:
test = pd.read_csv('news.csv')

In [93]:
test.shape

(498, 4)

In [94]:
test.head(5)

Unnamed: 0,content,pub_date,SDN,comp
0,The organised criminal association (REvil) has...,2022-01-20,FEDERAL SECURITY SERVICE,Acer
1,According to Russia’s top agency FSB (Federal ...,2022-01-15,FEDERAL SECURITY SERVICE,Acer
2,"In a surprising twist, the Russian government ...",2022-01-14,FEDERAL SECURITY SERVICE,Acer
3,(Bloomberg) -- Russia detained several members...,2022-01-14,FEDERAL SECURITY SERVICE,Acer
4,Russia detained several members of the notorio...,2022-01-14,FEDERAL SECURITY SERVICE,Acer


In [95]:
def datediff(d1, d2):
    d1 = dt.datetime.strptime(d1, "%Y-%m-%d")
    d2 = dt.datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)

In [96]:
datediff(test['pub_date'][2],test['pub_date'][1])

1

In [97]:
date_p = dt.datetime.now().date()
str_p = str(date_p)
str_p

'2022-03-31'

In [98]:
def function(d1, d2):
    if datediff(d1, d2) <= 30:
        return 'Within 30 days'
    elif 30 < datediff(d1, d2) <= 90:
        return '31-90 days'
    elif 90 < datediff(d1, d2) <= 180:
        return '91-180 days'
    elif 180 < datediff(d1, d2) <= 365:
        return '181-365 days'
    else:
        return 'Over 1 year'

In [99]:
test['Status'] = test.apply(lambda x : function(str_p,x['pub_date']),axis=1)

In [100]:
test.head(5)

Unnamed: 0,content,pub_date,SDN,comp,Status
0,The organised criminal association (REvil) has...,2022-01-20,FEDERAL SECURITY SERVICE,Acer,31-90 days
1,According to Russia’s top agency FSB (Federal ...,2022-01-15,FEDERAL SECURITY SERVICE,Acer,31-90 days
2,"In a surprising twist, the Russian government ...",2022-01-14,FEDERAL SECURITY SERVICE,Acer,31-90 days
3,(Bloomberg) -- Russia detained several members...,2022-01-14,FEDERAL SECURITY SERVICE,Acer,31-90 days
4,Russia detained several members of the notorio...,2022-01-14,FEDERAL SECURITY SERVICE,Acer,31-90 days


In [19]:
test.to_csv('Data_collection_dt.csv')