In [None]:
import pandas as pd 
import numpy as np

# Build the emerging-markets (EM) subset of the global fundamentals file
# and cache it on disk as 'EM_fund.csv'.

fund = pd.read_csv('Glo_Fund.csv', index_col='datadate')

# niq is derived as piq - txtq (presumably pre-tax income less income taxes;
# TODO confirm against the data dictionary of the source file).
fund = fund.assign(niq=fund['piq'] - fund['txtq'])

# Columns carried forward to the rest of the notebook.
fund_columns = ['conm', 'sedol', 'fic', 'curcdq',
                'atq', 'ltq', 'niq', 'oiadpq', 'revtq']

# Codes matched against the `fic` column to decide which rows are EM.
EM_countries = [
    'BRA', 'CHL', 'CHN', 'COL', 'CZE', 'EGY', 'GRC', 'HUN',
    'IND', 'IDN', 'KOR', 'KWT', 'MYS', 'MEX', 'PER', 'PHL',
    'POL', 'QAT', 'SAU', 'ZAF', 'TWN', 'THA', 'TUR', 'ARE',
]

# Row filter (EM only) and column selection in a single step.
fund = fund.loc[fund['fic'].isin(EM_countries), fund_columns]

fund.to_csv('EM_fund.csv', index=True)

# Now working with EM data

In [None]:
# Load the emerging-markets fundamentals produced by the extraction step above.
# The name must match the 'EM_fund.csv' written there exactly: on a
# case-sensitive filesystem 'EM_Fund.csv' would be a different (missing) file.
fund = pd.read_csv('EM_fund.csv', index_col='datadate')

In [None]:
# Parse the index labels into timestamps so `fund` is indexed by a DatetimeIndex.
fund = fund.set_axis(pd.to_datetime(fund.index), axis=0)

In [None]:
# Order the fundamentals by their date index.
fund = fund.sort_index(axis=0)

# Distinct SEDOL values (first occurrence of each is kept), written without
# the index so the file is a plain one-column list.
first_occurrence = ~fund['sedol'].duplicated(keep='first').to_numpy()
sedols = fund['sedol'][first_occurrence]

sedols.to_csv('sedols.csv', index=False)

# Sort out the dates for price data

# Don't save these files, as they are too big; you can get them using the Compustat library and the SEDOL list above
# Daily price file: keep only the identifier, currency, price and share-count columns.
price = pd.read_csv('Daily_EM_price.csv', index_col='datadate')
price = price.loc[:, ['conm', 'sedol', 'curcdd', 'prccd', 'cshoc']]

# Use a DatetimeIndex so month / day / weekday can be read off the index.
price.index = pd.to_datetime(price.index)

# Pre-filter to the 26th-31st of March, June, September and December.
quarter_end_month = price.index.month.isin([3, 6, 9, 12])
late_in_month = price.index.day.isin([26, 27, 28, 29, 30, 31])
price = price[quarter_end_month & late_in_month]

# Store the day of week (Monday=0 ... Sunday=6) and discard Saturday/Sunday rows.
price = price.assign(weekday=price.index.dayofweek)
price = price[price['weekday'] < 5]

# Sort by date, then convert the index to strings so it can be matched
# against a list of date strings.
price = price.sort_index()
price.index = price.index.astype(str)

# Last weekday (Mon-Fri) of every calendar quarter from 2000-Q1 through 2023-Q2,
# as 'YYYY-MM-DD' strings (94 entries).
# The dates are generated from the calendar rather than typed by hand, which
# removes the risk of a typo in a 94-line literal. Exchange holidays are not
# taken into account (for example '2013-03-29' is included).
last_date_of_each_q = (
    pd.date_range(
        start='2000-03-31',
        end='2023-06-30',
        # Business quarter ends falling in March, June, September and December.
        freq=pd.offsets.BQuarterEnd(startingMonth=3),
    )
    .strftime('%Y-%m-%d')
    .tolist()
)

# Keep only the rows whose index label is one of the listed quarter-end dates,
# then save the reduced table.
is_quarter_end = price.index.isin(last_date_of_each_q)
price = price.loc[is_quarter_end]

price.to_csv('Quarterly_EM_price.csv', index=True)

# Now using quarterly price data 

In [None]:
# Load the quarterly emerging-markets price file ('Quarterly_EM_price.csv'),
# using its 'datadate' column as the index.
price = pd.read_csv('Quarterly_EM_price.csv', index_col= 'datadate')

In [None]:
# Keep only the market-data columns used below; every other column is dropped.
price = price.loc[:, ['conm', 'sedol', 'curcdd', 'prccd', 'cshoc']]

In [None]:
# Parse the index labels of `price` into timestamps (DatetimeIndex).
price = price.set_axis(pd.to_datetime(price.index), axis=0)

In [None]:
# For every price date, compute the date 2 months and then 20 days earlier.
# This lagged date is the cut-off used below when picking the fundamentals
# row that goes with each price observation.
two_months_back = price.index - pd.DateOffset(months=2)
price['date'] = two_months_back - pd.DateOffset(days=20)

In [None]:
# Distinct lagged price dates and distinct fundamentals dates, both as lists
# of pandas Timestamps (order of first appearance is kept).
# `drop_duplicates().tolist()` is used for the column because, on pandas < 2.0,
# `Series.unique()` returns a NumPy datetime64[ns] array whose `.tolist()`
# yields integer nanoseconds; those cannot be compared with the Timestamps
# in `fund_dates`.
price_dates = price['date'].drop_duplicates().tolist()
fund_dates = fund.index.unique().tolist()

In [None]:
# Map each lagged price date to the latest fundamentals date that is on or
# before it. Price dates with no such fundamentals date get no entry.

dic = {}

for price_date in price_dates:
    eligible = [fund_date for fund_date in fund_dates if fund_date <= price_date]
    if eligible:
        dic[price_date] = max(eligible)

In [None]:
# Replace each lagged date in `date` with the fundamentals date selected for
# it in `dic`; lagged dates without an entry become missing (NaT).
# A vectorised lookup by column name replaces the per-row positional loop,
# which was slow and only correct while `date` was the last column.
# `pd.to_datetime` keeps the column datetime-typed even if nothing matches.
price['date'] = pd.to_datetime(price['date'].map(dic))

In [None]:
# Build a string join key "<date><company name>" on both frames and use it as
# their index. On the price side the original index (the price date) is first
# preserved in `buy_date`, and the key uses the `date` column.
fund['kdate'] = fund.index.astype(str)
price['buy_date'] = price.index
price['kdate'] = price['date'].astype(str)

fund_key = fund['kdate'] + fund['conm']
price_key = price['kdate'] + price['conm']

fund = fund.set_index(fund_key)
price = price.set_index(price_key)

In [None]:
# Left-join fundamentals onto prices by index (the date + company key).
# Columns present in both frames keep their name for the price side and get
# an 'f' suffix for the fundamentals side.
data = price.merge(
    fund,
    how="left",
    left_index=True,
    right_index=True,
    suffixes=('', 'f'),
)

In [None]:
# Keep the price-side identifiers and market fields, the buy date, and the
# fundamentals items; helper/key columns and suffixed duplicates are dropped.
data = data.loc[:, ['conm', 'sedol', 'curcdd', 'prccd', 'cshoc', 'buy_date',
                    'atq', 'ltq', 'niq', 'oiadpq', 'revtq']]

In [None]:
# When several rows share the same company name and buy date, keep only the
# first of them.
repeated = data.duplicated(subset=['conm', 'buy_date'], keep='first')
data = data[~repeated]

In [None]:
# Drop rows (not columns) in which any of the required fields is missing.
required_fields = ['conm', 'curcdd', 'prccd', 'cshoc', 'buy_date', 'atq', 'ltq']
data = data.dropna(axis=0, how='any', subset=required_fields)

# Currency Exchange

In [None]:
# Read the foreign-exchange table from 'fx.csv', using its 'ANNDATS' column
# as the index. The 'CURR' and 'EXRAT' columns of this table are used below.
fx = pd.read_csv('fx.csv', index_col= 'ANNDATS')

In [None]:
# Parse the index labels of the FX table into timestamps (DatetimeIndex).
fx = fx.set_axis(pd.to_datetime(fx.index), axis=0)

In [None]:
# Copy the FX index (the rate date) into a regular `date` column.
fx = fx.assign(date=fx.index)
# Re-sort the fundamentals frame by its index.
fund = fund.sort_index(axis=0)

In [None]:
# Distinct buy dates and distinct FX dates, both as lists of pandas Timestamps.
# `drop_duplicates().tolist()` is used for the column because, on pandas < 2.0,
# `Series.unique()` returns a NumPy datetime64[ns] array whose `.tolist()`
# yields integer nanoseconds; those cannot be compared with the Timestamps
# in `fx_dates`.
buy_dates = data['buy_date'].drop_duplicates().tolist()
fx_dates = fx.index.unique().tolist()

In [None]:
# Map each buy date to the latest FX date that is on or before it.
# Buy dates with no earlier-or-equal FX date get no entry.
dic2 = {}

for buy_date in buy_dates:
    usable = [fx_date for fx_date in fx_dates if fx_date <= buy_date]
    if usable:
        dic2[buy_date] = max(usable)

In [None]:
# Add a `cur_date` column, initially all missing; it is filled in afterwards.
data = data.assign(cur_date=np.nan)

In [None]:
# Fill `cur_date` with the FX date chosen in `dic2` for each row's buy date;
# rows whose buy date has no entry stay missing (NaT).
# The lookup is done by column name in one vectorised step. The previous loop
# addressed `buy_date` by the positional offset -7, which breaks as soon as a
# column is added or removed, and wrote Timestamps one by one into a float
# column (an incompatible-dtype assignment that newer pandas deprecates).
data['cur_date'] = pd.to_datetime(data['buy_date'].map(dic2))

In [None]:
# Make sure `cur_date` is datetime-typed (missing values become NaT).
data = data.assign(cur_date=pd.to_datetime(data['cur_date']))

In [None]:
# Index both frames by a string key "<date><currency code>" so that they can
# be joined: FX rows use their own date and `CURR`; data rows use the matched
# FX date (`cur_date`) and the price currency (`curcdd`).
fx_key = fx['date'].astype(str) + fx['CURR']
data_key = data['cur_date'].astype(str) + data['curcdd']

fx = fx.set_index(fx_key)
data = data.set_index(data_key)

In [None]:
# Left-join the FX table onto `data` by index (the date + currency key).
# Both suffixes are empty, so the two frames must not share column names.
data = data.merge(
    fx,
    how="left",
    left_index=True,
    right_index=True,
    suffixes=('', ''),
)

In [None]:
# Drop rows for which no exchange rate was found, but keep rows quoted in USD.
# USD rows need no conversion and have their rate set to 1 further down, so
# they must not be discarded here merely because the FX table has no matching
# entry for them.
has_rate = data['EXRAT'].notna()
is_usd = data['curcdd'] == 'USD'
data = data[has_rate | is_usd]

In [None]:
# Index `data` by buy date again; `buy_date` also stays available as a column.
data = data.set_index('buy_date', drop=False)

In [None]:
# Keep the working columns plus the exchange rate; the FX helper columns and
# `cur_date` are dropped.
data = data.loc[:, ['conm', 'sedol', 'curcdd', 'prccd', 'cshoc', 'buy_date',
                    'atq', 'ltq', 'niq', 'oiadpq', 'revtq', 'EXRAT']]

In [None]:
# Rows already quoted in US dollars need no conversion: set their rate to one.
usd_rows = data["curcdd"].eq("USD")
data.loc[usd_rows, "EXRAT"] = 1

In [None]:
# Convert every monetary column to USD by multiplying it by the row's rate.
# Assumes EXRAT is quoted as USD per unit of local currency (TODO confirm).
# NOTE(review): the rate was matched on the price currency `curcdd`; confirm
# the fundamentals columns are reported in that same currency.
for money_col in ['prccd', 'atq', 'ltq', 'niq', 'oiadpq', 'revtq']:
    data[money_col] = data[money_col] * data['EXRAT']

In [None]:
# Final column set: `buy_date` (still the index) and `EXRAT` are no longer
# needed as columns.
data = data.loc[:, ['conm', 'sedol', 'curcdd', 'prccd', 'cshoc',
                    'atq', 'ltq', 'niq', 'oiadpq', 'revtq']]

In [None]:
# Restrict the sample to index dates strictly between 2005-01-01 and 2023-01-01.
# There is no exchange data past December 2022 and dates are missing before
# 2003; the pre-2005 universe is small, so it is dropped as well.
from datetime import datetime

DATE_FORMAT = '%m/%d/%y %H:%M:%S'
date_floor = datetime.strptime('01/01/05 00:00:00', DATE_FORMAT)
date_cap = datetime.strptime('01/01/23 00:00:00', DATE_FORMAT)

in_window = (data.index > date_floor) & (data.index < date_cap)
data = data[in_window]

In [None]:
# Drop every row that still contains a missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
# Write the final dataset, including its date index, to 'EM_full.csv'.
data.to_csv(path_or_buf='EM_full.csv', index=True)