## Class05

In [1]:
import pandas as pd
import numpy as np

### Compustat Global 

#### Read data

In [4]:
# Folder that holds the course data files (absolute, machine-specific path).
file_path = '/Users/ml/Google Drive/af/teaching/database/data/'
# g_secd.txt is tab-separated. low_memory=False parses the file in a single
# pass rather than in chunks, so each column gets one consistently inferred dtype.
gsecd = pd.read_csv(
    file_path + 'g_secd.txt',
    sep='\t',
    low_memory=False,
)
gsecd.head()

Unnamed: 0,gvkey,iid,datadate,ajexdi,cshoc,prccd,trfd,isin,tpci,fic,prirow
0,1491,01W,19960702,1.0,3805100.0,131.93,1.000413,IL0006320183,0,ISR,01W
1,1491,01W,19960703,1.0,3805100.0,133.25,1.000413,IL0006320183,0,ISR,01W
2,1491,01W,19960704,1.0,3805100.0,135.25,1.000413,IL0006320183,0,ISR,01W
3,1491,01W,19960708,1.0,3805100.0,131.87,1.000413,IL0006320183,0,ISR,01W
4,1491,01W,19960709,1.0,3805100.0,128.57,1.000413,IL0006320183,0,ISR,01W


In [5]:
# Row count of the raw security-daily file, before any filtering.
print('Number of obs:', gsecd.shape[0])

Number of obs:  539761


#### Keep common shares
**tpci** = 0 means the issue is a common share (the filter below compares against the string `'0'`).

In [7]:
# Keep the rows whose tpci value is the string '0'; .copy() detaches the
# result from gsecd so it can be modified independently.
gsecd_1 = gsecd.loc[gsecd['tpci'] == '0'].copy()
print('Number of obs:', gsecd_1.shape[0])

Number of obs: 508329


#### Keep primary share
**prirow** holds the issue ID (**iid**) of the firm's primary share, so a row is a primary share when **prirow** equals **iid**.

In [8]:
# A row is kept when its own issue id (iid) matches the id stored in prirow.
gsecd_2 = gsecd_1.loc[gsecd_1['iid'] == gsecd_1['prirow']].copy()
print('Number of obs:', gsecd_2.shape[0])

Number of obs: 469425


#### Remove missing fic

In [9]:
# Drop observations that have no market code (fic).
gsecd_3 = gsecd_2.loc[gsecd_2['fic'].notnull(), :].copy()
print('Number of obs:', gsecd_3.shape[0])

Number of obs: 469198


#### Number of markets

In [10]:
# Count distinct fic codes. dropna=False counts a missing value as its own
# category, exactly as len(unique()) would.
n_mkt = gsecd_3['fic'].nunique(dropna=False)
print('Number of markets:', n_mkt)

Number of markets: 76


#### First and last year by market

In [13]:
# datadate is a YYYYMMDD number; dividing by 10000 and truncating to an
# integer leaves the four-digit year.
year_with_fraction = gsecd_3['datadate'] / 10000
gsecd_3['year'] = year_with_fraction.astype(int)
# Earliest and latest year observed in each market.
gsecd_3.groupby('fic')['year'].agg(['min', 'max']).head()

Unnamed: 0_level_0,min,max
fic,Unnamed: 1_level_1,Unnamed: 2_level_1
ARG,1996,1996
AUS,1996,1996
AUT,1996,1996
BEL,1996,1996
BGR,1996,1996


#### Number of firms by market

In [16]:
# Distinct firms (gvkey) per market (fic), as a two-column frame: fic, n.
n_firm = (
    gsecd_3
    .groupby('fic')['gvkey']
    .nunique()
    .reset_index(name='n')
)
n_firm.head()

Unnamed: 0,fic,n
0,ARG,67
1,AUS,548
2,AUT,88
3,BEL,112
4,BGR,1


#### Keep if number of firms is greater than 50

In [17]:
# Attach each market's firm count (n) to every observation, then keep only the
# markets that have more than 50 firms.
gsecd_4 = gsecd_3.merge(n_firm, how='inner', on='fic')
# .copy() makes gsecd_4 an independent frame rather than a filtered slice, so
# columns assigned to it later (e.g. p_adj) do not raise SettingWithCopyWarning.
# This matches the other filtering steps in this notebook.
gsecd_4 = gsecd_4[gsecd_4['n'] > 50].copy()
print('Number of obs:', len(gsecd_4))

Number of obs: 461017


#### List all markets

In [19]:
# One row per remaining market code, in order of first appearance.
gsecd_4[['fic']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,fic
0,ISR
1,GBR
2,ESP
3,PHL
4,SGP
5,JPN
6,IRL
7,SWE
8,AUS
9,FIN


#### Adjusted price
$$p_{adj} = \frac{prccd}{ajexdi}\times trfd$$

In [21]:
# p_adj = prccd / ajexdi * trfd
price_over_ajexdi = gsecd_4['prccd'] / gsecd_4['ajexdi']
gsecd_4['p_adj'] = price_over_ajexdi * gsecd_4['trfd']
# A zero ajexdi produces +/-inf; replace those values with NaN.
gsecd_4['p_adj'] = np.where(np.isinf(gsecd_4['p_adj']), np.nan, gsecd_4['p_adj'])

### Bloomberg

In [23]:
# Load worksheet 'Sheet3' of the Bloomberg workbook stored under file_path.
bb = pd.read_excel(
    file_path + 'bloomberg_data.xlsx',
    sheet_name='Sheet3',
)
bb.head()

Unnamed: 0,date,GB00B1XZS820 Equity,GB0006731235 Equity,GB00B02J6398 Equity,GB0000536739 Equity,GB0000456144 Equity,GB0002162385 Equity,GB0009895292 Equity,GB0002634946 Equity,GB0031348658 Equity,...,GB0007669376 Equity,GB00B1FH8J72 Equity,GB0008847096 Equity,DE000TUAG000 Equity,GB0008782301 Equity,GB00B10RZP78 Equity,GB00B39J2M42 Equity,GB00BH4HKS39 Equity,JE00B8KF9B49 Equity,GB00B1KJJ408 Equity
0,1995-12-31,,3380,,100.2,589.7778,,7892.1128,11846.692,168826,...,,3689.5,5908,8130.6519,1285.9,23519.7598,3607.0,1719.9761,1881.2,3252.5
1,1996-12-31,,3647,,186.0,675.6567,,8576.5,13250.6925,186002,...,,3919.5,6216,7971.6686,1253.2,25127.4228,5466.3999,2142.4644,1894.8,3534.8999
2,1997-12-31,,3987,,250.5,1073.5072,91481.0,8256.48,12887.36,232429,...,2131.0,4110.6001,6622,7626.6794,1360.4,19674.0,5658.2002,3417.9853,1979.3,3842.3999
3,1998-12-31,,4014,,414.2,1664.9763,105864.0,8977.895,15018.475,219494,...,2817.8,4718.5,7438,8394.1866,1443.1,21520.0,6207.3999,3894.7934,2452.5,4119.8999
4,1999-12-31,26597.0,3703,,543.7,2286.3169,183184.0,19816.0,28173.344,254793,...,4069.7,4718.5,8699,15235.833,1561.2,27858.9658,6637.8999,5443.3167,3234.3999,4082.6001


#### Remove firms without any valid data

In [25]:
# Keep every column whose dtype is not object.
# NOTE(review): presumably firms with no valid data were read as object (text)
# columns, which is why dropping them removes those firms. Confirm against the workbook.
bb_1 = bb.select_dtypes(exclude=['object'])

#### Reshape data from wide to long format

In [26]:
# Wide -> long: one row per (date, security column). Every column after the
# first is treated as a value column, and 'date' identifies the row.
bb_2 = pd.melt(
    bb_1,
    id_vars=['date'],
    value_vars=list(bb_1.columns[1:]),
)
bb_2.head()

Unnamed: 0,date,variable,value
0,1995-12-31,GB00B1XZS820 Equity,
1,1996-12-31,GB00B1XZS820 Equity,
2,1997-12-31,GB00B1XZS820 Equity,
3,1998-12-31,GB00B1XZS820 Equity,
4,1999-12-31,GB00B1XZS820 Equity,26597.0


In [30]:
# Build the tidy panel directly in its final column order (isin, date, asset).
# The melted 'variable' labels look like '<12-char ISIN> Equity', so only the
# first 12 characters are kept as the ISIN.
bb_3 = pd.DataFrame({
    'isin': bb_2['variable'].str[:12],
    'date': bb_2['date'],
    'asset': bb_2['value'],
})
# Sort by security and then date, and renumber the rows from 0.
bb_3 = bb_3.sort_values(['isin', 'date']).reset_index(drop=True)
bb_3.head(20)

Unnamed: 0,isin,date,asset
0,CH0198251305,1995-12-31,
1,CH0198251305,1996-12-31,
2,CH0198251305,1997-12-31,
3,CH0198251305,1998-12-31,
4,CH0198251305,1999-12-31,
5,CH0198251305,2000-12-31,
6,CH0198251305,2001-12-31,
7,CH0198251305,2002-12-31,
8,CH0198251305,2003-12-31,
9,CH0198251305,2004-12-31,
