In [1]:
## imports
import pandas as pd
import numpy as np
import os
from eda import insufficient_but_starting_eda
import seaborn as sns

### Here is a breakdown of what each column in the customer-supplier data (`inputs/cust_supply_2019_2022.csv`, loaded below as `comp`) represents:

- gvkey: a unique identifier for the company (Global Company Key)
- cid: a unique identifier for the customer
- cnms: customer name
- ctype: customer type
- gareac: geographic area code
- gareat: geographic area type
- salecs: sales in current period (in millions)
- sid: segment identifier
- stype: segment type
- srcdate: source date
- conm: company name
- tic: stock ticker symbol
- cusip: CUSIP number, a unique identifier for a security
- cik: SEC Central Index Key, a unique identifier for a company
- sic: Standard Industrial Classification code, a numerical code used to classify industries



In [2]:
## Download the S&P 500 constituents table from Wikipedia once and cache it as a CSV.
## NOTE(review): the file name says 2022, but the download captures whatever the page
## lists on the day the cache is first created -- confirm that this is intended.
# Make sure the cache folder exists before anything is written to it.
os.makedirs("inputs", exist_ok=True)
sp500_file = 'inputs/sp500_2022.csv'

# Only hit the network when no cached copy is present.
if not os.path.exists(sp500_file):
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    # read_html returns a list with every table found on the page; the first one is saved.
    pd.read_html(url)[0].to_csv(sp500_file,index=False)

In [3]:
# Load the customer-supplier extract (columns are described in the breakdown above).
# The bare `comp` on the last line makes the notebook display the frame.
comp = pd.read_csv('inputs/cust_supply_2019_2022.csv')
comp

Unnamed: 0,gvkey,cid,cnms,ctype,gareac,gareat,salecs,sid,stype,srcdate,conm,tic,cusip,cik,sic
0,1004,31,All Other,MARKET,,,,0,,2019-05-31,AAR CORP,AIR,000361105,1750.0,5080
1,1004,18,U.S. Government,GOVDOM,USA,ISO,455.900,20,BUSSEG,2019-05-31,AAR CORP,AIR,000361105,1750.0,5080
2,1004,26,U.S. Government,GOVDOM,USA,ISO,90.300,22,BUSSEG,2019-05-31,AAR CORP,AIR,000361105,1750.0,5080
3,1004,36,Europe/Africa,GEOREG,EUROPE,REG,5.800,22,BUSSEG,2019-05-31,AAR CORP,AIR,000361105,1750.0,5080
4,1004,34,Other,GEOREG,OTHER,REG,170.400,20,BUSSEG,2019-05-31,AAR CORP,AIR,000361105,1750.0,5080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77896,350681,1,Large Corporate Clients,MARKET,,,76.814,1,BUSSEG,2021-12-31,GETNET ADQUIRENCIA E,GET,37428A103,1867325.0,7374
77897,353444,4,Rest of the World,GEOREG,R_WORLD,REG,7148.294,1,BUSSEG,2021-12-31,HALEON PLC,HLN,405552100,,2834
77898,353444,3,China,GEOREG,CHN,ISO,1084.634,1,BUSSEG,2021-12-31,HALEON PLC,HLN,405552100,,2834
77899,353444,2,US,GEOREG,USA,ISO,4249.166,1,BUSSEG,2021-12-31,HALEON PLC,HLN,405552100,,2834


In [4]:
# Load the S&P 500 constituents list; this is the same path the download cell
# stores in `sp500_file`, so the file exists once that cell has run.
sp500 = pd.read_csv('inputs/sp500_2022.csv')
sp500

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


# EDA

## SP500 Data

In [5]:
# Numeric summary, transposed so each variable is one row. CIK is the only numeric
# column here and it is an identifier, so its mean/std are not meaningful magnitudes.
sp500.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CIK,503.0,796191.729622,553471.843765,1800.0,99119.0,885725.0,1139327.0,1932393.0


In [6]:
# Project helper imported from the local `eda` module in the imports cell.
# Its printed output begins with the head and tail of the frame; see eda.py for
# everything else it reports.
insufficient_but_starting_eda(sp500)


  Symbol     Security             GICS Sector               GICS Sub-Industry  \
0    MMM           3M             Industrials        Industrial Conglomerates   
1    AOS  A. O. Smith             Industrials               Building Products   
2    ABT       Abbott             Health Care           Health Care Equipment   
3   ABBV       AbbVie             Health Care                 Pharmaceuticals   
4    ACN    Accenture  Information Technology  IT Consulting & Other Services   

     Headquarters Location  Date added      CIK      Founded  
0    Saint Paul, Minnesota  1957-03-04    66740         1902  
1     Milwaukee, Wisconsin  2017-07-26    91142         1916  
2  North Chicago, Illinois  1957-03-04     1800         1888  
3  North Chicago, Illinois  2012-12-31  1551152  2013 (1888)  
4          Dublin, Ireland  2011-07-06  1467373         1989   
---
    Symbol              Security             GICS Sector  \
498    YUM           Yum! Brands  Consumer Discretionary   
499   ZBRA

## Compustat Data

In [7]:
# describe() summarizes every numeric column, floats as well as integers.
# Only salecs is a real magnitude; gvkey, cid, sid, cik and sic are identifier or
# classification codes, so their mean/std should not be interpreted.
comp.describe()

Unnamed: 0,gvkey,cid,salecs,sid,cik,sic
count,77901.0,77901.0,68264.0,77901.0,77419.0,77901.0
mean,65404.898576,27.709336,876.577773,4.974634,1043317.0,4681.055211
std,72289.710392,28.479348,4733.002892,7.997586,553630.3,1997.68622
min,1004.0,1.0,-3464.0,0.0,1750.0,100.0
25%,13189.0,8.0,10.4,1.0,806517.0,3310.0
50%,30571.0,19.0,75.8225,1.0,1076682.0,3841.0
75%,115044.0,38.0,411.1425,6.0,1511337.0,6798.0
max,353444.0,261.0,278969.09,99.0,1962738.0,9997.0


In [8]:
# Same project helper as for sp500, now with a second argument: the names of the
# text/date columns of comp.
# NOTE(review): presumably the helper treats these as categorical variables -- confirm in eda.py.
insufficient_but_starting_eda(comp, ['cnms', 'ctype', 'gareac', 'gareat', 
                            'stype', 'srcdate', 'conm', 'tic', 'cusip'])


   gvkey  cid             cnms   ctype  gareac gareat  salecs  sid   stype  \
0   1004   31        All Other  MARKET     NaN    NaN     NaN    0     NaN   
1   1004   18  U.S. Government  GOVDOM     USA    ISO   455.9   20  BUSSEG   
2   1004   26  U.S. Government  GOVDOM     USA    ISO    90.3   22  BUSSEG   
3   1004   36    Europe/Africa  GEOREG  EUROPE    REG     5.8   22  BUSSEG   
4   1004   34            Other  GEOREG   OTHER    REG   170.4   20  BUSSEG   

      srcdate      conm  tic      cusip     cik   sic  
0  2019-05-31  AAR CORP  AIR  000361105  1750.0  5080  
1  2019-05-31  AAR CORP  AIR  000361105  1750.0  5080  
2  2019-05-31  AAR CORP  AIR  000361105  1750.0  5080  
3  2019-05-31  AAR CORP  AIR  000361105  1750.0  5080  
4  2019-05-31  AAR CORP  AIR  000361105  1750.0  5080   
---
        gvkey  cid                     cnms   ctype   gareac gareat    salecs  \
77896  350681    1  Large Corporate Clients  MARKET      NaN    NaN    76.814   
77897  353444    4        Re

In [9]:
## Percent of missing values per column of comp, largest first
ccm = comp
(
    ccm.isna()                       # True wherever a value is missing
    .sum(axis=0)                     # number of missing values in each column
    .div(len(ccm))                   # count -> fraction of all rows
    .mul(100)                        # fraction -> percent
    .sort_values(ascending=False)    # worst columns on top
    .head(13)                        # keep the first 13 entries
    .to_frame(name='% missing')      # the result so far is a Series; .style needs a DataFrame
    .style.format("{:.1f}")          # display with one decimal place
)

Unnamed: 0,% missing
gareac,57.8
gareat,57.8
stype,14.0
salecs,12.4
cik,0.6
tic,0.0
gvkey,0.0
cid,0.0
cnms,0.0
ctype,0.0


## Accounting Data

In [10]:
# Load the accounting data; the first two columns are gvkey and fyear.
# NOTE(review): rows look like one per gvkey/fyear pair -- confirm uniqueness before merging.
acct_raw = pd.read_csv("inputs/acct_data.csv")
acct_raw

Unnamed: 0,gvkey,fyear,acominc,ap,at,capx,capxv,cogs,epsfx,gp,ib,invt,ni,oibdp,rect,sale
0,1004,2018,-40.900,187.800,1517.200,17.400,17.400,1679.500,2.40,372.300,84.100,589.000,7.500,153.500,258.100,2051.800
1,1004,2019,-44.600,191.600,2079.000,23.600,23.600,1728.700,0.71,360.600,24.800,692.700,4.400,150.100,229.100,2089.300
2,1004,2020,-18.300,127.200,1539.700,11.300,11.300,1364.600,1.30,286.800,46.300,591.000,35.800,101.800,238.600,1651.400
3,1004,2021,-19.600,156.400,1573.900,17.300,17.300,1470.300,2.16,346.800,78.500,604.100,78.700,149.300,290.300,1817.100
4,1045,2018,-5274.000,1773.000,60580.000,3745.000,3745.000,31365.000,3.03,13176.000,1412.000,1522.000,1412.000,5606.000,1706.000,44541.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26900,349972,2022,0.096,1.378,28.064,0.000,0.000,0.000,-1.73,0.000,-14.323,0.000,-14.323,-14.909,0.000,0.000
26901,350681,2021,-0.043,9263.248,10595.813,67.367,67.367,265.517,0.09,246.577,85.469,7.341,85.469,180.979,9796.007,512.094
26902,351038,2021,0.000,0.923,9.357,0.000,0.000,0.000,-5.42,0.000,-21.463,0.000,-21.463,-5.959,0.000,0.000
26903,351038,2022,0.000,0.606,14.458,0.000,0.000,0.000,-3.90,0.000,-9.381,0.000,-9.381,-9.153,0.000,0.000


In [11]:
# Project helper from the local `eda` module, called with the frame only (no column list).
# Its printed output begins with the head and tail of the frame; see eda.py for the rest.
insufficient_but_starting_eda(acct_raw)

   gvkey  fyear  acominc      ap       at    capx   capxv     cogs  epsfx  \
0   1004   2018    -40.9   187.8   1517.2    17.4    17.4   1679.5   2.40   
1   1004   2019    -44.6   191.6   2079.0    23.6    23.6   1728.7   0.71   
2   1004   2020    -18.3   127.2   1539.7    11.3    11.3   1364.6   1.30   
3   1004   2021    -19.6   156.4   1573.9    17.3    17.3   1470.3   2.16   
4   1045   2018  -5274.0  1773.0  60580.0  3745.0  3745.0  31365.0   3.03   

        gp      ib    invt      ni   oibdp    rect     sale  
0    372.3    84.1   589.0     7.5   153.5   258.1   2051.8  
1    360.6    24.8   692.7     4.4   150.1   229.1   2089.3  
2    286.8    46.3   591.0    35.8   101.8   238.6   1651.4  
3    346.8    78.5   604.1    78.7   149.3   290.3   1817.1  
4  13176.0  1412.0  1522.0  1412.0  5606.0  1706.0  44541.0   
---
        gvkey  fyear   acominc        ap         at     capx    capxv  \
26900  349972   2022     0.096     1.378     28.064    0.000    0.000   
26901  350681 

In [12]:
## Percent of missing values per column of acct_raw, largest first
ccm = acct_raw
(
    ccm.isna()                       # True wherever a value is missing
    .sum(axis=0)                     # number of missing values in each column
    .div(len(ccm))                   # count -> fraction of all rows
    .mul(100)                        # fraction -> percent
    .sort_values(ascending=False)    # worst columns on top
    .head(20)                        # keep at most the first 20 entries
    .to_frame(name='% missing')      # the result so far is a Series; .style needs a DataFrame
    .style.format("{:.1f}")          # display with one decimal place
)

Unnamed: 0,% missing
capxv,16.3
oibdp,11.4
invt,8.9
capx,8.7
rect,8.5
acominc,8.3
ap,8.3
epsfx,8.1
ib,8.0
ni,8.0
