 ## Data Analysis

### Here is a breakdown of what each column represents:

- gvkey: a unique identifier for the company (Global Company Key)
- cid: a unique identifier for the customer
- cnms: customer name
- ctype: customer type
- gareac: geographic area code
- gareat: geographic area type
- salecs: sales in current period (in millions)
- sid: segment identifier
- stype: segment type
- srcdate: source date
- conm: company name
- tic: stock ticker symbol
- cusip: CUSIP number, a unique identifier for a security
- cik: SEC Central Index Key, a unique identifier for a company
- sic: Standard Industrial Classification code, a numerical code used to classify industries



In [1]:
## imports
import pandas as pd
import numpy as np
import os
#from eda import insufficient_but_starting_eda
import seaborn as sns

## Getting the SP500 data

In [2]:
## downloading the SP500 info from the web

os.makedirs("inputs", exist_ok=True)
sp500_file = 'inputs/sp500_2022.csv'

if not os.path.exists(sp500_file):
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    pd.read_html(url)[0].to_csv(sp500_file,index=False)

In [3]:
# comp = pd.read_csv('inputs/cust_supply_2019_2022.csv')
# comp

In [4]:
sp500 = pd.read_csv('inputs/sp500_2022.csv')
sp500

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


# Merging the data

In [5]:
# Select rows where 'cnms' is not equal to 'U.S. Government'

comp2 = comp
#comp2 = comp[comp['ctype'] == 'COMPANY']
comp2 = comp2[comp2['cnms'] != 'Not Reported']
# comp4 = comp2[comp2['cnms'] == 'Not Reported']
# comp4

comp3 = comp2.dropna(subset=['salecs'])
comp3

NameError: name 'comp' is not defined

In [None]:
comp3 = comp3.rename(columns = {'cik': 'CIK'})
merged = comp3.merge(sp500, on='CIK', how = 'inner')
merged

In [None]:
#looking at the dates - see there is a filing in every month...
merged['date'] = pd.to_datetime(merged['srcdate'])
dates = merged.sort_values(by='srcdate')
print(dates['srcdate'])

In [None]:
start_date = '2020-01-01'
end_date = '2021-12-31'
filtered_df = merged.query('@start_date <= date <= @end_date')

# get the indices of the filtered dates
filtered_indices = filtered_df.index

# drop the filtered dates from the original dataframe
filtered_out_df = merged.drop(filtered_indices)

filtered_out_df

In [None]:
listkeys = pd.DataFrame(filtered_out_df['gvkey'].unique())
listkeys.to_csv('inputs/listkeys.csv', index=False)

In [None]:
g = merged.groupby("CIK")['cnms'].apply(lambda x: list(np.unique(x)))
len(g)

### EDA

In [None]:
insufficient_but_starting_eda(comp, ['cnms', 'ctype', 'gareac', 'gareat', 
                            'stype', 'srcdate', 'conm', 'tic', 'cusip'])


In [None]:
comp.describe() # describes integers

In [None]:
##missing values
ccm = comp
(
    ( # these lines do the calculation - what % of missing values are there for each var
        ccm.isna()      # ccm.isna() TURNS every obs/variable = 1 when its missing and 0 else
       .sum(axis=0)     # count the number of na for each variable (now data is 1 obs per column = # missing)
        /len(ccm)       # convert # missing to % missing 
        *100            # report as percentage
    ) 
    # you can stop here and report this...
    # but I wanted to format it a bit...
    .sort_values(ascending=False)[:13]
    .to_frame(name='% missing') # the next line only works on a frame, and because pandas sees only 1 variable at this pt
    .style.format("{:.1f}")     # in the code, it calls this a "series" type object, so convert it to dataframe type object
)
#