In [None]:
import pandas as pd

# Base directory for every input CSV read in this notebook. Paths are built by
# plain string concatenation (data_root + '...'), so the trailing slash is required.
data_root = '../../MyData/'

### Preparing 10-k file descriptions

In [2]:
# item1 holds the 10-K descriptions; only the rows for Year 2021 are kept and
# the two 'Unnamed' columns are discarded.
file_path = data_root + 'raw_data/df_stage4_5years.csv'
item1 = (
    pd.read_csv(file_path)
    .loc[lambda frame: frame['Year'] == 2021]
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)
print(len(item1))
item1.columns

1225


Index(['CIK', 'Year', 'Content', 'Word_Count', 'Token_Count', 'Summary',
       'Error', 'ErrorMessage', 'Summary_Token_Count', 'Summary_Word_Count',
       'Embeddings'],
      dtype='object')

In [3]:
# Some CIKs occur on more than one 2021 row; show every such row, grouped by CIK.
duplicates = item1.loc[item1.duplicated(subset='CIK', keep=False)]
print(duplicates.sort_values(by="CIK"))

# Keep only the first row of each CIK, then print what is still duplicated
# (expected: an empty frame) and the remaining row count.
# NOTE(review): in the printed duplicates, CIK 943819 carries three different
# company descriptions, so "first" selects one of them arbitrarily - confirm
# that this is the intended firm.
item1 = item1.loc[~item1.duplicated(subset='CIK', keep='first')]
print(item1.loc[item1.duplicated(subset='CIK', keep=False)])
print(len(item1))

          CIK  Year                                            Content  \
1235   861459  2021  Item 1. BUSINESS Introduction Granite Construc...   
3845   861459  2021  Item 1. BUSINESS Introduction Granite Construc...   
831    886128  2021  Item 1.BUSINESSIndex to Item 1. BUSINESS PageF...   
1021   886128  2021  Item 1.     BUSINESS   Index to Item 1. BUSINE...   
2912   898437  2021  Item1. Business 5 PartII Item7. Managements Di...   
6386   898437  2021  ITEM 1. BUSINESS Overview Founded in 1992, Ani...   
1550   943819  2021  ITEM 1. BUSINESS.OverviewUnited Airlines Holdi...   
2935   943819  2021  Item 1. Business. Overview MAA, an S&P 500 com...   
6040   943819  2021  ITEM 1 BUSINESS General We are a global leader...   
4407  1590715  2021  Item 1. Business. Overview When we formed our ...   
5177  1590715  2021  Item 1. Business. Overview When we formed our ...   

      Word_Count  Token_Count  \
1235       25469        29993   
3845       26004        30666   
831         

In [4]:
# Prefix the 10-K specific columns with 'item1' and lower-case the firm key so
# it matches the 'cik' column used by the other data sets.
item1_column_names = {
    'CIK': 'cik',
    'Content': 'item1',
    'Word_Count': 'item1_word_count',
    'Token_Count': 'item1_token_count',
    'Summary': 'item1_summary',
    'Summary_Token_Count': 'item1_summary_token_count',
    'Summary_Word_Count': 'item1_summary_word_count',
    'Embeddings': 'item1_embeddings',
}
item1 = item1.rename(columns=item1_column_names)
item1.columns

Index(['cik', 'Year', 'item1', 'item1_word_count', 'item1_token_count',
       'item1_summary', 'Error', 'ErrorMessage', 'item1_summary_token_count',
       'item1_summary_word_count', 'item1_embeddings'],
      dtype='object')

### Preparing Compustat data

In [6]:
# Extract Compustat data for fiscal year 2021 (one-off step, kept commented out for reference)

# file_path = '../item1_and_financial_variable/compustat_entire_2005_2022.csv'
# df = pd.read_csv(file_path)
# print(len(df))

# df = df[df['fyear'] == 2021]
# file_path = '../item1_and_financial_variable/multi-view/compustat_2021.csv'
# df.to_csv(file_path, index=False)
# print(len(df))

In [5]:
# Load the 2021 Compustat extract.
# Only the firm key, fiscal year and ticker are used below, so read just those
# three columns instead of parsing the whole file.
# NOTE(review): the saved output of this cell echoes the pd.read_csv line as a
# warning, presumably a mixed-dtype warning from the other columns - confirm
# that it disappears with usecols.
file_path = data_root + 'compustat_2021.csv'
compustat2021 = pd.read_csv(file_path, usecols=['cik', 'fyear', 'tic'])
compustat2021 = compustat2021.rename(columns={'fyear': 'Year'})
print(len(compustat2021))

# Remove rows with NaN in 'cik', 'Year' or 'tic'; all three are needed as merge keys.
compustat2021 = compustat2021.dropna(subset=['cik', 'Year', 'tic'])
print(len(compustat2021))

# Fix the column order; 'tic' is the only Compustat attribute carried forward.
compustat2021 = compustat2021[['cik', 'Year', 'tic']]

12656
8564


  compustat2021 = pd.read_csv(file_path)


### Keeping the 1197 firms found in both the 10-K and Compustat data

In [6]:
# Cast both merge keys on the Compustat side to int64, then print the key
# dtypes of the two frames side by side to confirm that they match.
compustat2021 = compustat2021.astype({'cik': 'int64', 'Year': 'int64'})

for key in ('cik', 'Year'):
    for frame in (item1, compustat2021):
        print(frame[key].dtype)

int64
int64
int64
int64


In [7]:
# Inner-join the 10-K rows with Compustat on firm and year.
set1197 = item1.merge(compustat2021, how='inner', on=['cik', 'Year'])
print(len(set1197))

# Compustat contributes several rows for some (cik, Year) pairs; keep the first
# row of each pair and drop the remaining duplicates.
set1197 = set1197.drop_duplicates(subset=["cik", "Year"], keep='first')
print(len(set1197))

1381
1197


### Preparing sp, yh, sa descriptions

In [8]:
# Load the table that holds the sp, yh and sa descriptions, report its row
# count and preview the first rows.
file_path = data_root + 'checked_data/final_df_cleaned_retained.csv'
sp_yh_sa = pd.read_csv(file_path)
print(sp_yh_sa.shape[0])
sp_yh_sa.head(5)

1401


Unnamed: 0.1,Unnamed: 0,cik,conm,gvkey,tic,SIC_Division,SIC_MajorGroup,NAICS_Sector,NAICS_SubSector,hoberg_fic25,...,SP_TOPICTAG,SP_CATEGORIES,SP_CUSIP,SP_TICKER,SP_EXCHANGE_TICKER,SP_COMPANY_NAME,_merge,word_count,YH_BUS_DESC,SA_BUS_DESC
0,0,702325,FIRST MIDWEST BANCORP INC,11896,FMBI,H,60,52,522.0,,...,Anti-Fraud; EDI (Electronic Data Interchange);...,,320867104,FMBI,NASDAQGM:FMBI,"First Midwest Bancorp, Inc.",both,,,
1,1,16040,CABOT CORP,2593,CBT,D,28,32,325.0,11.0,...,CleanTech; Pollution Control,,127055101,CBT,NYSE:CBT,Cabot Corporation,both,,Cabot Corporation operates as a specialty chem...,Cabot Corporation operates as a specialty chem...
2,2,1156039,ELEVANCE HEALTH INC,145046,ELV,H,63,52,524.0,14.0,...,Outpatient Care; Employee Benefits; Chronic Di...,,036752103,ELV,NYSE:ELV,"Elevance Health, Inc.",both,,"Elevance Health, Inc., together with its subsi...","Elevance Health, Inc., together with its subsi..."
3,3,1488813,CUSTOMERS BANCORP INC,170396,CUBI,H,60,52,522.0,4.0,...,Consumer Lending; Fintech; Blockchain; Digital...,,23204G100,CUBI,NYSE:CUBI,"Customers Bancorp, Inc.",both,,"Customers Bancorp, Inc. operates as the bank h...","Customers Bancorp, Inc. operates as the bank h..."
4,4,785161,ENCOMPASS HEALTH CORP,12589,EHC,I,80,62,622.0,14.0,...,Health Diagnostics; Neurology,"Health Care, Medical, Rehabilitation",29261A100,EHC,NYSE:EHC,Encompass Health Corporation,both,,Encompass Health Corporation provides post-acu...,Encompass Health Corporation provides post-acu...


### Merging 10-k with sp, yh, sa

In [9]:
# Row counts of the two frames about to be merged (10-K/Compustat set first).
for frame in (set1197, sp_yh_sa):
    print(len(frame))

1197
1401


In [10]:
# Join on firm key and ticker twice: the inner join counts the firms that have a
# record in sp_yh_sa (final_df_cleaned), the left join keeps every firm.
# Result: 8 firms have no record in sp_yh_sa, and no firm is duplicated.
merge_keys = ['cik', 'tic']
inner_merged_1_df = set1197.merge(sp_yh_sa, how="inner", on=merge_keys)
print(len(inner_merged_1_df))
left_merged_1_df = set1197.merge(sp_yh_sa, how="left", on=merge_keys)
print(len(left_merged_1_df))

# Compare the distinct tickers of both joins; the set difference lists the
# tickers that found no match.
inner_set = set(inner_merged_1_df['tic'])
left_set = set(left_merged_1_df['tic'])
print(len(inner_set))
print(len(left_set))
print(left_set - inner_set)

1189
1197
1189
1197
{'VOLT', 'FLOW', 'FUN', 'CAMP', 'AMRS', 'NLS', 'BBBY', 'TEN'}


In [11]:
# In total 15 rows have no description at all from sp, yh or sa - more than the
# 8 unmatched firms, because some matched records are empty in every
# description column.
description_cols = [
    'SP_BUSINESS_DESCRIPTION',
    'SP_LONG_BUSINESS_DESCRIPTION',
    'YH_BUS_DESC',
    'SA_BUS_DESC',
]
no_description = left_merged_1_df[description_cols].isna().all(axis=1)
print(left_merged_1_df.loc[no_description, 'tic'])

77       FLT
98      SMLP
327     NYCB
452       RE
670      NLS
732      FUN
738      AUD
748      GLT
827     BBBY
925     FLOW
1008     TEN
1014    AMRS
1023    CAMP
1154    VOLT
1194    PACW
Name: tic, dtype: object


In [12]:
# left_merged_1_df contains:
#     basic: cik, tic
#     introductions: item1 (original content, summary content, embedding),
#                    SP (short, long), YH, SA
#     classification code: SIC, NAICS, GICS
left_merged_1_df.columns

Index(['cik', 'Year', 'item1', 'item1_word_count', 'item1_token_count',
       'item1_summary', 'Error', 'ErrorMessage', 'item1_summary_token_count',
       'item1_summary_word_count', 'item1_embeddings', 'tic', 'Unnamed: 0',
       'conm', 'gvkey', 'SIC_Division', 'SIC_MajorGroup', 'NAICS_Sector',
       'NAICS_SubSector', 'hoberg_fic25', 'hoberg_fic100', 'GICS_Sector',
       'GICS_IndustryGroup', 'GICS_Industry', 'cusip', 'SP_ENTITY_NAME',
       'SP_ENTITY_ID', 'SP_BUSINESS_DESCRIPTION',
       'SP_LONG_BUSINESS_DESCRIPTION', 'SP_TOPICTAG', 'SP_CATEGORIES',
       'SP_CUSIP', 'SP_TICKER', 'SP_EXCHANGE_TICKER', 'SP_COMPANY_NAME',
       '_merge', 'word_count', 'YH_BUS_DESC', 'SA_BUS_DESC'],
      dtype='object')

### Preparing Orbis data

In [13]:
# Load the Orbis table and rename its ticker column to 'tic', the key name used
# by the other frames.
file_path = data_root + 'checked_data/orbis_filtered.csv'
orbis = pd.read_csv(file_path).rename(columns={'ticker': 'tic'})

# Orbis covers only 1163 firms.
print(len(orbis))
print(orbis.columns)

1163
Index(['Unnamed: 0', 'company_name', 'tic', 'trade_description',
       'products_services', 'primary_business', 'main_activity',
       'main_products', 'description_history', 'full_overview'],
      dtype='object')


### Merging Orbis with previous left_merged_1_df

In [15]:
# Attach two Orbis text fields by ticker. The inner join counts the firms that
# Orbis covers, the left join keeps every firm.
# Result: 193 firms have no record in Orbis, and no firm is duplicated.
orbis_subset = orbis[['tic', 'products_services', 'full_overview']]
inner_merged_2_df = left_merged_1_df.merge(orbis_subset, how="inner", on=['tic'])
print(len(inner_merged_2_df))
left_merged_2_df = left_merged_1_df.merge(orbis_subset, how="left", on=['tic'])
print(len(left_merged_2_df))

# Distinct tickers per join; the difference is the set of firms missing in Orbis.
inner_set = set(inner_merged_2_df['tic'])
left_set = set(left_merged_2_df['tic'])
print(len(inner_set))
print(len(left_set))
print(f"number of firms that don't have record in orbis: {len(left_set - inner_set)}")

1004
1197
1004
1197
number of firms that don't have record in orbis: 193


In [17]:
# Even with the two Orbis fields included, 15 rows still have no description
# from sp, yh, sa or orbis.
description_cols = [
    'SP_BUSINESS_DESCRIPTION',
    'SP_LONG_BUSINESS_DESCRIPTION',
    'YH_BUS_DESC',
    'SA_BUS_DESC',
    'products_services',
    'full_overview',
]
no_description = left_merged_2_df[description_cols].isna().all(axis=1)
print(left_merged_2_df.loc[no_description, 'tic'])

77       FLT
98      SMLP
327     NYCB
452       RE
670      NLS
732      FUN
738      AUD
748      GLT
827     BBBY
925     FLOW
1008     TEN
1014    AMRS
1023    CAMP
1154    VOLT
1194    PACW
Name: tic, dtype: object


In [19]:
# 121 firms' "trade_description" values are short texts such as
# "Bank Holding Company" or "INSURANCE", so full_overview may be the better
# Orbis field to use.
def check_orbis_length(row, column='trade_description', max_len=100):
    """Flag a row whose text in ``column`` is shorter than ``max_len`` characters.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. the row that
        ``DataFrame.apply(..., axis=1)`` passes in.
    column : str, default 'trade_description'
        Name of the text field to inspect.
    max_len : int, default 100
        Exclusive upper bound; texts with fewer characters are flagged.

    Returns
    -------
    int
        1 if the value is a string shorter than ``max_len``, otherwise 0.
        Non-string values (for example NaN for a missing text) yield 0.

    Raises
    ------
    KeyError
        If ``row`` has no entry named ``column``.
    """
    value = row[column]
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    return int(isinstance(value, str) and len(value) < max_len)

# left_merged_2_df only received 'products_services' and 'full_overview' from
# Orbis, so it has no 'trade_description' column and applying the check to it
# directly raises KeyError on a fresh kernel. Attach 'trade_description' by
# ticker for this check only, keeping one row per firm of left_merged_2_df.
trade_desc_check = pd.merge(
    left_merged_2_df[['tic']],
    orbis[['tic', 'trade_description']],
    how='left',
    on='tic',
)
trade_desc_check.apply(check_orbis_length, axis=1).sum()

121

In [19]:
# Give the Orbis and SP description columns short, source-prefixed names.
description_column_names = {
    'SP_BUSINESS_DESCRIPTION': 'SP_SHORT_DESC',
    'SP_LONG_BUSINESS_DESCRIPTION': 'SP_LONG_DESC',
    'products_services': 'ORBIS_PROD_SERV',
    'full_overview': 'ORBIS_OVERVIEW',
}
left_merged_2_df = left_merged_2_df.rename(columns=description_column_names)

In [20]:
# Display the column index of left_merged_2_df as the cell output.
left_merged_2_df.columns

Index(['cik', 'Year', 'item1', 'item1_word_count', 'item1_token_count',
       'item1_summary', 'Error', 'ErrorMessage', 'item1_summary_token_count',
       'item1_summary_word_count', 'item1_embeddings', 'tic', 'Unnamed: 0',
       'conm', 'gvkey', 'SIC_Division', 'SIC_MajorGroup', 'NAICS_Sector',
       'NAICS_SubSector', 'hoberg_fic25', 'hoberg_fic100', 'GICS_Sector',
       'GICS_IndustryGroup', 'GICS_Industry', 'cusip', 'SP_ENTITY_NAME',
       'SP_ENTITY_ID', 'SP_SHORT_DESC', 'SP_LONG_DESC', 'SP_TOPICTAG',
       'SP_CATEGORIES', 'SP_CUSIP', 'SP_TICKER', 'SP_EXCHANGE_TICKER',
       'SP_COMPANY_NAME', '_merge', 'word_count', 'YH_BUS_DESC', 'SA_BUS_DESC',
       'ORBIS_PROD_SERV', 'ORBIS_OVERVIEW'],
      dtype='object')

In [21]:
# left_merged_2_df.to_csv("data/merged_1197.csv", index=False)