Further information can be obtained on [Wharton's website](https://wrds-www.wharton.upenn.edu/pages/support/manuals-and-overviews/compustat/capital-iq/transcripts/wrds-overview-capitaliq-transcripts-data/#general-description).

In [15]:
import wrds
import pandas as pd

In [11]:
db: wrds.Connection = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


Run the query
Using our WRDS connection, db, we can run a query with some joins and filters.This query retrieves transcript component text as well as transcript and speaker metadata.

Three tables are used:

- wrds_transcript_detail - transcript metadata
- wrds_transcript_person - speaker metadata
- ciqtranscriptcomponent - full transcript text
The transcript data is filtered to companies with CIQ CompanyId. This example filters to 112350, 21835, 24937, which are the IDs for IBM, Microsoft, and Apple.

In [51]:
select_companies_with_id = '''
        SELECT DISTINCT d.companyid, d.companyname
        FROM ciq.wrds_transcript_detail as d
        WHERE date_part('year', mostimportantdateutc) BETWEEN 2023 AND 2024
'''

companies: pd.DataFrame = db.raw_sql(select_companies_with_id)

companies.head()

Unnamed: 0,companyid,companyname
0,18511.0,3i Group plc
1,18527.0,ABB Ltd
2,18671.0,Albemarle Corporation
3,18711.0,The Allstate Corporation
4,18749.0,"Amazon.com, Inc."


In [52]:
companies.companyid = companies.companyid.astype(int)
companies.dtypes

companyid               int64
companyname    string[python]
dtype: object

In [79]:
def get_companies(name: str) -> pd.DataFrame:
    """Filter function for search in companies dataframe
    I don't want to reach out to Wharton API with every and each search,
    so this function will filter in the pulled dataframe (companies)

    Args:
        name (str): Company name filter

    Returns:
        pd.DataFrame: list of companies with their id-s
    """
    
    return companies[
        companies.companyname.str.contains(
            name, case=False, na=False
        )
    ]

In [74]:
def get_company_id(company: str) -> int | None:
    """Filtering based on company name and returning the one and only company's id

    Args:
        company (str): Company name, expected full match

    Returns:
        int: returned id, None if there's no such company
    """
    filtered: pd.DataFrame = companies[
        companies.companyname.str.fullmatch(
            company
        )
    ]
    return filtered.companyid.item() if filtered.shape[0] == 1 else None
    

In [None]:
def get_company_names(ids: list[str]) -> pd.DataFrame:
    """Filter function for search in companies dataframe based on id
    I don't want to reach out to Wharton API with every and each search,
    so this function will filter in the pulled dataframe (companies)

    Args:
        ids (list[str]): Company id filter, all matching companies are returned

    Returns:
        pd.DataFrame: list of companies with their id-s
    """
    
    return companies[
        companies.companyname.isin(ids)
    ]

In [80]:
company_name = 'goog'

filtered = get_companies(company_name)
filtered

Unnamed: 0,companyid,companyname
8897,312932093,Google LLC


In [130]:
goog: str = 'Google LLC'

google_id = get_company_id(goog)
google_id

312932093

In [131]:
chipotle = 'Chipotle Mexican Grill, Inc.'
chipotle_id = get_company_id(chipotle)

chipotle_id

26446

In [133]:
asml = 'ASML Holding N.V.'
asml_id = get_company_id(asml)

asml_id

388904

In [None]:
def get_id_string(ids: list[str]):
    company_id_string: str = ""
    for c in ids:
        company_id_string += str(c) + ','
    
    return company_id_string[:-1]

In [None]:
company_id_list: list = [google_id, chipotle_id, asml_id]
    
company_id_string = get_id_string(company_id_list)
company_id_string

'312932093,26446,388904'

In [121]:
sql_query = f'''
            SELECT a.*, b.*, c.componenttext
            FROM (
                  SELECT * 
                  FROM ciq.wrds_transcript_detail
                  WHERE companyid IN ({asml_id})
                    AND date_part('year', mostimportantdateutc) BETWEEN 2018 AND 2025
                 ) AS a
            JOIN ciq.wrds_transcript_person AS b
              ON a.transcriptid = b.transcriptid
            JOIN ciq.ciqtranscriptcomponent AS c
              ON b.transcriptcomponentid = c.transcriptcomponentid
            ORDER BY a.transcriptid, b.componentorder;
            '''

df = db.raw_sql(sql_query)

In [122]:
df.head()

Unnamed: 0,companyid,keydevid,transcriptid,headline,mostimportantdateutc,mostimportanttimeutc,keydeveventtypeid,keydeveventtypename,companyname,transcriptcollectiontypeid,...,transcriptcomponenttypename,transcriptpersonid,transcriptpersonname,proid,companyofperson,speakertypeid,speakertypename,componenttextpreview,word_count,componenttext
0,388904.0,549118855.0,1369961.0,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...",2018-01-17,14:00:00,48.0,Earnings Calls,ASML Holding N.V.,7,...,Presentation Operator Message,1.0,Operator,,,1,Operator,"Ladies and gentlemen, thank you for standing b...",57,"Ladies and gentlemen, thank you for standing b..."
1,388904.0,549118855.0,1369961.0,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...",2018-01-17,14:00:00,48.0,Earnings Calls,ASML Holding N.V.,7,...,Presenter Speech,313720.0,Skip Miller,,,2,Executives,"Thank you, operator. Good afternoon, good morn...",211,"Thank you, operator. Good afternoon, good morn..."
2,388904.0,549118855.0,1369961.0,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...",2018-01-17,14:00:00,48.0,Earnings Calls,ASML Holding N.V.,7,...,Presenter Speech,140594.0,P. Wennink,509830.0,,2,Executives,"Thank you, Skip. Good morning you and good aft...",111,"Thank you, Skip. Good morning you and good aft..."
3,388904.0,549118855.0,1369961.0,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...",2018-01-17,14:00:00,48.0,Earnings Calls,ASML Holding N.V.,7,...,Presenter Speech,282615.0,Wolfgang Nickl,251796074.0,,2,Executives,"Thank you, Peter, and welcome, everyone. I wil...",834,"Thank you, Peter, and welcome, everyone. I wil..."
4,388904.0,549118855.0,1369961.0,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...",2018-01-17,14:00:00,48.0,Earnings Calls,ASML Holding N.V.,7,...,Presenter Speech,140594.0,P. Wennink,509830.0,,2,Executives,We also announced a new share buyback program ...,98,We also announced a new share buyback program ...


In [123]:
df = df.drop(['transcriptpersonname'], axis=1)

In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15977 entries, 0 to 15976
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   companyid                       15977 non-null  Float64
 1   keydevid                        15977 non-null  Float64
 2   transcriptid                    15977 non-null  Float64
 3   headline                        15977 non-null  string 
 4   mostimportantdateutc            15977 non-null  string 
 5   mostimportanttimeutc            15977 non-null  string 
 6   keydeveventtypeid               15977 non-null  Float64
 7   keydeveventtypename             15977 non-null  string 
 8   companyname                     15977 non-null  string 
 9   transcriptcollectiontypeid      15977 non-null  Int64  
 10  transcriptcollectiontypename    15977 non-null  string 
 11  transcriptpresentationtypeid    15977 non-null  Int64  
 12  transcriptpresentationtypename  

In [125]:
transcripts: pd.DataFrame = df.groupby(['mostimportantdateutc', 'mostimportanttimeutc', 'headline']).apply(
    lambda group: '\n'.join(
        f'{row['speakertypename']}: {row['componenttext']}' for _, row in group.iterrows()
    )
).reset_index(name='full_text')

  transcripts: pd.DataFrame = df.groupby(['mostimportantdateutc', 'mostimportanttimeutc', 'headline']).apply(


In [126]:
transcripts.head()

Unnamed: 0,mostimportantdateutc,mostimportanttimeutc,headline,full_text
0,2018-01-17,14:00:00,"ASML Holding N.V., Q4 2017 Earnings Call, Jan ...","Operator: Ladies and gentlemen, thank you for ..."
1,2018-04-18,00:00:00,"ASML Holding N.V., Q1 2018 Earnings Call, Apr ...","Executives: Mr. Nickl, the first quarter of 20..."
2,2018-04-18,13:00:00,"ASML Holding N.V., Q1 2018 Earnings Call, Apr ...","Operator: Ladies and gentlemen, thank you for ..."
3,2018-04-25,12:00:00,ASML Holding N.V. - Shareholder/Analyst Call,Executives: Before we open the official meetin...
4,2018-05-16,14:00:00,ASML Holding N.V. Presents at 46th Annual J.P....,"Analysts: Okay, good morning, everyone. My nam..."


In [127]:
transcripts['word_count'] = transcripts['full_text'].apply(
    lambda x: len(str(x).split())
)

In [128]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

transcripts['word_count_nltk'] = transcripts['full_text'].apply(
    lambda x: len(word_tokenize(str(x)))
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
