Connecting to WRDS

In [47]:

import os
import wrds
import pandas as pd
import sqlite3

# Read the WRDS login from the environment so credentials are never hard-coded.
# Either value is None when the corresponding variable is not set.
wrds_username = os.getenv("WRDS_USERNAME")
wrds_password = os.getenv("WRDS_PASSWORD")

# Open the WRDS session that the query cells below run against.
db = wrds.Connection(
    wrds_username=wrds_username,
    wrds_password=wrds_password,
)


Loading library list...
Done


# Sample query

In [16]:

# Retrieve 2015 - 2017 Earnings Conference Call Transcripts with Full-text components
# for three companies (companyid 112350, 21835, 24937).
# The trailing backslashes inside the triple-quoted string are line continuations:
# the newlines are dropped, so the query is sent (and printed) as one long line.
sql_query = '''SELECT a.*, b.*, c.componenttext \
                 FROM (SELECT * FROM ciq.wrds_transcript_detail \
                                WHERE companyid in (112350, 21835, 24937) AND \
                                      date_part('year',mostimportantdateutc)>=2015 AND \
                                      date_part('year',mostimportantdateutc)<=2017) AS a, \
                      ciq.wrds_transcript_person as b, ciq.ciqtranscriptcomponent AS c \
                 WHERE a.transcriptid=b.transcriptid AND b.transcriptcomponentid=c.transcriptcomponentid \
                 ORDER by a.transcriptid, b.componentorder;'''
 
# Echo the query for reference, then run it on the WRDS connection; the result is kept in `data`.
print(sql_query) 
data = db.raw_sql(sql_query)

SELECT a.*, b.*, c.componenttext                  FROM (SELECT * FROM ciq.wrds_transcript_detail                                 WHERE companyid in (112350, 21835, 24937) AND                                       date_part('year',mostimportantdateutc)>=2015 AND                                       date_part('year',mostimportantdateutc)<=2017) AS a,                       ciq.wrds_transcript_person as b, ciq.ciqtranscriptcomponent AS c                  WHERE a.transcriptid=b.transcriptid AND b.transcriptcomponentid=c.transcriptcomponentid                  ORDER by a.transcriptid, b.componentorder;


In [17]:
# Show the column names of the sample query result.
data.columns

Index(['companyid', 'keydevid', 'transcriptid', 'headline',
       'mostimportantdateutc', 'mostimportanttimeutc', 'keydeveventtypeid',
       'keydeveventtypename', 'companyname', 'transcriptcollectiontypeid',
       'transcriptcollectiontypename', 'transcriptpresentationtypeid',
       'transcriptpresentationtypename', 'transcriptcreationdate_utc',
       'transcriptcreationtime_utc', 'audiolengthsec', 'isdelayed_flag',
       'delayreasontypeid', 'delayreasontypename', 'transcriptcomponentid',
       'componentorder', 'transcriptcomponenttypeid',
       'transcriptcomponenttypename', 'transcriptpersonid',
       'transcriptpersonname', 'proid', 'companyofperson', 'speakertypeid',
       'speakertypename', 'componenttextpreview', 'word_count',
       'componenttext'],
      dtype='object')

# Full query: year by year from 2000 to 2023

In [None]:

# Use SQLite to store the data.
# sqlite3.connect() creates the database file if it doesn't exist, but it does
# NOT create a missing parent directory (it raises "unable to open database
# file"), so make sure the directory is there first.
transcripts_db_path = '../data/transcripts_raw_v2/transcripts_data.db'
os.makedirs(os.path.dirname(transcripts_db_path), exist_ok=True)
conn = sqlite3.connect(transcripts_db_path)


In [38]:


# Download one year of transcripts at a time and store each year in its own
# SQLite table (transcripts_<year>), replacing the table if it already exists.
# NOTE: the range below only covers 2023; widen it (e.g. range(2000, 2024)) to
# run the full year-by-year download described in the section heading.
# The connection is closed in `finally` so a failed WRDS query or write does not
# leave the database open; re-run the connection cell before re-running this one.
try:
    for y in range(2023, 2024):
        print( f'Processing year {y}')
        sql_query = f'''SELECT a.*, b.*, c.componenttext \
                    FROM (SELECT * FROM ciq.wrds_transcript_detail \
                                    WHERE date_part('year',mostimportantdateutc)= {y}) AS a \
                        JOIN ciq.wrds_transcript_person as b ON a.transcriptid=b.transcriptid \
                        JOIN ciq.ciqtranscriptcomponent AS c ON b.transcriptcomponentid=c.transcriptcomponentid \
                    ORDER by a.transcriptid, b.componentorder;'''

        # Pull the whole year into a DataFrame first, then write it to SQLite
        data = db.raw_sql(sql_query)
        data.to_sql(f'transcripts_{y}', conn, if_exists='replace', index=False)  # Name the table based on the year

    # Commit changes
    conn.commit()
finally:
    # Always release the database, even if a download or write failed
    conn.close()


Processing year 2023


## Testing the tables to make sure they work

In [None]:
# Re-open the transcripts database (sqlite3 creates the file if it is missing)
# and print the name of every table it contains.
conn = sqlite3.connect('../data/transcripts_raw_v2/transcripts_data.db')

# Connection.execute() returns a cursor; keep it around for the cells below.
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('transcripts_2004',), ('transcripts_2005',), ('transcripts_2006',), ('transcripts_2007',), ('transcripts_2008',), ('transcripts_2009',), ('transcripts_2010',), ('transcripts_2011',), ('transcripts_2012',), ('transcripts_2013',), ('transcripts_2014',), ('transcripts_2015',), ('transcripts_2016',), ('transcripts_2017',), ('transcripts_2018',), ('transcripts_2019',), ('transcripts_2020',), ('transcripts_2021',), ('transcripts_2022',), ('transcripts_2023',)]


In [40]:
# Print "<table name> <row count>" for every table in the database.
# The names are collected up front so the cursor can be reused for the counts.
table_names = [
    row[0]
    for row in cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
]
for table_name in table_names:
    row_count = cursor.execute(f"SELECT count(*) FROM {table_name};").fetchone()[0]
    print(table_name, row_count)

transcripts_2004 382
transcripts_2005 2169
transcripts_2006 111982
transcripts_2007 286836
transcripts_2008 962948
transcripts_2009 1539300
transcripts_2010 2238747
transcripts_2011 4438049
transcripts_2012 5410158


KeyboardInterrupt: 

# Downloading identifier tables

In [45]:
# Names of the identifier tables to download, one per entry.
id_tables = [
    'wrds_cik',
    'wrds_cusip',
    'wrds_gvkey',
    'wrds_ciqsymbol',
]

In [None]:
# Create (or open) a separate database named 'id_data.db' and copy each
# identifier table from the `ciq` schema into it as its own table, keeping the
# original table name. Existing tables of the same name are replaced.
conn = sqlite3.connect('../data/transcripts_raw_v2/id_data.db')

# The connection is closed in `finally` so a failed download or write does not
# leave the database open.
try:
    for table in id_tables:
        print(f'Downloading table {table}')
        sql_query = f'''SELECT * FROM ciq.{table};'''
        # Pull the whole table into a DataFrame first, then write it to SQLite
        data = db.raw_sql(sql_query)
        data.to_sql(table, conn, if_exists='replace', index=False)  # Same name as the source table

    # Commit changes
    conn.commit()
finally:
    # Always release the database, even if a download or write failed
    conn.close()


Downloading table wrds_cik
Downloading table wrds_cusip
Downloading table wrds_gvkey
Downloading table wrds_ciqsymbol
