In [2]:
import pandas as pd
import wrds
import numpy as np
from datetime import datetime
import os

# Connect to WRDS
db = wrds.Connection()

Enter your WRDS username [dyuan868]:pqg2rb
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [4]:
# Setup directory structure
base_dir = "/Users/dyuan868/Desktop/gpas/4993/stat-4993-sp25/Data"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Create main directories
dirs = [
    "1_raw_data", 
    "2_processed_data", 
    "3_financial_ratios", 
    "4_industry_datasets",
    "5_analysis"
]

for dir_name in dirs:
    os.makedirs(f"{base_dir}/{dir_name}", exist_ok=True)

print(f"Created directory structure in {base_dir}")

Created directory structure in /Users/dyuan868/Desktop/gpas/4993/stat-4993-sp25/Data


In [5]:
# 1. Fetch CRSP monthly stock data with ticker and industry information
print("Fetching CRSP monthly stock data...")
crsp_query = """
    SELECT a.permno,
           a.date,
           b.ticker,
           b.comnam AS company_name,
           b.siccd AS sic_code,      -- Standard Industrial Classification code
           a.ret AS monthly_return,
           a.prc AS price,
           ABS(a.prc) AS adj_price,
           a.vol AS volume,
           b.shrcd AS share_code,
           b.exchcd AS exchange_code,
           CASE 
               WHEN b.exchcd = 1 THEN 'NYSE'
               WHEN b.exchcd = 2 THEN 'AMEX'
               WHEN b.exchcd = 3 THEN 'NASDAQ'
               ELSE 'Other'
           END AS exchange_name
    FROM crsp.msf AS a
    LEFT JOIN crsp.msenames AS b
        ON a.permno = b.permno
        AND b.namedt <= a.date 
        AND a.date <= b.nameendt
    WHERE a.date BETWEEN '2010-01-01' AND '2015-12-31'
      AND b.shrcd IN (10, 11)
      AND b.exchcd IN (1, 2, 3)
"""
crsp_data = db.raw_sql(crsp_query)
crsp_data['date'] = pd.to_datetime(crsp_data['date'])
crsp_data['year'] = crsp_data['date'].dt.year
crsp_data['quarter'] = crsp_data['date'].dt.quarter

# Save raw CRSP data
crsp_data.to_csv(f"{base_dir}/1_raw_data/crsp_data_2010_2015.csv", index=False)
print(f"Raw CRSP data saved to {base_dir}/1_raw_data/crsp_data_2010_2015.csv")

Fetching CRSP monthly stock data...
Raw CRSP data saved to /Users/dyuan868/Desktop/gpas/4993/stat-4993-sp25/Data/1_raw_data/crsp_data_2010_2015.csv


In [6]:
# 2. Add industry classification using SIC codes
def get_industry_classification(sic_code):
    sic_str = str(sic_code).zfill(4)
    
    # Define industry classifications based on SIC codes
    if sic_str.startswith(('01', '02', '07', '08', '09')):
        return 'Agriculture'
    elif sic_str.startswith(('10', '12', '13', '14')):
        return 'Mining'
    elif sic_str.startswith(('15', '16', '17')):
        return 'Construction'
    elif sic_str.startswith(('20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39')):
        return 'Manufacturing'
    elif sic_str.startswith(('40', '41', '42', '43', '44', '45', '46', '47', '48', '49')):
        return 'Transportation & Utilities'
    elif sic_str.startswith(('50', '51')):
        return 'Wholesale Trade'
    elif sic_str.startswith(('52', '53', '54', '55', '56', '57', '58', '59')):
        return 'Retail Trade'
    elif sic_str.startswith(('60', '61', '62', '63', '64', '65', '66', '67')):
        return 'Finance'
    elif sic_str.startswith(('70', '71', '72', '73', '74', '75', '76', '77', '78', '79')):
        return 'Services'
    elif sic_str.startswith(('80', '81', '82', '83', '84', '85', '86', '87', '88', '89')):
        return 'Health & Education'
    elif sic_str.startswith(('91', '92', '93', '94', '95', '96', '97', '98', '99')):
        return 'Public Administration'
    else:
        return 'Other'

# Add industry classification
crsp_data['industry'] = crsp_data['sic_code'].apply(get_industry_classification)
print(f"CRSP data loaded: {crsp_data.shape}")

# Save CRSP data with industry classification
crsp_data.to_csv(f"{base_dir}/2_processed_data/crsp_with_industry.csv", index=False)
print(f"CRSP data with industry classification saved to {base_dir}/2_processed_data/crsp_with_industry.csv")

CRSP data loaded: (272845, 15)
CRSP data with industry classification saved to /Users/dyuan868/Desktop/gpas/4993/stat-4993-sp25/Data/2_processed_data/crsp_with_industry.csv


In [9]:
# 3. Get Compustat quarterly financial data with expanded ratios
print("Fetching Compustat quarterly financial data...")
comp_query = """
    SELECT gvkey,
           datadate,
           fyearq AS fiscal_year,
           fqtr AS fiscal_quarter,
           rdq AS report_date,
           
           -- Income Statement Items
           revtq AS revenue,
           cogsq AS cost_of_goods_sold,
           xsgaq AS sga_expense,
           oiadpq AS operating_income,
           oibdpq AS operating_income_before_depreciation,
           dpq AS depreciation,
           oancfy AS operating_cash_flow,
           txtq AS income_tax,
           niq AS net_income,
           piq AS pretax_income,
           ibq AS income_before_extraordinary_items,
           xintq AS interest_expense,
           
           -- Balance Sheet Items
           atq AS total_assets,
           ltq AS total_liabilities,
           ceqq AS common_equity,
           seqq AS stockholders_equity,
           cheq AS cash_and_equivalents,
           rectq AS accounts_receivable,
           invtq AS inventories, 
           ppentq AS ppe_net,
           actq AS current_assets,
           lctq AS current_liabilities,
           dlttq AS long_term_debt,
           dlcq AS debt_in_current_liabilities,
           apq AS accounts_payable,
           pstkq AS preferred_stock,
           cshoq AS common_shares_outstanding,
           
           -- Other Items
           capxy AS capital_expenditure,
           tic AS ticker_compustat,
           gindq AS gics_industry
           
    FROM comp.fundq
    WHERE datadate BETWEEN '2010-01-01' AND '2015-12-31'
      AND indfmt='INDL'
      AND datafmt='STD'
      AND popsrc='D'
      AND consol='C'
"""
comp_data = db.raw_sql(comp_query)
comp_data['datadate'] = pd.to_datetime(comp_data['datadate'])
comp_data['year'] = comp_data['datadate'].dt.year
comp_data['quarter'] = comp_data['datadate'].dt.quarter

# Save raw Compustat data
comp_data.to_csv(f"{base_dir}/1_raw_data/compustat_data_2010_2015.csv", index=False)
print(f"Raw Compustat data saved to {base_dir}/1_raw_data/compustat_data_2010_2015.csv")
print(f"Compustat data loaded: {comp_data.shape}")

Fetching Compustat quarterly financial data...


ProgrammingError: (psycopg2.errors.UndefinedColumn) column "gindq" does not exist
LINE 42:            gindq AS gics_industry
                    ^

[SQL: 
    SELECT gvkey,
           datadate,
           fyearq AS fiscal_year,
           fqtr AS fiscal_quarter,
           rdq AS report_date,
           
           -- Income Statement Items
           revtq AS revenue,
           cogsq AS cost_of_goods_sold,
           xsgaq AS sga_expense,
           oiadpq AS operating_income,
           oibdpq AS operating_income_before_depreciation,
           dpq AS depreciation,
           oancfy AS operating_cash_flow,
           txtq AS income_tax,
           niq AS net_income,
           piq AS pretax_income,
           ibq AS income_before_extraordinary_items,
           xintq AS interest_expense,
           
           -- Balance Sheet Items
           atq AS total_assets,
           ltq AS total_liabilities,
           ceqq AS common_equity,
           seqq AS stockholders_equity,
           cheq AS cash_and_equivalents,
           rectq AS accounts_receivable,
           invtq AS inventories, 
           ppentq AS ppe_net,
           actq AS current_assets,
           lctq AS current_liabilities,
           dlttq AS long_term_debt,
           dlcq AS debt_in_current_liabilities,
           apq AS accounts_payable,
           pstkq AS preferred_stock,
           cshoq AS common_shares_outstanding,
           
           -- Other Items
           capxy AS capital_expenditure,
           tic AS ticker_compustat,
           gindq AS gics_industry
           
    FROM comp.fundq
    WHERE datadate BETWEEN '2010-01-01' AND '2015-12-31'
      AND indfmt='INDL'
      AND datafmt='STD'
      AND popsrc='D'
      AND consol='C'
]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# Close WRDS connection
db.close()