In [1]:
# ---------------------------------------------------------------
# User configuration: schemas, tables, factor lists and S3 paths.
# ---------------------------------------------------------------

# Table that supplies the as-of dates (your signal, before ESG is attached)
as_of_date_database = "YOUR_SCHEMA"
as_of_date_ownership_table = "TABLE_WITH_YOUR_SIGNAL_BEFORE_ESG"

# Holdings table; used to find the first/last as-of date of every orgpermid
sh_database = "YOUR_SCHEMA"
sh_table = "smart_holdings_v1_mapped"

# Key/value ESG table (one itemcode / esg_value pair per row)
esg_database = "YOUR_SCHEMA"
esg_table = "esg_factors_for_as_of_dates_keyvalue"

# To save space, list only the ESG factors of interest
analytics_factors = [
    'TotalCO2EquivalentsEmissionsToRevenues',
    'VocEmissionsToRevenues',
    'TotalWasteToRevenues',
    'TotalHazardousWasteToRevenues',
    'WaterPollutantEmissionsToRevenues',
    'InjuriesToMillionHours',
    'TotalRenewableEnergyToEnergyUse',
]
value_factors = [
    'VoluntaryTurnoverOfEmployees',
    'WomenManagers',
    'AverageTrainingHours',
    'CustomerSatisfaction',
]
score_factors = [
    'ESGManagementScore',
    'ESGShareholdersScore',
]
# Order matters: it becomes the column order of the output table
all_esg_factors = [*analytics_factors, *value_factors, *score_factors]

# Destination table created by this notebook and the S3 prefix holding its files
esg_as_of_dates_database = "YOUR_SCHEMA"
esg_as_of_dates_table = "esg_factors_for_as_of_dates"
esg_as_of_dates_table_s3_dir = "s3://YOUR_BUCKET/data/repo/esgsh_esg_pit/"
# The directory above must end with "/" because the table name is appended as-is
esg_as_of_dates_table_s3_fullpath = f"{esg_as_of_dates_table_s3_dir}{esg_as_of_dates_table}"

# S3 staging directory handed to pyathena.connect
pyathena_staging = "s3://YOUR_BUCKET/data/repo/esgsh_esg_pit/pyathena/"

# ===================================
# DON'T CHANGE ANYTHING BELOW HERE
# ===================================

In [2]:
!pip install pyathena 



In [3]:
import pyathena 

In [4]:
# Build the comma-separated list of ESG columns for the final SELECT, each one
# qualified with the `e.` alias of the transposed ESG relation.
# str.join only puts the separator between items, so no trailing comma has to
# be trimmed and an empty factor list simply yields an empty string.
all_factors_str = ', '.join(f'e.{factor_col}' for factor_col in all_esg_factors)

In [5]:
# Generate the factor transpose: one "<map lookup> AS <factor>" projection per
# ESG factor, reading each value out of the `kv` map that the query builds
# with map_agg(itemcode, esg_value).
#
# element_at(kv, key) returns NULL when a factor is missing from a group's
# map; the kv['key'] subscript form instead aborts the whole query with a
# "key not present in map" error as soon as one group lacks one factor.
#
# Joining with ",\n" places commas only between columns, so the generated SQL
# never ends in a trailing comma.
factor_transpose = ",\n        ".join(
    f"element_at(kv, '{factor}') AS {factor}" for factor in all_esg_factors
)

In [6]:
# CTAS statement that writes the destination table as Snappy-compressed Parquet
# at the configured S3 location. The query:
#   1. sh_universe       - first/last as_of_date per orgpermid in the holdings table
#   2. sh_universe_dates - every distinct as_of_date that falls inside each
#                          orgpermid's first/last window
#   3. transposed_esg    - pivots the key/value ESG table into one column per factor
#   4. final SELECT      - LEFT JOINs (3) onto (2), so pairs with no ESG data are
#                          kept with NULL factors
# The as-of dates are de-duplicated before the range join: the join condition
# only involves the date, so a source table holding several rows per
# as_of_date would otherwise repeat every (orgpermid, as_of_date) output row.
esg_holdingdate_sql = f"""
    CREATE TABLE "{esg_as_of_dates_database}"."{esg_as_of_dates_table}"
    WITH (format = 'Parquet', parquet_compression = 'SNAPPY', external_location ='{esg_as_of_dates_table_s3_fullpath}')
    AS
    -- For each AsOfDate, figure out the most recent ESG Factors
    -- Assuming a 1 lag, that means ESG factors at least 1 year prior to AsOfDate
    -- But also but in a 1 year lookback prior since not all companies will report on that each 1-year lag date
    WITH sh_universe
    AS
    (
        SELECT orgpermid,
        MIN(as_of_date) AS First_date,
        MAX(as_of_date) AS last_date
        FROM "{sh_database}"."{sh_table}"
        GROUP BY orgpermid
    ),
    sh_universe_dates
    AS
    (
        SELECT u.orgpermid, d.as_of_date
        FROM sh_universe u
        INNER JOIN
        (
            -- One row per date, so the range join cannot fan out
            SELECT DISTINCT as_of_date
            FROM "{as_of_date_database}"."{as_of_date_ownership_table}"
        ) d
        ON d.as_of_date BETWEEN u.first_date AND u.last_date
    ),
    transposed_esg
    AS
    (
        SELECT orgpermid,
        as_of_date,
        {factor_transpose}
        FROM
        (
            SELECT orgpermid, as_of_date, map_agg(itemcode, esg_value) kv
            FROM "{esg_database}"."{esg_table}"
            GROUP BY orgpermid, as_of_date
        )
    )
    SELECT d.as_of_date, d.orgpermid, {all_factors_str} 
    FROM sh_universe_dates d
    LEFT OUTER JOIN transposed_esg e
    ON d.orgpermid = e.orgpermid
    AND d.as_of_date = e.as_of_date
    """

### Here's what the SQL looks like with all the variables in it

In [None]:
# Print (rather than echo the repr) so the newlines in the rendered SQL are
# shown as real line breaks and the statement can be read before it is run.
print(esg_holdingdate_sql) 

### Now connect to Athena and run the query

In [8]:
# Connect to Athena using the configured S3 staging directory, then take a
# cursor from that connection for running the statement.
athena_conn = pyathena.connect(s3_staging_dir=pyathena_staging)
athena_cur = athena_conn.cursor()

In [9]:
# Run the CTAS statement through the Athena cursor, then echo the query id
# the cursor reports for it (e.g. to look the execution up later).
athena_cur.execute(esg_holdingdate_sql)
submitted_query_id = athena_cur.query_id
print(submitted_query_id)

70827da2-a6a0-4a1e-ad6d-62f0c292f1e1
