### This notebook applies 3 main fixes to the point-in-time tables
1) Replacing 'null' with the actual value NULL

2) Strongly typing columns

3) For the very first EffectiveFromDate of each company/period/item, replace it with PeriodEndDate + 1 year if that date is earlier.  This is because point-in-time tracking began well after ESG data started being recorded.

In [1]:
# --- Input: database and table that the fix-up query reads from ---
esg_database = "YOUR_SCHEMA"
esg_table = "esgpitscores_csv"

# --- Output: database, table name and S3 location of the table to create ---
fixed_esg_database = "YOUR_SCHEMA"
fixed_esg_table = "esgpitscores_fixed"
fixed_esg_table_s3_dir = "s3://YOUR_BUCKET/data/repo/esg_pit/"
# Directory and table name are joined with no separator, so the directory
# value above must keep its trailing "/".
fixed_esg_table_s3_fullpath = f"{fixed_esg_table_s3_dir}{fixed_esg_table}"

# S3 prefix passed to pyathena as its staging directory.
pyathena_staging = "s3://YOUR_BUCKET/data/repo/esg_pit/pyathena/"

# ===================================
# DON'T CHANGE ANYTHING BELOW HERE
# ===================================

In [2]:
!pip install pyathena 



In [3]:
import pyathena 

In [4]:
# Build the CREATE TABLE ... AS statement that writes the corrected table as
# Snappy-compressed Parquet at `fixed_esg_table_s3_fullpath`.
#
# * `firstdates` CTE: for each (organizationid, itemcode, periodenddate) group
#   it takes the earliest effectivefromdate (`first_date`) and computes
#   `effectivefromdate_new` = LEAST(that date, periodenddate + 1 year).
# * Main SELECT: casts columns to typed values and maps the literal string
#   'null' to SQL NULL. The LEFT JOIN matches only the row whose
#   effectivefromdate equals the group's `first_date`; that row receives
#   `effectivefromdate_new`, and every other row falls back to its own
#   effectivefromdate through COALESCE.
# * The score goes through NULLIF(..., 'null') before the CAST so that a
#   literal 'null' becomes NULL instead of making the DOUBLE cast fail, in line
#   with how the grade and effectivetodate columns are handled.
fixing_sql = f"""
    CREATE TABLE "{fixed_esg_database}"."{fixed_esg_table}"
    WITH (format = 'Parquet', parquet_compression = 'SNAPPY', external_location ='{fixed_esg_table_s3_fullpath}')
    AS
    WITH firstdates
    AS
    (
        SELECT organizationid,
        itemcode,
        periodenddate AS periodenddate,
        MIN(effectivefromdate) AS first_date,
        DATE_ADD('year',1, DATE(periodenddate)) AS lagged_date,
        LEAST(DATE(SUBSTRING(MIN(effectivefromdate),1,10)), DATE_ADD('year',1, DATE(periodenddate))) AS effectivefromdate_new
        FROM "{esg_database}"."{esg_table}"
        GROUP BY organizationid, itemcode, periodenddate
    ) 
    SELECT CAST(e.organizationid AS BIGINT) AS organizationid,
    e.financialperiodid,
    DATE(e.periodenddate) AS periodenddate,
    e.periodtype,
    CAST(e.year AS INT) AS year,
    e.itemcode,
    CAST(NULLIF(e.esganalyticvaluescore,'null') AS DOUBLE) AS esganalyticvalue,
    NULLIF(e.esganalyticvaluescoregrade,'null') AS esganalyticvaluescoregrade,
    COALESCE(f.effectivefromdate_new, DATE(SUBSTRING(e.effectivefromdate,1,10))) AS effectivefromdate,
    DATE(SUBSTRING(NULLIF(e.effectivetodate,'null'),1,10)) AS effectivetodate
    FROM "{esg_database}"."{esg_table}" e
    LEFT OUTER JOIN firstdates f
    ON e.organizationid = f.organizationid
    AND e.itemcode = f.itemcode
    AND e.periodenddate = f.periodenddate
    AND e.effectivefromdate = f.first_date
    """

### Here's what the SQL looks like with all the variables in it

In [None]:
# Show the fully interpolated statement so it can be reviewed before it is run.
print(fixing_sql) 

### Now connect to Athena and run the query

In [6]:
# Open the Athena connection first, then take a cursor from it; the staging
# directory comes from the configuration cell.
athena_conn = pyathena.connect(s3_staging_dir=pyathena_staging)
athena_cur = athena_conn.cursor()

In [7]:
%%time
# Submit the CREATE TABLE ... AS statement through the Athena cursor; the
# %%time cell magic above reports how long the cell takes.
athena_cur.execute(fixing_sql)
# Print the query id of the statement just executed
# (presumably so the run can be looked up in Athena afterwards).
print(athena_cur.query_id) 

ada4ea13-7751-4773-bfb0-515880336c53
CPU times: user 96.9 ms, sys: 718 µs, total: 97.6 ms
Wall time: 22.9 s
