Pull Athena Results into Pandas

Compute percentages, generate scorecard table, and export CSV

In [7]:
import boto3
import pandas as pd
from io import StringIO

s3 = boto3.client('s3')

bucket = "my-data-lake-lab-nandnioubt"
key = "athena-results/Unsaved/2026/01/03/0a58b0df-aeea-4428-aa4e-5a52eca1bb16.csv"

# Get S3 object
obj = s3.get_object(Bucket=bucket, Key=key)

# Read CSV into Pandas
df = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))

print(df.head())


   ratecodeid  vendorid  tpep_pickup_datetime  tpep_dropoff_datetime  \
0         1.0         2   1672533130000000000    1672533636000000000   
1         1.0         2   1672534508000000000    1672534887000000000   
2         1.0         2   1672532704000000000    1672533469000000000   
3         1.0         2   1672531829000000000    1672532479000000000   
4         1.0         2   1672534234000000000    1672534972000000000   

   passenger_count  trip_distance store_and_fwd_flag  pulocationid  \
0              1.0           0.97                  N           161   
1              1.0           1.10                  N            43   
2              1.0           2.51                  N            48   
3              1.0           1.43                  N           107   
4              1.0           1.84                  N           161   

   dolocationid  payment_type  ...  pickup_zone_name      dropoff_zone_name  \
0           141             2  ...    Midtown Center        Lenox H

In [9]:
# Example: compute some metrics
scorecard = pd.DataFrame({
    'Quality Dimension': ['Payment type valid %','Store-and-forward valid %','Vendor/Rate code completeness'],
    'Owner': ['Data Eng','Data Eng','MDM Team'],
    'Threshold': ['>= 99%','>= 99%','100%'],
    'Value (%)': [
        (df['is_valid_payment_type']=='Y').mean()*100,
        (df['is_valid_store_fwd']=='Y').mean()*100,
        (~df['vendor_name'].isnull() & ~df['rate_code_desc'].isnull()).mean()*100
    ],
    'Action on Failure': ['Reject batch','Alert','Block load']
})

print(scorecard)

# Save scorecard back to S3
s3.put_object(
    Bucket=bucket,
    Key="curated/nyc_taxi/scorecard.csv",
    Body=scorecard.to_csv(index=False)
)


               Quality Dimension     Owner Threshold   Value (%)  \
0           Payment type valid %  Data Eng    >= 99%  100.000000   
1      Store-and-forward valid %  Data Eng    >= 99%  100.000000   
2  Vendor/Rate code completeness  MDM Team      100%   99.958566   

  Action on Failure  
0      Reject batch  
1             Alert  
2        Block load  


{'ResponseMetadata': {'RequestId': 'G7TYQG3GM2T7DFM0',
  'HostId': 'nLHjhuGn7biXIQiUKW7eoW/b0M/IPtRGM9Nt/JykwWX1ZDpWRg2JnNz5tw5bwGIbzEqPhwNz4Ks=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'nLHjhuGn7biXIQiUKW7eoW/b0M/IPtRGM9Nt/JykwWX1ZDpWRg2JnNz5tw5bwGIbzEqPhwNz4Ks=',
   'x-amz-request-id': 'G7TYQG3GM2T7DFM0',
   'date': 'Sun, 04 Jan 2026 05:24:20 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"16a1b8c26b695a74b0147c8e8c4a279f"',
   'x-amz-checksum-crc32': 'Niu2KA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"16a1b8c26b695a74b0147c8e8c4a279f"',
 'ChecksumCRC32': 'Niu2KA==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}