# TLC Data Analytics 

This project generates analytics data from the ETL data.


In [None]:
# Initialize the Glue environment

%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 16

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; it is used later to read the
# source table through the Glue catalog.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Job handle; job.commit() is called on it at the end of the notebook.
job = Job(glueContext)

import os
# Detect if the code is running in a Glue job
def is_glue_job():
    """Report whether this code is running as a Glue job.

    The run is treated as a Glue job when ``JOB_NAME`` can be resolved from
    ``sys.argv``. As a side effect the resolved job name is printed.

    Returns:
        bool: True if ``JOB_NAME`` was resolved, False otherwise.
    """
    try:
        job_name = getResolvedOptions(sys.argv, ['JOB_NAME'])['JOB_NAME']
    except (Exception, SystemExit):
        # Resolution failed, so no JOB_NAME was supplied to this run.
        # SystemExit is caught as well in case the argument parser exits on
        # a missing option; KeyboardInterrupt is deliberately NOT swallowed
        # (the previous bare ``except:`` hid it).
        return False
    print("JOB_NAME: ", job_name)
    return True

print("Running in Glue Job: ", is_glue_job())


## 1. Data Aggregation
We will run the following analysis on the data:
- the sum of data grouped by date and rider (e.g., the number of trips, the total driver_pay)
- the average of data grouped by date and rider (e.g., the average trip distance, the average trip duration)

In [7]:
# Read the "fhvhv_final" table of the "tlc" catalog database as a Glue
# DynamicFrame and convert it to a Spark DataFrame.
df = glueContext.create_dynamic_frame.from_catalog(
    database="tlc",
    table_name="fhvhv_final",
).toDF()

# The schema dump is only shown when this is not running as a Glue job.
if not is_glue_job():
    print("Schema before transformation:")
    df.printSchema()

# Report how many rows were loaded.
print("Number of rows:", df.count())

In [None]:
# Namespaced import: `from pyspark.sql.functions import sum` would shadow the
# Python builtin `sum` for every later cell of the notebook.
from pyspark.sql import functions as F


def _fixed_2dp(expr):
    """Return *expr* cast to a numeric decimal column with two fractional digits.

    This replaces ``format_number(expr, 2)``, which yields a *string* with
    thousands separators (e.g. "1,234,567.89"). Such values are no longer
    numbers once written to CSV and need extra cleaning downstream.
    """
    return expr.cast("decimal(38,2)")


# One output row per (year, month, day, rider) with the trip count, totals and
# averages. Column names and order are unchanged; trip_time aggregates are
# kept as raw values, exactly as before.
grouped_df = df.groupBy("year", "month", "day", "rider").agg(
    F.count("*").alias("count"),
    _fixed_2dp(F.sum("trip_km")).alias("total_trip_km"),
    F.sum("trip_time").alias("total_trip_time"),
    _fixed_2dp(F.sum("base_passenger_fare")).alias("total_base_passenger_fare"),
    _fixed_2dp(F.sum("tolls")).alias("total_tolls"),
    _fixed_2dp(F.sum("sales_tax")).alias("total_sales_tax"),
    _fixed_2dp(F.sum("congestion_surcharge")).alias("total_congestion_surcharge"),
    _fixed_2dp(F.sum("tips")).alias("total_tips"),
    _fixed_2dp(F.sum("driver_pay")).alias("total_driver_pay"),
    _fixed_2dp(F.avg("trip_km")).alias("avg_trip_km"),
    F.avg("trip_time").alias("avg_trip_time"),
    _fixed_2dp(F.avg("base_passenger_fare")).alias("avg_base_passenger_fare"),
    _fixed_2dp(F.avg("tolls")).alias("avg_tolls"),
    _fixed_2dp(F.avg("sales_tax")).alias("avg_sales_tax"),
    _fixed_2dp(F.avg("congestion_surcharge")).alias("avg_congestion_surcharge"),
    _fixed_2dp(F.avg("tips")).alias("avg_tips"),
    _fixed_2dp(F.avg("driver_pay")).alias("avg_driver_pay"),
)

# Display the result
grouped_df.show(5)

In [None]:
from pyspark.sql.functions import col, concat, lit, to_timestamp

# Join year, month and day into a "year-month-day" string and parse it into
# a timestamp stored in the new "date" column.
date_text = concat(col("year"), lit("-"), col("month"), lit("-"), col("day"))

# The three source columns are dropped once "date" has been added.
grouped_df = (
    grouped_df
    .withColumn("date", to_timestamp(date_text))
    .drop("year", "month", "day")
)

grouped_df.show(5)

In [None]:
# Write the result to S3 as a single CSV part file with a header row.
# mode="overwrite" replaces the previous output: with the default save mode
# the write raises when the target path already exists, so every rerun of
# this notebook/job would fail at this step.
# NOTE(review): the path spelling ("aggreated") is kept as-is because
# existing consumers may read from this location.
grouped_df.coalesce(1).write.csv(
    "s3://qiaoshi-aws-ml/tlc/results/aggreated/",
    header=True,
    mode="overwrite",
)

In [None]:
# Commit the job to signal to Glue that it completed successfully.
job.commit()

# Final marker, printed only after the commit call has returned.
print("Done!")