# Estudo exploratório inicial com dados de táxi da NYC usando PySpark

In [None]:
import os
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, current_timestamp

In [None]:
# Local file name for the January 2025 yellow-taxi trip data; the remote
# object uses the same name, so the URL is derived from it.
parquet_path = "yellow_tripdata_2025-01.parquet"
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/" + parquet_path

def download_file(url, file_path):
    """Download ``url`` to ``file_path`` unless that file already exists.

    The response is streamed to a temporary ``<file_path>.part`` file and
    only renamed to ``file_path`` once the transfer completed, so a failed
    or interrupted download never leaves a truncated file that a later run
    would mistake for a valid cached copy.

    Args:
        url: HTTP(S) address of the file to fetch.
        file_path: Destination path on the local filesystem.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts, or a non-2xx HTTP status.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(file_path):
        print("Arquivo já existe.")
        return

    print("Baixando arquivo...")
    partial_path = str(file_path) + ".part"
    try:
        # stream=True avoids holding the whole file in memory; the timeout
        # is (connect, read-between-chunks) in seconds so a stalled
        # connection fails instead of hanging forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail loudly on 4xx/5xx instead of saving the error page.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename the partial file no longer exists, so
        # this only cleans up after a failure.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download concluído.")

# Fetch the dataset once; later runs reuse the local copy.
download_file(url=url, file_path=parquet_path)

In [None]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Exploração Dados Táxi NYC")
    .getOrCreate()
)

In [None]:
# Load the downloaded parquet file and take a first look at its schema and
# first five rows.
df = spark.read.format("parquet").load(parquet_path)
df.printSchema()
df.show(n=5)

In [None]:
def _as_timestamp(frame, column_name):
    """Return a timestamp expression for ``column_name`` of ``frame``.

    Columns that are already temporal (``timestamp``, ``timestamp_ntz`` or
    ``date``) are cast directly, because dividing them by 1000 is rejected
    by Spark with a data type mismatch. Any other column type keeps the
    original interpretation: the value is treated as epoch milliseconds and
    divided by 1000 to get the seconds a numeric-to-timestamp cast expects.
    """
    source_type = dict(frame.dtypes)[column_name]
    source = col(column_name)
    if source_type in ("timestamp", "timestamp_ntz", "date"):
        return source.cast("timestamp")
    return (source / 1000).cast("timestamp")


# NOTE(review): the TLC parquet files presumably store the tpep_* columns as
# timestamps rather than epoch milliseconds; confirm with df.printSchema().
df = df.withColumn("pickup_datetime", _as_timestamp(df, "tpep_pickup_datetime"))
df = df.withColumn("dropoff_datetime", _as_timestamp(df, "tpep_dropoff_datetime"))

# Trip duration in minutes: difference of the epoch seconds divided by 60.
df = df.withColumn(
    "trip_duration",
    (col("dropoff_datetime").cast("long") - col("pickup_datetime").cast("long")) / 60,
)

# Replace missing values with defaults before casting.
df = df.fillna({"passenger_count": 1, "fare_amount": 0, "tip_amount": 0, "total_amount": 0})

# NOTE(review): "float" is 32-bit and loses precision if the source columns
# are doubles; kept as-is so the resulting schema does not change.
df = (
    df.withColumn("trip_distance", col("trip_distance").cast("float"))
    .withColumn("fare_amount", col("fare_amount").cast("float"))
    .withColumn("tip_amount", col("tip_amount").cast("float"))
    .withColumn("total_amount", col("total_amount").cast("float"))
)

# Processing timestamp, and the pickup calendar date derived from
# pickup_datetime.
df = df.withColumn("last_updated", current_timestamp())
df = df.withColumn("pickup_date", to_date(col("pickup_datetime")))

In [None]:
# Show the first five rows of the derived timestamps and the main amounts.
preview_columns = [
    "pickup_datetime",
    "dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "tip_amount",
]
df.select(*preview_columns).show(5)