In [1]:
import os
import sys

from pyspark.sql import SparkSession

# Force Spark to use the exact Python executable running this notebook
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Point to your Spark installation
os.environ['SPARK_HOME'] = r'C:\spark'

# Windows-specific: Spark needs Hadoop binaries (winutils.exe)
# Ensure you have winutils.exe in C:\hadoop\bin
os.environ['HADOOP_HOME'] = r'C:\hadoop'

# Append the Hadoop bin directory to PATH only if it is not already there,
# so re-running this cell does not keep growing PATH with duplicates.
hadoop_bin = r'C:\hadoop\bin'
path_actual = os.environ.get('PATH', '')
if hadoop_bin not in path_actual.split(os.pathsep):
    os.environ['PATH'] = path_actual + os.pathsep + hadoop_bin if path_actual else hadoop_bin

# JVM flags passed to both driver and executors: open the java.nio and
# sun.nio.ch packages of java.base to unnamed modules.
java_add_opens = (
    "--add-opens=java.base/java.nio=ALL-UNNAMED "
    "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
)

# NOTE(review): with master "local[*]" everything runs in a single JVM, so
# spark.executor.memory presumably has no effect here -- confirm before tuning.
spark = (
    SparkSession.builder
    .appName("VentasFix")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.driver.extraJavaOptions", java_add_opens)
    .config("spark.executor.extraJavaOptions", java_add_opens)
    .getOrCreate()
)

# Low-level SparkContext handle
sc = spark.sparkContext


In [2]:
from pyspark import SparkContext

In [3]:
import os


def append_path_entry(path_value, entry, sep=os.pathsep):
    """Return ``path_value`` with ``entry`` appended unless it is already listed.

    Args:
        path_value: Current PATH-style string (may be empty).
        entry: Directory to add.
        sep: Separator between entries; defaults to the platform's ``os.pathsep``.

    Returns:
        ``path_value`` unchanged when ``entry`` is already one of its entries
        (compared with ``os.path.normcase``), otherwise ``path_value`` with
        ``entry`` appended after ``sep``.
    """
    existentes = [os.path.normcase(parte) for parte in path_value.split(sep)]
    if os.path.normcase(entry) in existentes:
        return path_value
    return path_value + sep + entry if path_value else entry


# This repeats the HADOOP_HOME / PATH setup already done in the first cell.
# The PATH update is idempotent so re-running the cell (or running it after
# the first cell) does not append the same directory again.
os.environ['HADOOP_HOME'] = "C:/hadoop"
os.environ['PATH'] = append_path_entry(os.environ.get('PATH', ''), r"C:\hadoop\bin")

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Primero se realiza la limpieza de datos sobre el DataFrame.
Es más fácil que hacerla sobre RDDs: los RDDs no se pueden modificar y no ofrecen operaciones por columna.

In [5]:
# Read the trip records from a local Parquet file into a Spark DataFrame
# (the file name suggests yellow-taxi trips for 2025-01).
# NOTE(review): absolute, user-specific path -- the notebook only runs on this machine.
ruta_viajes = "C:/Users/gema2/OneDrive/Escritorio/MAESTRIA/DatosMasivos/yellow_tripdata_2025-01.parquet"
df = spark.read.parquet(ruta_viajes)

In [6]:
from pyspark.sql.functions import col, year, month, dayofmonth, date_format, unix_timestamp

In [7]:
# Trip duration in minutes: difference between dropoff and pickup, both
# converted to epoch seconds, divided by 60.
segundos_recogida = unix_timestamp(col('tpep_pickup_datetime'))
segundos_llegada = unix_timestamp(col('tpep_dropoff_datetime'))
df = df.withColumn('duracion_viaje', (segundos_llegada - segundos_recogida) / 60)

In [8]:
# Remove columns that the rest of the notebook does not use.
# Later cells index columns by position, so this drop fixes those positions.
columnas_descartadas = ['RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID']
df = df.drop(*columnas_descartadas)

In [9]:
# Calendar year of the pickup timestamp, stored in its own column.
anio_recogida = year(col('tpep_pickup_datetime'))
df = df.withColumn('año', anio_recogida)

In [10]:
# Day of the month of the pickup timestamp.
dia_recogida = dayofmonth(col('tpep_pickup_datetime'))
df = df.withColumn('día', dia_recogida)

In [11]:
# Weekday name of the pickup timestamp, formatted with the "EEEE" pattern.
nombre_dia_recogida = date_format(col("tpep_pickup_datetime"), "EEEE")
df = df.withColumn("dia_semana_nombre", nombre_dia_recogida)

In [12]:
# Month number of the pickup timestamp.
mes_recogida = month(col('tpep_pickup_datetime'))
df = df.withColumn('mes', mes_recogida)

In [13]:
# Keep only plausible trips: at least one passenger, positive duration,
# positive total amount, and a pickup year of 2025.
tiene_pasajeros = col("passenger_count") > 0
duracion_positiva = col("duracion_viaje") > 0
importe_positivo = col("total_amount") > 0
es_de_2025 = col("año") == 2025
df_limpio = df.where(tiene_pasajeros & duracion_positiva & importe_positivo & es_de_2025)

In [14]:
# DataFrames are immutable: na.drop() returns a NEW DataFrame without the rows
# that contain nulls. The result must be assigned, otherwise df_limpio keeps
# every row and the cell has no effect on the data used downstream.
df_limpio = df_limpio.na.drop()
# Last expression: display the resulting DataFrame's schema summary.
df_limpio

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp_ntz, tpep_dropoff_datetime: timestamp_ntz, passenger_count: bigint, trip_distance: double, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, Airport_fee: double, cbd_congestion_fee: double, duracion_viaje: double, año: int, día: int, dia_semana_nombre: string, mes: int]

In [15]:
# Number of rows in the cleaned DataFrame (an action: this runs a Spark job).
df_limpio.count()

2848679

In [16]:
# RDD view of the cleaned DataFrame, used by the RDD-based computations below.
rdd = df_limpio.rdd

In [21]:
# RDD with one plain value per trip: its total_amount.
# The column is selected by name instead of by position (columns[12]) so the
# result does not silently change if columns are added, dropped or reordered.
ventas = df_limpio.select("total_amount").rdd.map(lambda fila: fila["total_amount"])

In [23]:
# Sum of every value in the `ventas` RDD (an action: this runs a Spark job).
total_ventas = ventas.sum()

In [24]:
# Number of elements in the `ventas` RDD (a second action over the same RDD).
numero_ventas = ventas.count()

In [27]:
# Average amount per trip: total divided by the number of trips.
# NOTE: raises ZeroDivisionError if numero_ventas is 0.
promedio_ventas = total_ventas / numero_ventas
print(f"Promedio de ventas es de: {promedio_ventas}")

Promedio de ventas es de: 27.78949766891006


In [None]:
# Compute how many passengers each vendor carried.
# Fields are read by name (VendorID, passenger_count) rather than by position
# (x[0], x[3]) so the result does not depend on the column order.
def par_vendor_pasajeros(fila):
    """Map one trip Row to a ``(vendor_id, passengers)`` pair.

    Args:
        fila: A Row exposing ``VendorID`` and ``passenger_count`` fields.

    Returns:
        A tuple of the vendor id as a string and the passenger count as an
        int. A passenger count whose string form is not all digits (for
        example ``None`` or a negative number) is counted as 0.
    """
    pasajeros = fila["passenger_count"]
    cantidad = int(pasajeros) if str(pasajeros).isdigit() else 0
    return (str(fila["VendorID"]), cantidad)


rdd_parejas = rdd.map(par_vendor_pasajeros)
# Add up the passenger counts of all trips that share the same vendor key.
rdd_sumado = rdd_parejas.reduceByKey(lambda x, y: x + y)

resultado = rdd_sumado.collect()  # collect() returns a list of tuples; noted by the author as usually more stable
print(dict(resultado))

{'1': 758038, '2': 2966613}
