In [None]:
import os

from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType

In [None]:
# Print the IPAddress lines from `docker inspect` for the `minioserver` container.
# NOTE(review): presumably the source of the hard-coded MinIO endpoint IP used in the
# Spark session cell — confirm the two still match after containers are recreated.
!docker inspect minioserver | grep IPAddress

In [None]:
# Print the IPAddress lines from `docker inspect` for the `spark-master` container.
# NOTE(review): presumably the source of the hard-coded Spark master IP used in the
# Spark session cell — confirm the two still match after containers are recreated.
!docker inspect spark-master | grep IPAddress

In [None]:
load_dotenv()

# Get credentials from .env and fail early with a clear message if they are missing,
# instead of passing None into the Spark configuration.
MINIO_USER = os.getenv("MINIO_ROOT_USER")
MINIO_PASSWORD = os.getenv("MINIO_ROOT_PASSWORD")
if not MINIO_USER or not MINIO_PASSWORD:
    raise RuntimeError(
        "MINIO_ROOT_USER and MINIO_ROOT_PASSWORD must be set (e.g. in .env)"
    )

# Container addresses on the Docker network. The defaults are the previously
# hard-coded values; override them via environment variables when the containers
# get different IPs (see the `docker inspect` cells above).
SPARK_MASTER_URL = os.getenv("SPARK_MASTER_URL", "spark://172.18.0.3:7077")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://172.18.0.4:9000")

# Create Spark session connecting to your Docker cluster.
# S3A options carry the `spark.hadoop.` prefix: Spark copies only keys with that
# prefix into the SparkContext's Hadoop configuration, which is what the direct
# FileSystem listing later in this notebook reads via spark._jsc.hadoopConfiguration().
spark = (SparkSession.builder
    .appName("MinIOUpload")
    .master(SPARK_MASTER_URL)  # Your Spark master in Docker
    .config("spark.jars",
            "./shared-data/hadoop-aws-3.3.4.jar,"
            "./shared-data/aws-java-sdk-bundle-1.12.792.jar")
    .config("spark.driver.extraClassPath",
            "./shared-data/hadoop-aws-3.3.4.jar:"
            "./shared-data/aws-java-sdk-bundle-1.12.792.jar")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)  # MinIO container endpoint
    .config("spark.hadoop.fs.s3a.access.key", MINIO_USER)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_PASSWORD)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate())

print("✓ Spark session created!")
print(f"Spark version: {spark.version}")

In [None]:
# Object-store locations (s3a:// URIs in the `datalake` bucket), not local paths:
# MINIO_SOURCE_PATH is the prefix listed for input parquet files;
# MINIO_TARGET_PATH is where the combined dataset is written.
MINIO_SOURCE_PATH = "s3a://datalake/staging/yellow_tripdata"
MINIO_TARGET_PATH = "s3a://datalake/raw-data/unpartitioned/yellow_tripdata_2023_full.parquet"

In [None]:
# List the entries under the source prefix using the Hadoop FileSystem API,
# reached through the JVM gateway of the running Spark session.
hadoop_fs_pkg = spark._jvm.org.apache.hadoop.fs
source_dir = hadoop_fs_pkg.Path(MINIO_SOURCE_PATH)
source_fs = hadoop_fs_pkg.FileSystem.get(
    source_dir.toUri(),
    spark._jsc.hadoopConfiguration(),
)
file_status = source_fs.listStatus(source_dir)

In [None]:
# Keep only the listed entries whose full path ends in `.parquet`.
listed_paths = (str(status.getPath()) for status in file_status)
parquet_files = [path for path in listed_paths if path.endswith('.parquet')]
print(parquet_files)

In [None]:
# Read each parquet file into its own DataFrame, widening every `int` column to
# LongType so the per-file schemas agree on integer width before they are unioned.
dfs = []
for file_path in parquet_files:
    print(f"Reading: {file_path}")
    df = spark.read.parquet(file_path)

    # Collect the int-typed column names first, then cast them one by one.
    int_columns = [name for name, dtype in df.dtypes if dtype == 'int']
    for name in int_columns:
        df = df.withColumn(name, col(name).cast(LongType()))

    dfs.append(df)

In [None]:
# Union all per-file DataFrames by column name; columns missing from a file are
# filled with nulls (allowMissingColumns=True).
if not dfs:
    # Without this guard, dfs[0] raises an opaque IndexError when the source
    # prefix contained no .parquet files.
    raise ValueError(f"No parquet files were read from {MINIO_SOURCE_PATH}")

combined_df = dfs[0]
for df in dfs[1:]:
    combined_df = combined_df.unionByName(df, allowMissingColumns=True)

In [None]:
# Report the row count of the combined dataset. The count must come from
# combined_df; `df` is only the last single-file DataFrame left over from the loops.
print(f"✓ Read {combined_df.count()} rows")
print("\nSchema:")
combined_df.printSchema()

In [None]:
# Preview the first 20 rows with long cell values truncated (the defaults of show()).
combined_df.show(n=20, truncate=True)

In [None]:
# Write the combined dataset to the target location, replacing any existing output,
# then shut the Spark session down.
print(f"\nWriting to: {MINIO_TARGET_PATH}")
single_partition_df = combined_df.coalesce(1)  # collapse to one partition before writing
parquet_writer = single_partition_df.write.mode("overwrite")
parquet_writer.parquet(MINIO_TARGET_PATH)

print("✓ Successfully uploaded to MinIO!")
spark.stop()