In [5]:
!pip install minio python-dotenv pandas



In [6]:
from pathlib import Path
import pandas as pd
from minio import Minio
from minio.error import S3Error
import io
from dotenv import load_dotenv
from pathlib import Path
import os

In [7]:
# Load environment variables from a .env file located in the current working
# directory. found_dotenv is the boolean returned by load_dotenv.
dotenv_path = Path('.') / '.env'
found_dotenv = load_dotenv(dotenv_path=dotenv_path)

# --- MinIO Connection Details ---
# These should match the credentials from your docker-compose.yml file
MINIO_ENDPOINT = "minio:9000"
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")
MINIO_BUCKET_NAME = "social-media-data"  # A new bucket for this data

if not found_dotenv:
    print(f"Warning: .env file not found at {dotenv_path.resolve()}.")
    print("Please ensure the .env file is in the same directory as your notebook.")

# The credentials can be absent even when a .env file was loaded (or present
# without one, if they are exported in the environment), so check the values
# themselves instead of relying on the .env warning alone. Without this the
# problem only shows up later as a failed upload.
_missing_credentials = [
    name
    for name, value in (
        ("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY),
        ("MINIO_SECRET_KEY", MINIO_SECRET_KEY),
    )
    if not value
]
if _missing_credentials:
    print(
        "Warning: missing MinIO credential(s): "
        f"{', '.join(_missing_credentials)}. "
        "Set them in the .env file or in the environment."
    )





def get_minio_client(*, secure=False):
    """Initializes and returns a MinIO client.

    The client is built from the module-level MINIO_ENDPOINT,
    MINIO_ACCESS_KEY and MINIO_SECRET_KEY values.

    Args:
        secure (bool): Passed through to ``Minio(secure=...)``. Set to True
            if using HTTPS. Defaults to False, which matches the previous
            hard-coded behaviour.

    Returns:
        The constructed ``Minio`` client, or ``None`` if constructing it
        raised an exception (the error is printed, not re-raised).
    """
    print(f"Connecting to MinIO at {MINIO_ENDPOINT}...")
    try:
        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=secure,
        )
    except Exception as e:
        # Callers treat None as "no client available" and abort gracefully.
        print(f"Error initializing MinIO client: {e}")
        return None

    print("MinIO client initialized.")
    return client

def process_social_media_csv_to_minio(file_path, object_name="social_media.parquet"):
    """
    Reads a CSV file, displays it, and uploads it as a Parquet file to MinIO.

    Read, connection and upload failures are reported with printed messages
    and end the function early; they are not re-raised.

    Args:
        file_path (str or Path): The local path to the CSV file.
        object_name (str): Name of the object written to the
            MINIO_BUCKET_NAME bucket. Defaults to "social_media.parquet",
            the name that was previously hard-coded.

    Returns:
        None
    """
    print(f"--- Starting processing for {file_path} ---")

    # 1. Read the CSV file using Pandas
    try:
        print(f"Reading CSV file from '{file_path}'...")
        df = pd.read_csv(file_path)
        print("CSV file loaded successfully.")
    except FileNotFoundError:
        print(f"ERROR: The file '{file_path}' was not found.")
        return
    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")
        return

    # 2. Display the DataFrame on the screen
    # NOTE: `display` is not imported in this file; it is expected to be
    # supplied by the IPython/Jupyter session the notebook runs in.
    print("\nDisplaying the first 5 rows of the data:")
    display(df.head())

    # 3. Connect to MinIO
    minio_client = get_minio_client()
    if not minio_client:
        print("Aborting upload due to MinIO connection failure.")
        return

    # 4. Save the DataFrame as a Parquet file to MinIO
    try:
        # Ensure the bucket exists
        if minio_client.bucket_exists(MINIO_BUCKET_NAME):
            print(f"Bucket '{MINIO_BUCKET_NAME}' already exists.")
        else:
            minio_client.make_bucket(MINIO_BUCKET_NAME)
            print(f"Bucket '{MINIO_BUCKET_NAME}' created.")

        # Convert DataFrame to Parquet format in-memory
        parquet_buffer = io.BytesIO()
        df.to_parquet(parquet_buffer, index=False)
        # Measure the payload through a buffer view; getvalue() would copy
        # the whole Parquet file just to take its length.
        payload_size = parquet_buffer.getbuffer().nbytes
        parquet_buffer.seek(0)  # Rewind the buffer to the beginning

        # Upload the Parquet file to MinIO
        minio_client.put_object(
            MINIO_BUCKET_NAME,
            object_name,
            data=parquet_buffer,
            length=payload_size,
            content_type='application/octet-stream'
        )
        print(f"\nSuccessfully uploaded '{object_name}' to MinIO bucket '{MINIO_BUCKET_NAME}'.")

    except S3Error as s3_err:
        print(f"An S3 error occurred with MinIO: {s3_err}")
    except Exception as e:
        print(f"An unexpected error occurred during the MinIO upload: {e}")

# Input CSV, given relative to the notebook's current working directory.
file_path = Path('social_media_engagement.csv')

# Run the CSV -> Parquet -> MinIO pipeline for that file.
process_social_media_csv_to_minio(file_path)


--- Starting processing for social_media_engagement.csv ---
Reading CSV file from 'social_media_engagement.csv'...
ERROR: The file 'social_media_engagement.csv' was not found.
