#### Raw Data Storage
1. Create the data bucket in AWS S3.
2. Ingest the raw data, timestamped, into the `raw_data/` folder, and also download it locally.
3. Generate a log report.

In [1]:
import os
import logging
import boto3
from botocore.exceptions import NoCredentialsError
from datetime import datetime

# AWS S3 configuration.
# S3_BUCKET: name of the bucket that upload_to_s3 writes to.
# S3_FOLDER: key prefix for uploaded raw files. The trailing slash matters
# because push_raw_data_to_s3 builds object keys by plain string concatenation.
S3_BUCKET = "dmml-bank-churn-data"
S3_FOLDER = "raw_data/"

def setup_logging():
    """Configure root logging to write raw-data-storage messages to a file.

    Creates the parent directory of ``logs/raw_data_storage.log`` when it is
    missing, then calls ``logging.basicConfig`` with that file, the INFO
    level, and a ``time - level - message`` line format.
    """
    log_path = "logs/raw_data_storage.log"

    # The directory must exist before basicConfig tries to open the file.
    log_dir, _ = os.path.split(log_path)
    os.makedirs(log_dir, exist_ok=True)

    config = {
        "filename": log_path,
        "level": logging.INFO,
        "format": "%(asctime)s - %(levelname)s - %(message)s",
    }
    logging.basicConfig(**config)

# Module-level S3 client, created once when this code runs and shared by
# upload_to_s3. No credentials or region are passed explicitly here.
# NOTE(review): presumably boto3 resolves them from the environment - confirm.
s3 = boto3.client("s3")

def upload_to_s3(file_path, s3_key):
    """Upload one local file to the configured S3 bucket.

    This function performs a plain upload; any "versioning" comes only from
    the key the caller chooses (e.g. a timestamp prefix).

    Args:
        file_path: Path of the local file to upload.
        s3_key: Object key to store the file under in ``S3_BUCKET``.

    Returns:
        bool: True when the upload call completed, False when it failed.
        Failures are logged instead of raised, so callers that ignore the
        return value behave exactly as before.
    """
    try:
        s3.upload_file(file_path, S3_BUCKET, s3_key)
    except NoCredentialsError:
        logging.error("AWS credentials not found.")
        return False
    except Exception as e:
        # Deliberately broad: one bad file must not abort a whole batch, but
        # the failure is reported to the caller through the return value.
        logging.error(f"Failed to upload {file_path} to S3: {e}")
        return False
    else:
        logging.info(f"Uploaded {file_path} to S3 as {s3_key}.")
        return True

def push_raw_data_to_s3():
    """Upload every regular file in the local ``raw_data/`` directory to S3.

    Each file is uploaded under the key
    ``{S3_FOLDER}{timestamp}_{filename}``, where ``timestamp`` is taken once
    per call so all files of one run share the same prefix.

    Only regular files directly inside ``raw_data/`` are uploaded;
    subdirectories (for example a Jupyter ``.ipynb_checkpoints`` folder) are
    skipped because they cannot be uploaded as a single object. Files are
    processed in sorted name order so runs are deterministic.

    Returns:
        None. A missing directory is logged as an error and an empty one as
        a warning; neither raises.
    """
    raw_data_dir = "raw_data/"
    # isdir (not exists): a regular file named "raw_data" would pass an
    # existence check and then make os.listdir raise.
    if not os.path.isdir(raw_data_dir):
        logging.error("Raw data directory does not exist. Please run data ingestion first.")
        return

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    file_names = sorted(
        name
        for name in os.listdir(raw_data_dir)
        if os.path.isfile(os.path.join(raw_data_dir, name))
    )
    if not file_names:
        logging.warning("Raw data directory contains no files to upload.")
        return

    for name in file_names:
        local_file_path = os.path.join(raw_data_dir, name)
        s3_key = f"{S3_FOLDER}{timestamp}_{name}"
        upload_to_s3(local_file_path, s3_key)

def main():
    """Run the raw-data upload, bracketed by start and completion log lines.

    The completion message is logged whenever ``push_raw_data_to_s3``
    returns; this function does not inspect the outcome of the upload.
    """
    start_msg = "Starting raw data upload to S3..."
    done_msg = "Raw data upload completed successfully."

    logging.info(start_msg)
    push_raw_data_to_s3()
    logging.info(done_msg)

# Entry point: runs when this code is executed directly (__name__ is
# "__main__"). Logging is configured before main() so that main()'s
# messages are emitted under that configuration.
if __name__ == "__main__":
    setup_logging()
    main()


In [2]:
# List the top-level contents of the bucket via the AWS CLI.
!aws s3 ls s3://dmml-bank-churn-data


                           PRE eda_reports/
                           PRE processed_data/
                           PRE raw_data/
                           PRE reports/
                           PRE transformed_data/


File association not found for extension .py


In [3]:
# List the objects under the raw_data/ prefix to inspect the timestamped uploads.
!aws s3 ls s3://dmml-bank-churn-data/raw_data/


2025-03-08 19:19:47    8196887 2025-03-08_19-19-46_api_data.csv
2025-03-08 19:19:48    8196887 2025-03-08_19-19-46_kaggle_data.csv
2025-03-08 19:19:49    8086863 2025-03-08_19-19-46_test.csv
2025-03-08 19:19:50   12350130 2025-03-08_19-19-46_train.csv
2025-03-08 20:20:16    8196887 2025-03-08_20-20-15_api_data.csv
2025-03-08 20:20:17    8086863 2025-03-08_20-20-15_kaggle_data.csv
2025-03-08 20:20:18   12350130 2025-03-08_20-20-15_train.csv
2025-03-08 21:49:24    8196887 2025-03-08_21-49-23_api_data.csv
2025-03-08 21:49:25    8086863 2025-03-08_21-49-23_kaggle_data.csv
2025-03-08 21:49:26    8086863 2025-03-08_21-49-23_test.csv
2025-03-08 21:49:27   12350130 2025-03-08_21-49-23_train.csv
2025-03-09 12:19:37    8196887 2025-03-09_12-19-35_api_data.csv
2025-03-09 12:19:39    8086863 2025-03-09_12-19-35_kaggle_data.csv
2025-03-09 12:19:39    8086863 2025-03-09_12-19-35_test.csv
2025-03-09 12:19:40   12350130 2025-03-09_12-19-35_train.csv
2025-03-11 11:37:01    8196887 2025-03-11_11-36-59_a

File association not found for extension .py
