In [None]:
# Parameters
# Placeholders for the pipeline's input and output locations. Both start as
# None; the effective values are assigned in the "Load Config" section below.
# NOTE(review): this looks like a papermill-style parameters cell — confirm
# whether values are expected to be injected here at run time.
input_path = None
output_path = None


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, concat_ws
import logging
import os
import yaml

# ------------------ Load Config ------------------
# The config file location comes from INSURAFLOW_CONFIG_PATH when it is set;
# otherwise it is derived from the current working directory.
config_path = os.environ.get("INSURAFLOW_CONFIG_PATH")

if not config_path:
    # Fallback: build the path manually, treating the directory two levels
    # above the working directory as the project root.
    try:
        notebook_dir = os.getcwd()  # works in notebooks
        project_root = os.path.abspath(os.path.join(notebook_dir, "..", ".."))
        config_path = os.path.join(project_root, "config", "config.yaml")
    except OSError as e:
        # os.getcwd() raises OSError when the working directory is unavailable.
        raise FileNotFoundError("Failed to resolve config path.") from e

# Explicit encoding so parsing does not depend on the platform default.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Values already supplied through the parameters at the top of the file take
# precedence; the config only fills in the ones still left as None/empty.
input_path = input_path or config["paths"]["cleaned_data"]
output_path = output_path or config["paths"]["transformed_data"]
log_file = config["log_file"]


# ------------------ Logger Setup ------------------
# Create the log directory only when log_file has a directory component:
# os.makedirs("") raises FileNotFoundError for a bare file name.
log_dir = os.path.dirname(log_file)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logger = logging.getLogger("data_transformation")
# Check this logger's own handlers only. hasHandlers() also reports handlers
# attached to ancestor loggers, which would skip the file handler whenever the
# root logger is already configured. The guard still prevents a duplicate
# handler when this code is executed more than once.
if not logger.handlers:
    handler = logging.FileHandler(log_file)
    formatter = logging.Formatter("[TRANSFORMATION] %(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# Set the level unconditionally so INFO records are not filtered by an
# inherited level.
logger.setLevel(logging.INFO)

# ------------------ Spark Session ------------------

# Session options as (key, value) pairs, applied to the builder in this order.
_spark_settings = (
    ("spark.sql.shuffle.partitions", "2"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2"),
    ("spark.hadoop.io.native.lib.available", "false"),
    ("spark.speculation", "false"),
)

_builder = SparkSession.builder.appName("InsuranceTransformer")
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)

# Reuses an already-running session if one exists, otherwise creates one.
spark = _builder.getOrCreate()

logger.info("Spark session started.")


# ------------------ Read Cleaned JSON ------------------
# Load the cleaned dataset from the configured input location into `df`.
try:
    # Log the path actually being read rather than a fixed file name, so the
    # log stays accurate when the configured input changes.
    logger.info("Starting file upload : %s", input_path)
    df = spark.read.json(input_path)
except Exception:
    # logger.exception records the traceback together with the message.
    logger.exception("Failed to load cleaned data from %s", input_path)
    raise
else:
    logger.info("Loaded cleaned insurance data.")


# ------------------ Transformation Functions ------------------

def categorize_risk(df):
    """Return ``df`` with an added ``risk_category`` column.

    The category is derived from ``claim_amount``:

    * greater than 10000 -> "High"
    * greater than 1000 and at most 10000 -> "Medium"
    * rows matching neither condition -> "Low"
    """
    amount = col("claim_amount")
    is_high = amount > 10000
    is_medium = (amount > 1000) & (amount <= 10000)
    risk = when(is_high, "High").when(is_medium, "Medium").otherwise("Low")
    return df.withColumn("risk_category", risk)

def create_full_address(df):
    """Return ``df`` with a ``full_address`` column.

    The new column joins the ``address`` and ``zip_code`` columns with ", ".
    """
    address_parts = [col(name) for name in ("address", "zip_code")]
    full_address = concat_ws(", ", *address_parts)
    return df.withColumn("full_address", full_address)

# ------------------ Drop Unwanted Columns ------------------
# DataFrame.drop does nothing when the named column is absent, so compare the
# column count before and after and only report success when a column was
# actually removed.
try:
    logger.info("Dropping contact_number column as it's no longer needed.")
    column_count_before = len(df.columns)
    df = df.drop("contact_number")
    if len(df.columns) < column_count_before:
        logger.info("Dropped contact_number column successfully.")
    else:
        logger.info("contact_number column not found; nothing was dropped.")
except Exception as e:
    # Best effort: a failure here is logged and the pipeline continues.
    logger.warning(f"Could not drop contact_number column: {e}")



# ------------------ Execute Transformations ------------------
# Run each transformation in order, logging before and after every step.
# Any failure is logged and re-raised.
try:
    _steps = (
        ("Starting categorizing risk", categorize_risk, "Categorized risk."),
        ("Starting creation of full address", create_full_address, "Created Full Address"),
    )
    for _start_msg, _transform, _done_msg in _steps:
        logger.info(_start_msg)
        df = _transform(df)
        logger.info(_done_msg)

    logger.info("Transformations applied successfully.")
except Exception as e:
    logger.error(f"Error during transformation: {e}")
    raise

# ------------------ Save Transformed Data ------------------
# Reorder the columns (known source columns first, columns added by the
# transformations last) and write the result as line-delimited JSON via pandas.
try:
    logger.info("Starting file save procedure: transformed_insurance_data.json")

    # Preferred column order for the columns coming from the source data.
    original_order = [
        "policy_id", "customer_id", "customer_name", "gender", "date_of_birth", "age",
        "policy_type", "coverage_amount", "premium_amount", "payment_frequency",
        "policy_start_date", "policy_end_date",
        "claim_id", "claim_date", "claim_amount", "claim_reason", "claim_status",
        "agent_id", "agent_name", "region",
        "customer_income", "marital_status", "number_of_dependents",
        "vehicle_type", "vehicle_age",
        "property_type", "property_value",
        "health_condition", "smoker", "employment_status", "education_level",
        "policy_status", "last_payment_date", "next_due_date", "renewal_flag", "email_address", "address", "zip_code"
    ]

    # Read the schema once and use sets for the membership checks.
    current_columns = df.columns
    known_columns = set(original_order)
    present_columns = set(current_columns)

    # Dynamically find new transformation columns
    new_columns = [c for c in current_columns if c not in known_columns]
    final_order = original_order + new_columns

    # Select only existing columns
    existing_columns = [c for c in final_order if c in present_columns]
    df = df.select(*existing_columns)

    logger.info(f"New columns added during transformation: {new_columns}")

    # Make sure the destination directory exists, mirroring what is done for
    # the log file. Skipped for bare file names and for URL-style targets,
    # where a local directory would be meaningless.
    output_dir = os.path.dirname(str(output_path))
    if output_dir and "://" not in str(output_path):
        os.makedirs(output_dir, exist_ok=True)

    # NOTE: toPandas() collects the entire dataset onto the driver, so this
    # write path is only suitable for data that fits in driver memory.
    df_pd = df.toPandas()
    df_pd.to_json(output_path, orient="records", lines=True)
    logger.info(f"Transformed data written to {output_path} using pandas.")
except Exception as e:
    # logger.exception keeps the traceback alongside the message.
    logger.exception("Failed to write transformed data: %s", e)
    raise


  PyArrow >= 4.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
