In [71]:
from pyspark.sql.functions import col, upper, lower, to_date, date_format, regexp_replace, udf, substring
from pyspark.sql.types import FloatType, IntegerType, StringType, DateType
import datetime
import re

def applyFieldConversions(pDF, aAsset):
    def log(msg):
        print(msg)

    df = pDF

    def cast_to(target_type):
        type_map = {
            "String": StringType(),
            "Int": IntegerType(),
            "Float": FloatType(),
            "Date": DateType()
        }
        return lambda c: c.cast(type_map[target_type])

    def date_to_days_udf():
        def date_to_days(d):
            if d:
                base = datetime.date(1900, 1, 1)
                if isinstance(d, datetime.datetime):
                    d = d.date()
                return (d - base).days if isinstance(d, datetime.date) else None
            return None
        return udf(date_to_days, IntegerType())

    def get_handler(rule, source_type, target_type):
        rule = rule.strip()

        # Left:N substring rule
        if rule.lower().startswith("left:"):
            n = int(rule.split(":")[1])
            return lambda c: substring(c, 1, n)

        # Currency cleanup
        if rule == "???":
            rule = "Currency:Dollar"

        # Date parsing/formatting
        if rule.startswith("Date:"):
            fmt = rule.split("Date:")[1]

            if source_type == "String" and target_type == "Date":
                return lambda c: to_date(c, fmt)
            elif source_type == "Date" and target_type == "String":
                return lambda c: date_format(c, fmt)
            elif source_type == "String" and target_type == "String":
                return lambda c: date_format(to_date(c, 'yyyy-MM-dd'), fmt)
            else:
                raise ValueError(f"Invalid Date conversion: {source_type} to {target_type}")

        # Static conversion rules
        handler_map = {
            "Upper": lambda c: upper(c),
            "Lower": lambda c: lower(c),
            "Decimal:Comma": lambda c: regexp_replace(c, ",", ".").cast(FloatType()),
            "Currency:Dollar": lambda c: regexp_replace(regexp_replace(c, "[$]", ""), ",", "").cast(FloatType()),
            "DateToInt": lambda c: date_to_days_udf()(c),
            "Standard": cast_to(target_type)
        }

        if rule not in handler_map:
            raise ValueError(f"Unsupported rule: {rule}")

        return handler_map[rule]

    # Apply rules
    print("Starting field conversion...")

    for dField in aAsset["Fields"]:
        field = dField["Target_Field_Name"]
        source_type = dField["Source_Data_Type_Code"]
        target_type = dField["Target_Data_Type_Code"]
        rule_string = dField.get("Conversion_Rule_String", "Standard")
        rule_list = [r.strip() for r in rule_string.split(",")]

        print(f"\n Field: {field} | Source: {source_type} to  Target: {target_type} | Rules: {rule_list}")

        try:
            expr = col(field)
            for rule in rule_list:
                handler = get_handler(rule, source_type, target_type)
                expr = handler(expr)
                print(f"Applied: {rule}")
            df = df.withColumn(field, expr)
            print(f"Transformed: {field}")
        except Exception as e:
            log(f"Error - Field: {field} | Reason: {str(e)}")

    print("Field conversion complete.")
    return df

In [43]:
from pyspark.sql.functions import col, upper, lower, to_date, date_format, regexp_replace, udf, substring
from pyspark.sql.types import FloatType, IntegerType, StringType, DateType
import datetime
import re

def applyFieldConversions(pDF, aAsset):

    def log(msg):
        print(msg)

    df = pDF

    def cast_to(target_type):
        type_map = {
            "String": StringType(),
            "Int": IntegerType(),
            "Float": FloatType(),
            "Date": DateType()
        }
        return lambda c: c.cast(type_map[target_type])

    def date_to_days_udf():
        def date_to_days(d):
            if d:
                base = datetime.date(1900, 1, 1)
                if isinstance(d, datetime.datetime):
                    d = d.date()
                return (d - base).days if isinstance(d, datetime.date) else None
            return None
        return udf(date_to_days, IntegerType())

    def get_handler(rule, source_type, target_type):
        rule = rule.strip()

        # Left:N substring rule
        if rule.lower().startswith("left:"):
            n = int(rule.split(":")[1])
            return lambda c: substring(c, 1, n)

        # Currency cleanup
        if rule == "???":
            rule = "Currency:Dollar"

        # Date parsing/formatting
        if rule.startswith("Date:"):
            fmt = rule.split("Date:")[1]

            if source_type == "String" and target_type == "Date":
                return lambda c: to_date(c, fmt)
            elif source_type == "Date" and target_type == "String":
                return lambda c: date_format(c, fmt)
            elif source_type == "String" and target_type == "String":
                return lambda c: date_format(to_date(c, 'yyyy-MM-dd'), fmt)
            else:
                raise ValueError(f"Invalid Date conversion: {source_type} to {target_type}")

        # Static conversion rules
        handler_map = {
            "Upper": lambda c: upper(c),
            "Lower": lambda c: lower(c),
            "Decimal:Comma": lambda c: regexp_replace(c, ",", ".").cast(FloatType()),
            "Currency:Dollar": lambda c: regexp_replace(regexp_replace(c, "[$]", ""), ",", "").cast(FloatType()),
            "DateToInt": lambda c: date_to_days_udf()(c),
            "Standard": cast_to(target_type)
        }

        if rule not in handler_map:
            raise ValueError(f"Unsupported rule: {rule}")

        return handler_map[rule]

    # Apply rules
    print("Starting field conversion...")

    for dField in aAsset["Fields"]:
        field = dField["Target_Field_Name"]
        source_type = dField["Source_Data_Type_Code"]
        target_type = dField["Target_Data_Type_Code"]
        rule_string = dField.get("Conversion_Rule_String", "Standard")
        rule_list = [r.strip() for r in rule_string.split(",")]

        print(f"\n Field: {field} | Source: {source_type} to  Target: {target_type} | Rules: {rule_list}")

        try:
            expr = col(field)
            for rule in rule_list:
                handler = get_handler(rule, source_type, target_type)
                expr = handler(expr)
                print(f"Applied: {rule}")
            df = df.withColumn(field, expr)
            print(f"Transformed: {field}")
        except Exception as e:
            log(f"Error - Field: {field} | Reason: {str(e)}")

    print("Field conversion complete.")
    return df

In [2]:
from pyspark.sql.functions import col, upper, lower, to_date, date_format, regexp_replace, udf, substring
from pyspark.sql.types import FloatType, IntegerType, StringType, DateType
import datetime
import re

def applyFieldConversions(pDF, aAsset, aLogger=None):
    df = pDF

    def log(msg):
        aLogger.info(msg) if aLogger else print(msg)

    def cast_to(target_type):
        type_map = {
            "String": StringType(),
            "Int": IntegerType(),
            "Float": FloatType(),
            "Date": DateType()
        }
        return lambda c: c.cast(type_map[target_type])

    def get_handler(rule, source_type, target_type):
        rule = rule.strip()

        # Handle Left:N (Substring)
        if rule.lower().startswith("left:"):
            n = int(rule.split(":")[1])
            return lambda c: substring(c, 1, n)

        # Handle Currency Cleanup (???)
        if rule == "???":
            rule = "Currency:Dollar"

        # Handle Date Parsing or Formatting
        if rule.startswith("Date:"):
            fmt = rule.split("Date:")[1]

            # String to Date
            if source_type == "String" and target_type == "Date":
                return lambda c: to_date(c, fmt)

            # Date to String - Ensure exact format as defined
            elif source_type == "Date" and target_type == "String":
                return lambda c: date_format(c, fmt)

            else:
                raise ValueError(f"Invalid Date conversion: {source_type} ➡ {target_type}")

        # Static Rule Handlers
        handler_map = {
            "Upper": lambda c: upper(c),
            "Lower": lambda c: lower(c),
            "Decimal:Comma": lambda c: regexp_replace(c, ",", ".").cast(FloatType()),
            "Currency:Dollar": lambda c: regexp_replace(regexp_replace(c, "[$]", ""), ",", "").cast(FloatType()),
            "Standard": cast_to(target_type)
        }

        if rule not in handler_map:
            raise ValueError(f"Unsupported rule: {rule}")

        return handler_map[rule]

    # Main Logic
    log("Starting field conversion...")

    for dField in aAsset["Fields"]:
        field = dField["Target_Field_Name"]
        source_type = dField["Source_Data_Type_Code"]
        target_type = dField["Target_Data_Type_Code"]
        rule_string = dField.get("Conversion_Rule_String", "Standard")
        rule_list = [r.strip() for r in rule_string.split(",")]

        log(f"Field: {field} | Source: {source_type} ➡ Target: {target_type} | Rules: {rule_list}")

        try:
            expr = col(field)
            for rule in rule_list:
                handler = get_handler(rule, source_type, target_type)
                expr = handler(expr)
                log(f"Applied: {rule}")
            df = df.withColumn(field, expr)
            log(f"Transformed: {field}")
        except Exception as e:
            log(f"Error - Field: {field} | Reason: {str(e)}")

    log("Field conversion complete.")
    return df


In [60]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
import datetime

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("DateConversionTest") \
    .getOrCreate()

# Define Schema Explicitly
schema = StructType([
    StructField("StandardStr", StringType(), True),
    StructField("UpperStr", StringType(), True),
    StructField("LowerStr", StringType(), True),
    StructField("YMDStr", StringType(), True),
    StructField("MDYStr", StringType(), True),
    StructField("IntStr", StringType(), True),
    StructField("FloatStr", StringType(), True),
    StructField("CommaStr", StringType(), True),
    StructField("DollarStr", StringType(), True),
    StructField("IntVal", IntegerType(), True),
    StructField("FloatVal", FloatType(), True),
    StructField("DateVal", DateType(), True),
    StructField("DateAsString", StringType(), True),
    StructField("DateTimeStr", StringType(), True),
    StructField("DateVal_MMDDYYYY", StringType(), True),
    StructField("DateVal_ToInt", IntegerType(), True),
    StructField("IntVal_ToFloat", FloatType(), True),
    StructField("DateTimeStr_ToDate", DateType(), True)
])

# Test Data with Correct Date Conversion
test_data = [
    ("ApPLE", "Apple", "Apple", "2025-05-25", "05/25/2025", "123", "123.45", "123,45", "$1,567.45",
     2, 123.45, datetime.date(2025, 5, 25), "2025-05-25", "2025-05-25 12:34:56", "05/25/2025", 43056, 2.0, datetime.date(2025, 5, 25))
]

# Create DataFrame
df = spark.createDataFrame(test_data, schema=schema)
df.show(truncate=False)

# Asset Configuration
aAsset = {
    "Fields": [
        # String ➝ String
        {"Target_Field_Name": "StandardStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "UpperStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Upper"},
        {"Target_Field_Name": "LowerStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Lower"},

        # String ➝ Date
        {"Target_Field_Name": "YMDStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Date:yyyy-MM-dd"},
        {"Target_Field_Name": "MDYStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Date:MM/dd/yyyy"},

        # Date ➝ String / Int
        {"Target_Field_Name": "DateAsString", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Date:yyyy-MM-dd"},
        {"Target_Field_Name": "DateVal_MMDDYYYY", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Date:MM/dd/yyyy"},
        {"Target_Field_Name": "DateVal_ToInt", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "Int", "Conversion_Rule_String": "Standard"},

        # String ➝ Int / Float
        {"Target_Field_Name": "IntStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Int", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "FloatStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "CommaStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Decimal:Comma"},
        {"Target_Field_Name": "DollarStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "???"},

        # Int ➝ Int / Float
        {"Target_Field_Name": "IntVal", "Source_Data_Type_Code": "Int", "Target_Data_Type_Code": "Int", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "IntVal_ToFloat", "Source_Data_Type_Code": "Int", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},

        # Float ➝ Float
        {"Target_Field_Name": "FloatVal", "Source_Data_Type_Code": "Float", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},

        # String ➝ Date using Left + Format
        {"Target_Field_Name": "DateTimeStr_ToDate", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Left:10,Date:yyyy-MM-dd"}
    ]
}

print("✅ Data and Asset Configuration Ready for Testing!")

+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|StandardStr|UpperStr|LowerStr|YMDStr    |MDYStr    |IntStr|FloatStr|CommaStr|DollarStr|IntVal|FloatVal|DateVal   |DateAsString|DateTimeStr        |DateVal_MMDDYYYY|DateVal_ToInt|IntVal_ToFloat|DateTimeStr_ToDate|
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|ApPLE      |Apple   |Apple   |2025-05-25|05/25/2025|123   |123.45  |123,45  |$1,567.45|2     |123.45  |2025-05-25|2025-05-25  |2025-05-25 12:34:56|05/25/2025      |43056        |2.0           |2025-05-25        |
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+----------------

In [50]:
df = spark.createDataFrame(test_data, schema=schema)
df.show(truncate=False)

+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|StandardStr|UpperStr|LowerStr|YMDStr    |MDYStr    |IntStr|FloatStr|CommaStr|DollarStr|IntVal|FloatVal|DateVal   |DateAsString|DateTimeStr        |DateVal_MMDDYYYY|DateVal_ToInt|IntVal_ToFloat|DateTimeStr_ToDate|
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|ApPLE      |Apple   |Apple   |2025-05-25|05/25/2025|123   |123.45  |123,45  |$1,567.45|2     |123.45  |2025-05-25|2025-05-25  |2025-05-25 12:34:56|05/25/2025      |43056        |2.0           |2025-05-25        |
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+----------------

In [51]:
# Apply Conversion
converted_df = applyFieldConversions(df, aAsset)

Starting field conversion...

 Field: StandardStr | Source: String to  Target: String | Rules: ['Standard']
Applied: Standard
Transformed: StandardStr

 Field: UpperStr | Source: String to  Target: String | Rules: ['Upper']
Applied: Upper
Transformed: UpperStr

 Field: LowerStr | Source: String to  Target: String | Rules: ['Lower']
Applied: Lower
Transformed: LowerStr

 Field: YMDStr | Source: String to  Target: Date | Rules: ['Date:yyyy-MM-dd']
Applied: Date:yyyy-MM-dd
Transformed: YMDStr

 Field: MDYStr | Source: String to  Target: Date | Rules: ['Date:MM/dd/yyyy']
Applied: Date:MM/dd/yyyy
Transformed: MDYStr

 Field: DateAsString | Source: Date to  Target: String | Rules: ['Date:yyyy-MM-dd']
Applied: Date:yyyy-MM-dd
Transformed: DateAsString

 Field: DateVal_MMDDYYYY | Source: Date to  Target: String | Rules: ['Date:MM/dd/yyyy']
Applied: Date:MM/dd/yyyy
Transformed: DateVal_MMDDYYYY

 Field: DateVal_ToInt | Source: Date to  Target: Int | Rules: ['Standard']
Applied: Standard
Transfo

In [52]:
# Show Result
converted_df.show(truncate=False)

+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|StandardStr|UpperStr|LowerStr|YMDStr    |MDYStr    |IntStr|FloatStr|CommaStr|DollarStr|IntVal|FloatVal|DateVal   |DateAsString|DateTimeStr        |DateVal_MMDDYYYY|DateVal_ToInt|IntVal_ToFloat|DateTimeStr_ToDate|
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+----------------+-------------+--------------+------------------+
|ApPLE      |APPLE   |apple   |2025-05-25|2025-05-25|123   |123.45  |123.45  |1567.45  |2     |123.45  |2025-05-25|2025-05-25  |2025-05-25 12:34:56|NULL            |43056        |2.0           |2025-05-25        |
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+----------------

In [55]:
converted_df.printSchema()

root
 |-- StandardStr: string (nullable = true)
 |-- UpperStr: string (nullable = true)
 |-- LowerStr: string (nullable = true)
 |-- YMDStr: date (nullable = true)
 |-- MDYStr: date (nullable = true)
 |-- IntStr: integer (nullable = true)
 |-- FloatStr: float (nullable = true)
 |-- CommaStr: float (nullable = true)
 |-- DollarStr: float (nullable = true)
 |-- IntVal: integer (nullable = true)
 |-- FloatVal: float (nullable = true)
 |-- DateVal: date (nullable = true)
 |-- DateAsString: string (nullable = true)
 |-- DateTimeStr: string (nullable = true)
 |-- DateVal_MMDDYYYY: string (nullable = true)
 |-- DateVal_ToInt: integer (nullable = true)
 |-- IntVal_ToFloat: float (nullable = true)
 |-- DateTimeStr_ToDate: date (nullable = true)



In [74]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, DateType
import datetime

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("DateConversionTest") \
    .getOrCreate()

# Define Schema Explicitly
schema = StructType([
    StructField("DateVal", DateType(), True),
    StructField("DateAsString", DateType(), True),
    StructField("DateVal_MMDDYYYY", DateType(), True)
])

# Test Data with Correct Date Conversion
test_data = [
    (datetime.date(2025, 5, 25), datetime.date(2025, 5, 25), datetime.date(2025, 5, 25))
]

# schema = StructType([
#     StructField("Int_To_Float", IntegerType(), True),
#     StructField("String_To_Float", StringType(), True),
#     StructField("String_To_Float_Decimal", StringType(), True),
#     StructField("Float_To_Float", FloatType(), True)
# ])

# Test Data with Correct Date Conversion
# test_data = [
#     (1, "12.5", "1,345", 12.5)
# ]

# Create DataFrame
df = spark.createDataFrame(test_data, schema=schema)
df.show(truncate=False)

# Asset Configuration
aAsset = {
    "Fields": [
        # Date ➝ String
        {"Target_Field_Name": "DateVal", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "DateAsString", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Date:yyyy-MM-dd"},
        {"Target_Field_Name": "DateVal_MMDDYYYY", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Date:MM/dd/yyyy"}

        # Float Check
        # {"Target_Field_Name": "Int_To_Float", "Source_Data_Type_Code": "Int", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},
        # {"Target_Field_Name": "String_To_Float", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},
        # {"Target_Field_Name": "String_To_Float_Decimal", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Decimal:Comma"},
        # {"Target_Field_Name": "Float_To_Float", "Source_Data_Type_Code": "Float", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"}

    ]
}

print("✅ Data and Asset Configuration Ready for Testing!")


+----------+------------+----------------+
|DateVal   |DateAsString|DateVal_MMDDYYYY|
+----------+------------+----------------+
|2025-05-25|2025-05-25  |2025-05-25      |
+----------+------------+----------------+

✅ Data and Asset Configuration Ready for Testing!


In [75]:
converted_df = applyFieldConversions(df, aAsset)
converted_df.show(truncate=False)

Starting field conversion...

 Field: DateVal | Source: Date to  Target: Date | Rules: ['Standard']
Applied: Standard
Transformed: DateVal

 Field: DateAsString | Source: Date to  Target: String | Rules: ['Date:yyyy-MM-dd']
Applied: Date:yyyy-MM-dd
Transformed: DateAsString

 Field: DateVal_MMDDYYYY | Source: Date to  Target: String | Rules: ['Date:MM/dd/yyyy']
Applied: Date:MM/dd/yyyy
Transformed: DateVal_MMDDYYYY
Field conversion complete.
+----------+------------+----------------+
|DateVal   |DateAsString|DateVal_MMDDYYYY|
+----------+------------+----------------+
|2025-05-25|2025-05-25  |05/25/2025      |
+----------+------------+----------------+



In [76]:
converted_df.printSchema()

root
 |-- DateVal: date (nullable = true)
 |-- DateAsString: string (nullable = true)
 |-- DateVal_MMDDYYYY: string (nullable = true)



In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, to_date, date_format, upper, lower, regexp_replace, substring, udf
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType, IntegerType
import datetime

class DIFFieldConversion:
    def __init__(self, df, asset_config):
        self.df = df
        self.asset_config = asset_config

    def convertStringToDate(self, df, field, fmt):
        return df.withColumn(field, to_date(col(field), fmt))

    def convertDateToString(self, df, field, fmt):
        return df.withColumn(field, date_format(col(field), fmt))

    def convertStringToString(self, df, field, fmt):
        return df.withColumn(field, date_format(to_date(col(field), 'yyyy-MM-dd'), fmt))

    def convertToUpper(self, df, field):
        return df.withColumn(field, upper(col(field)))

    def convertToLower(self, df, field):
        return df.withColumn(field, lower(col(field)))

    def convertDecimalCommaToFloat(self, df, field):
        # Handles both 1.234,56 and 1,234.56 or 1,100,345.56 formats
        clean_col = regexp_replace(regexp_replace(col(field), "[.]", ""), ",", ".")
        return df.withColumn(field, clean_col.cast(FloatType()))

    def convertCurrencyDollarToFloat(self, df, field):
        return df.withColumn(field, regexp_replace(regexp_replace(col(field), "[$]", ""), ",", "").cast(FloatType()))

    def convertDateToInt(self, df, field):
        def date_to_days(d):
            if d:
                base = datetime.date(1900, 1, 1)
                if isinstance(d, datetime.datetime):
                    d = d.date()
                return (d - base).days if isinstance(d, datetime.date) else None
            return None
        return df.withColumn(field, udf(date_to_days, IntegerType())(col(field)))

    def convertStringToInt(self, df, field):
        return df.withColumn(field, col(field).cast(IntegerType()))

    def convertStringToFloat(self, df, field):
        return df.withColumn(field, col(field).cast(FloatType()))

    def convertIntToFloat(self, df, field):
        return df.withColumn(field, col(field).cast(FloatType()))

    def applyConversions(self):
        df = self.df
        print("Starting field conversion...")

        for field_config in self.asset_config["Fields"]:
            field = field_config["Target_Field_Name"]
            sourceType = field_config["Source_Data_Type_Code"]
            targetType = field_config["Target_Data_Type_Code"]
            rule_string = field_config.get("Conversion_Rule_String", "Standard")
            rule_list = [r.strip() for r in rule_string.split(",")]

            print(f"\n Field: {field} | Source: {sourceType} to  Target: {targetType} | Rules: {rule_list}")

            try:
                for rule in rule_list:
                    if rule.startswith("Date:"):
                        fmt = rule.split("Date:")[1]
                        if sourceType == "String" and targetType == "Date":
                            df = self.convertStringToDate(df, field, fmt)
                        elif sourceType == "Date" and targetType == "String":
                            df = self.convertDateToString(df, field, fmt)
                        elif sourceType == "String" and targetType == "String":
                            df = self.convertStringToString(df, field, fmt)
                        else:
                            raise ValueError(f"Invalid Date conversion: {sourceType} to {targetType}")
                        print(f"Applied: {rule}")

                    elif rule == "Upper":
                        df = self.convertToUpper(df, field)
                        print(f"Applied: {rule}")

                    elif rule == "Lower":
                        df = self.convertToLower(df, field)
                        print(f"Applied: {rule}")

                    elif rule == "Decimal:Comma":
                        df = self.convertDecimalCommaToFloat(df, field)
                        print(f"Applied: {rule}")

                    elif rule == "???":
                        df = self.convertCurrencyDollarToFloat(df, field)
                        print(f"Applied: {rule} (Dollar Currency)")

                    elif rule == "Standard":
                        if sourceType == "String" and targetType == "Int":
                            df = self.convertStringToInt(df, field)
                        elif sourceType == "String" and targetType == "Float":
                            df = self.convertStringToFloat(df, field)
                        elif sourceType == "Int" and targetType == "Float":
                            df = self.convertIntToFloat(df, field)
                        elif sourceType == "Date" and targetType == "Int":
                            df = self.convertDateToInt(df, field)
                        print(f"Applied: {rule}")

                print(f"Transformed: {field}")
            except Exception as e:
                print(f"Error - Field: {field} | Reason: {str(e)}")

        print("Field conversion complete.")
        return df


# Initialize Spark Session
spark = SparkSession.builder \
    .appName("DateConversionTest") \
    .getOrCreate()

# Define Schema Explicitly
schema = StructType([
    StructField("StandardStr", StringType(), True),
    StructField("UpperStr", StringType(), True),
    StructField("LowerStr", StringType(), True),
    StructField("YMDStr", StringType(), True),
    StructField("MDYStr", StringType(), True),
    StructField("IntStr", StringType(), True),
    StructField("FloatStr", StringType(), True),
    StructField("CommaStr", StringType(), True),
    StructField("DollarStr", StringType(), True),
    StructField("IntVal", IntegerType(), True),
    StructField("FloatVal", FloatType(), True),
    StructField("DateVal", DateType(), True),
    StructField("DateAsString", StringType(), True),
    StructField("DateTimeStr", StringType(), True)
])

# Sample Data Row
test_data = [
    ("ApPLE", "Apple", "Apple", "2025-05-25", "05/25/2025", "123", "123.45", "123,45", "$123.45", 2, 123.45, datetime.date(2025, 5, 25), "2025-05-25", "2025-05-25 12:34:56")
]

# Create DataFrame
df = spark.createDataFrame(test_data, schema=schema)
df.show(truncate=False)

# Asset Configuration
aAsset = {
    "Fields": [
        {"Target_Field_Name": "StandardStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "UpperStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Upper"},
        {"Target_Field_Name": "LowerStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Lower"},
        {"Target_Field_Name": "YMDStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Date:yyyy-MM-dd"},
        {"Target_Field_Name": "MDYStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Date", "Conversion_Rule_String": "Date:MM/dd/yyyy"},
        {"Target_Field_Name": "DateAsString", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "String", "Conversion_Rule_String": "Date:yyyy-MM-dd"},
        {"Target_Field_Name": "IntStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Int", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "FloatStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "CommaStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Decimal:Comma"},
        {"Target_Field_Name": "DollarStr", "Source_Data_Type_Code": "String", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "???"},
        {"Target_Field_Name": "IntVal", "Source_Data_Type_Code": "Int", "Target_Data_Type_Code": "Float", "Conversion_Rule_String": "Standard"},
        {"Target_Field_Name": "DateVal", "Source_Data_Type_Code": "Date", "Target_Data_Type_Code": "Int", "Conversion_Rule_String": "Standard"}
    ]
}

# Apply conversion
converter = DIFFieldConversion(df, aAsset)
converted_df = converter.applyConversions()
converted_df.show(truncate=False)

print("✅ Field conversion completed!")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/02 13:00:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+
|StandardStr|UpperStr|LowerStr|YMDStr    |MDYStr    |IntStr|FloatStr|CommaStr|DollarStr|IntVal|FloatVal|DateVal   |DateAsString|DateTimeStr        |
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+
|ApPLE      |Apple   |Apple   |2025-05-25|05/25/2025|123   |123.45  |123,45  |$123.45  |2     |123.45  |2025-05-25|2025-05-25  |2025-05-25 12:34:56|
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+----------+------------+-------------------+

Starting field conversion...

 Field: StandardStr | Source: String to  Target: String | Rules: ['Standard']
Applied: Standard
Transformed: StandardStr

 Field: UpperStr | Source: String to  Target: String | Rules: ['Upper']
Applied: Upper
Transformed: Up

In [2]:
converted_df.show(truncate=False)

+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+-------+------------+-------------------+
|StandardStr|UpperStr|LowerStr|YMDStr    |MDYStr    |IntStr|FloatStr|CommaStr|DollarStr|IntVal|FloatVal|DateVal|DateAsString|DateTimeStr        |
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+-------+------------+-------------------+
|ApPLE      |APPLE   |apple   |2025-05-25|2025-05-25|123   |123.45  |123.45  |123.45   |2.0   |123.45  |45800  |2025-05-25  |2025-05-25 12:34:56|
+-----------+--------+--------+----------+----------+------+--------+--------+---------+------+--------+-------+------------+-------------------+



In [26]:
def compress(chars):

        s = ""
        prev_char = None
        for char in chars:
            if prev_char is None:
                prev_char = char
                cnt = 1
                continue

            if char == prev_char:
                cnt += 1
            else:
                if cnt == 1:
                    s += prev_char
                else:
                    s += prev_char + str(cnt)

                prev_char = char
                cnt = 1

        if cnt == 1:
            s += prev_char
        else:
            s += prev_char + str(cnt)

        for i in range(len(s)):
            chars[i] = s[i]


        print(s, chars)
        return len(s)

chars = ["a","a","b","c","c","c"]
compress(chars)

a2bc3 ['a', '2', 'b', 'c', '3', 'c']


5