# Include the common_functions

In [2]:
%run common_functions

# Test deduplicate_dataframe_by_key_columns()

## Test usage including order_by_desc_column

In [7]:
# Usage with an ordering column:
#   df_deduped = deduplicate_dataframe_by_key_columns(
#       df, key_columns=["id", "date"], order_by_desc_column="updated_at")

from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from datetime import datetime

# Define schema
schema = StructType([
    StructField("CompanyName", StringType(), True),
    StructField("City", StringType(), True),
    StructField("LastUpdateTimeStamp", TimestampType(), True)
])

# Sample data: 10 distinct companies; "Acme Corp" and "Globex Inc" each appear
# a second time with a later LastUpdateTimeStamp.
data = [
    ("Acme Corp", "New York", datetime(2023, 7, 1, 12, 30)),
    ("Globex Inc", "Chicago", datetime(2023, 7, 2, 9, 15)),
    ("Initech", "San Francisco", datetime(2023, 7, 3, 14, 45)),
    ("Umbrella Corp", "Los Angeles", datetime(2023, 7, 4, 16, 0)),
    ("Soylent Corp", "Boston", datetime(2023, 7, 5, 8, 30)),
    ("Stark Industries", "Dallas", datetime(2023, 7, 6, 10, 20)),
    ("Wayne Enterprises", "Gotham", datetime(2023, 7, 7, 11, 10)),
    ("Wonka Industries", "Seattle", datetime(2023, 7, 8, 13, 40)),
    ("Cyberdyne Systems", "Austin", datetime(2023, 7, 9, 15, 25)),
    ("Tyrell Corporation", "Detroit", datetime(2023, 7, 10, 17, 55)),
    # Duplicate entries
    ("Acme Corp", "New York", datetime(2023, 7, 11, 12, 0)),
    ("Globex Inc", "Chicago", datetime(2023, 7, 12, 9, 0)),
]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)

# Show the data
df.show(truncate=False)

# Now deduplicate the data
df_deduped = deduplicate_dataframe_by_key_columns(df, key_columns=["CompanyName"], order_by_desc_column="LastUpdateTimeStamp")
df_deduped.show(truncate=False)

# Verify the result instead of relying on reading the printed table.
deduped_rows = df_deduped.collect()
deduped_names = [row["CompanyName"] for row in deduped_rows]
assert len(deduped_names) == len(set(deduped_names)), "CompanyName is still duplicated after deduplication"
assert set(deduped_names) == {company for company, _, _ in data}, "deduplication dropped or invented a CompanyName"

# Expected contract of order_by_desc_column (as its name states): for each key,
# the row with the greatest LastUpdateTimeStamp is the one kept. The expected
# values are read back through Spark so both sides of the comparison have gone
# through the same timestamp conversion.
latest_by_company = {}
for row in df.collect():
    company, updated_at = row["CompanyName"], row["LastUpdateTimeStamp"]
    if company not in latest_by_company or updated_at > latest_by_company[company]:
        latest_by_company[company] = updated_at
for row in deduped_rows:
    assert row["LastUpdateTimeStamp"] == latest_by_company[row["CompanyName"]], (
        f"{row['CompanyName']}: expected the most recent row to be kept"
    )


## Test it without an order_by_desc_column

In [17]:
# Reuse the df built in the previous test cell; no ordering column is passed this time.
df_deduped2 = deduplicate_dataframe_by_key_columns(df, key_columns=["CompanyName"])
df_deduped2.show()  # Picks the first row available for each key.

# Test sanitize_column_names(df, replacement_dict=None)

In [20]:
# Create a DataFrame whose column names contain spaces, punctuation and a leading digit.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Schema with problematic column names, built from (name, Spark type) pairs;
# every field is nullable.
schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ("Company Name", StringType),
        ("Revenue%", IntegerType),
        ("Profit&Loss", IntegerType),
        ("Address-Line#1", StringType),
        ("@Region!", StringType),
        ("1stColumn", StringType),
        ("NormalColumn", StringType),
    ]
])

# Two sample rows, one value per schema field (same order as the schema).
data = [
    (
        "Acme Corp", 1000000, 100000,
        "123 Elm St", "North", "Value1", "Valid",
    ),
    (
        "Globex Inc", 2000000, 250000,
        "456 Oak Ave", "South", "Value2", "StillValid",
    ),
]

# Materialise the test DataFrame.
df_test_column_name_fix = spark.createDataFrame(data, schema=schema)

# Print the schema before any renaming, as a baseline for the tests below.
print("Original Schema:")
df_test_column_name_fix.printSchema()


## Test without dict of character sequence replacements

In [24]:

# Sanitize with the default behaviour: no replacement_dict is supplied.
df_names_fixed_no_replacement = sanitize_column_names(df=df_test_column_name_fix)
df_names_fixed_no_replacement.printSchema()

## Test it with a dict of character sequence replacements

In [26]:
# Character sequence -> replacement text passed to sanitize_column_names.
replacement_dict = dict([
    ("%", "Pct"),
    ("&", "And"),
    ("#", "Number"),
])
df_names_fixed = sanitize_column_names(
    df=df_test_column_name_fix,
    replacement_dict=replacement_dict,
)
df_names_fixed.printSchema()

# Test the KeyValuePair getter and setter

In [3]:
# Call the setter with key 'TestKey' and value 'TestKeyValue3'.
# Signature reminder: def set_key_value(key_name: str, key_value: str)
# NOTE(review): presumably this persists the pair so the getter cell can read it back;
# set_key_value is not defined in this notebook, so confirm where it stores the value.
set_key_value(key_name='TestKey', key_value='TestKeyValue3')

In [4]:
# Look up 'TestKey' through the getter and print whatever it returns.
# Signature reminder: def get_key_value(key_name)
key_value = get_key_value('TestKey')
print(f"key_value: {key_value}")

In [6]:
# Print the source of the setter and then the getter, separated by a row of 100 asterisks.
import inspect

for position, func in enumerate((set_key_value, get_key_value)):
    if position:
        # Separator goes between the two listings, not before the first one.
        print("*" * 100)
    print(inspect.getsource(func))