# Silver transformation

## Imports

In [0]:
from src.modules.FunctionsDataCleaning import clean_dataframe
from pyspark.sql.functions import col, sum as _sum, when

## Standardize columns

### aircraft 

In [0]:
# Load the bronze-layer aircraft table as a Spark DataFrame.
df_aircraft = spark.read.table(
    "anac_aeronautical_occurrences_in_brazilian_civil_aviation.bronze_layer.gov_aircraft"
)

In [0]:
# Discard the "aeronave_operador_categoria" column before the cleaning step;
# df_aircraft is rebound to the narrower DataFrame.
df_aircraft = df_aircraft.drop("aeronave_operador_categoria")

In [0]:
# Run the aircraft DataFrame through clean_dataframe (imported from
# src.modules.FunctionsDataCleaning). The cleaning rules live in that module;
# the result is what gets profiled and written to the silver layer below.
df_aircraft_clean = clean_dataframe(df_aircraft)

In [0]:
# Data-quality profile of the cleaned aircraft table. Each check selects one
# aggregated counter per source column, aliased with that column's name.

# Matches non-empty values built solely from characters other than ASCII
# letters and digits.
# NOTE(review): accented letters fall outside a-z/A-Z, so a value made only of
# them would also be counted as "special" - confirm that is intended.
special_chars_regex = r'^[^a-zA-Z0-9]+$'

profiled_columns = df_aircraft_clean.columns

# Per-column tally of NULL entries.
null_exprs = [
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
null_counts = df_aircraft_clean.select(null_exprs)

# Per-column tally of values matching the special-characters pattern.
special_exprs = [
    _sum(when(col(name).rlike(special_chars_regex), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
special_counts = df_aircraft_clean.select(special_exprs)

print("---- NULLs by columns ----")
null_counts.display()

print("---- Values with only special characters per column ----")
special_counts.display()

### occurrence

In [0]:
# Load the bronze-layer occurrence table as a Spark DataFrame.
df_occurrence = spark.read.table(
    "anac_aeronautical_occurrences_in_brazilian_civil_aviation.bronze_layer.gov_occurrence"
)

In [0]:
# Run the occurrence DataFrame through clean_dataframe (imported from
# src.modules.FunctionsDataCleaning). The cleaning rules live in that module;
# the result is what gets profiled and written to the silver layer below.
df_occurrence_clean = clean_dataframe(df_occurrence)

In [0]:
# Data-quality profile of the cleaned occurrence table. Each check selects one
# aggregated counter per source column, aliased with that column's name.

# Matches non-empty values built solely from characters other than ASCII
# letters and digits.
# NOTE(review): accented letters fall outside a-z/A-Z, so a value made only of
# them would also be counted as "special" - confirm that is intended.
special_chars_regex = r'^[^a-zA-Z0-9]+$'

profiled_columns = df_occurrence_clean.columns

# Per-column tally of NULL entries.
null_exprs = [
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
null_counts = df_occurrence_clean.select(null_exprs)

# Per-column tally of values matching the special-characters pattern.
special_exprs = [
    _sum(when(col(name).rlike(special_chars_regex), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
special_counts = df_occurrence_clean.select(special_exprs)

print("---- NULLs by columns ----")
null_counts.display()

print("---- Values with only special characters per column ----")
special_counts.display()

### recommendation

In [0]:
# Load the bronze-layer recommendation table as a Spark DataFrame.
df_recommendation = spark.read.table(
    "anac_aeronautical_occurrences_in_brazilian_civil_aviation.bronze_layer.gov_recommendation"
)

In [0]:
# Run the recommendation DataFrame through clean_dataframe (imported from
# src.modules.FunctionsDataCleaning). The cleaning rules live in that module;
# the result is what gets profiled and written to the silver layer below.
df_recommendation_clean = clean_dataframe(df_recommendation)

In [0]:
# Data-quality profile of the cleaned recommendation table. Each check selects
# one aggregated counter per source column, aliased with that column's name.

# Matches non-empty values built solely from characters other than ASCII
# letters and digits.
# NOTE(review): accented letters fall outside a-z/A-Z, so a value made only of
# them would also be counted as "special" - confirm that is intended.
special_chars_regex = r'^[^a-zA-Z0-9]+$'

profiled_columns = df_recommendation_clean.columns

# Per-column tally of NULL entries.
null_exprs = [
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
null_counts = df_recommendation_clean.select(null_exprs)

# Per-column tally of values matching the special-characters pattern.
special_exprs = [
    _sum(when(col(name).rlike(special_chars_regex), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
special_counts = df_recommendation_clean.select(special_exprs)

print("---- NULLs by columns ----")
null_counts.display()

print("---- Values with only special characters per column ----")
special_counts.display()

### factor

In [0]:
# Load the bronze-layer significant-factor table as a Spark DataFrame.
df_factor = spark.read.table(
    "anac_aeronautical_occurrences_in_brazilian_civil_aviation.bronze_layer.gov_significant_factor"
)

In [0]:
# Run the significant-factor DataFrame through clean_dataframe (imported from
# src.modules.FunctionsDataCleaning). The cleaning rules live in that module;
# the result is what gets profiled and written to the silver layer below.
df_factor_clean = clean_dataframe(df_factor)

In [0]:
# Data-quality profile of the cleaned significant-factor table. Each check
# selects one aggregated counter per source column, aliased with that
# column's name.

# Matches non-empty values built solely from characters other than ASCII
# letters and digits.
# NOTE(review): accented letters fall outside a-z/A-Z, so a value made only of
# them would also be counted as "special" - confirm that is intended.
special_chars_regex = r'^[^a-zA-Z0-9]+$'

profiled_columns = df_factor_clean.columns

# Per-column tally of NULL entries.
null_exprs = [
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
null_counts = df_factor_clean.select(null_exprs)

# Per-column tally of values matching the special-characters pattern.
special_exprs = [
    _sum(when(col(name).rlike(special_chars_regex), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
special_counts = df_factor_clean.select(special_exprs)

print("---- NULLs by columns ----")
null_counts.display()

print("---- Values with only special characters per column ----")
special_counts.display()

### type occurrence

In [0]:
# Load the bronze-layer occurrence-type table as a Spark DataFrame.
df_type = spark.read.table(
    "anac_aeronautical_occurrences_in_brazilian_civil_aviation.bronze_layer.gov_type_occurrence"
)

In [0]:
# Run the occurrence-type DataFrame through clean_dataframe (imported from
# src.modules.FunctionsDataCleaning). The cleaning rules live in that module;
# the result is what gets profiled and written to the silver layer below.
df_type_clean = clean_dataframe(df_type)

In [0]:
# Data-quality profile of the cleaned occurrence-type table. Each check
# selects one aggregated counter per source column, aliased with that
# column's name.

# Matches non-empty values built solely from characters other than ASCII
# letters and digits.
# NOTE(review): accented letters fall outside a-z/A-Z, so a value made only of
# them would also be counted as "special" - confirm that is intended.
special_chars_regex = r'^[^a-zA-Z0-9]+$'

profiled_columns = df_type_clean.columns

# Per-column tally of NULL entries.
null_exprs = [
    _sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
null_counts = df_type_clean.select(null_exprs)

# Per-column tally of values matching the special-characters pattern.
special_exprs = [
    _sum(when(col(name).rlike(special_chars_regex), 1).otherwise(0)).alias(name)
    for name in profiled_columns
]
special_counts = df_type_clean.select(special_exprs)

print("---- NULLs by columns ----")
null_counts.display()

print("---- Values with only special characters per column ----")
special_counts.display()

## Saving tables

In [0]:
# Silver-layer table name -> cleaned DataFrame that populates it.
# The tables are written in the insertion order of this dict.
dataframes = {
    "gov_recommendation_clean": df_recommendation_clean,
    "gov_significant_factor_clean": df_factor_clean,
    "gov_type_occurrence_clean": df_type_clean,
    "gov_occurrence_clean": df_occurrence_clean,
    "gov_aircraft_clean": df_aircraft_clean
}

# Persist every cleaned DataFrame as a Delta table, using "overwrite" mode so
# a re-run replaces the table's previous contents.
for table_name, dataframe in dataframes.items():
    delta_writer = dataframe.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(
        f"anac_aeronautical_occurrences_in_brazilian_civil_aviation.silver_layer.{table_name}"
    )

print("All tables saved in silver layer.")
