In [20]:
# Welcome to your new notebook
# Type here in the cell editor to add code!
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.functions import col, when, date_format, year, month, dayofmonth, lit

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Myanmar Air Quality Data Processing") \
    .getOrCreate()

StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 22, Finished, Available, Finished)

In [21]:
# Load CSV data

input_path = "Files/myanmar_air_quality/FinalData.csv"
df = spark.read.csv(input_path, header=True, inferSchema=True)

# Show the loaded data
df.show(5)


StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 23, Finished, Available, Finished)

+------+-----------------+---------------+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+
|  City|           Center|           Date|Year|  Month|      Season|PM1_0|PM2_5| PM10|Temperature_F|Humidity_%|   AQI|New_cases|Cumulative_cases|New_deaths|Cumulative_deaths|
+------+-----------------+---------------+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+
|Yangon|7 Miles Mayangone|10/20/2019 0:00|2019|October|Rainy Season| 29.6|44.27|52.87|        95.67|     49.64|122.59|        0|               0|         0|                0|
|Yangon|7 Miles Mayangone|10/21/2019 0:00|2019|October|Rainy Season|25.22|37.49|45.21|        94.42|     51.67| 105.9|        0|               0|         0|                0|
|Yangon|7 Miles Mayangone|10/22/2019 0:00|2019|October|Rainy Season|24.46|35.84|42.32|         95.4|     49.26|101.84|       

In [22]:
# Get a list of column names
columns = df.columns

# Create a new list to store unique column names
unique_columns = []

# Track duplicates
duplicates = {}

# Process each column name
for col_name in columns:
    if col_name in unique_columns:
        # If duplicate, append a suffix to make it unique
        if col_name in duplicates:
            duplicates[col_name] += 1
        else:
            duplicates[col_name] = 1
        col_name = f"{col_name}_{duplicates[col_name]}"
    
    unique_columns.append(col_name)

# Rename the columns
df = df.toDF(*unique_columns)

# Show the DataFrame with unique column names
df.show(5)


StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 24, Finished, Available, Finished)

+------+-----------------+---------------+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+
|  City|           Center|           Date|Year|  Month|      Season|PM1_0|PM2_5| PM10|Temperature_F|Humidity_%|   AQI|New_cases|Cumulative_cases|New_deaths|Cumulative_deaths|
+------+-----------------+---------------+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+
|Yangon|7 Miles Mayangone|10/20/2019 0:00|2019|October|Rainy Season| 29.6|44.27|52.87|        95.67|     49.64|122.59|        0|               0|         0|                0|
|Yangon|7 Miles Mayangone|10/21/2019 0:00|2019|October|Rainy Season|25.22|37.49|45.21|        94.42|     51.67| 105.9|        0|               0|         0|                0|
|Yangon|7 Miles Mayangone|10/22/2019 0:00|2019|October|Rainy Season|24.46|35.84|42.32|         95.4|     49.26|101.84|       

In [23]:
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType

# Example: casting specific columns to the appropriate data types
df = df.withColumn("PM2_5", col("PM2_5").cast(FloatType())) \
       .withColumn("Year", col("Year").cast(IntegerType())) \
       .withColumn("Date", col("Date").cast(DateType()))

df.printSchema()  # Verify the schema

StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 25, Finished, Available, Finished)

root
 |-- City: string (nullable = true)
 |-- Center: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- PM1_0: double (nullable = true)
 |-- PM2_5: float (nullable = true)
 |-- PM10: double (nullable = true)
 |-- Temperature_F: double (nullable = true)
 |-- Humidity_%: double (nullable = true)
 |-- AQI: double (nullable = true)
 |-- New_cases: integer (nullable = true)
 |-- Cumulative_cases: integer (nullable = true)
 |-- New_deaths: integer (nullable = true)
 |-- Cumulative_deaths: integer (nullable = true)



In [24]:
# Convert Date to ISO format and extract Year_Key, Month_Key, Day_Key
df = df.withColumn("Date", date_format(col("Date"), "yyyy-MM-dd HH:mm")) \
       .withColumn("Year_Key", year(col("Date")).cast(StringType())) \
       .withColumn("Month_Key", date_format(col("Date"), "yyyyMM")) \
       .withColumn("Day_Key", date_format(col("Date"), "yyyyMMdd"))

# Calculate Health_Impact based on PM2_5 value
df = df.withColumn("Health_Impact", 
                   when(col("PM2_5").between(0, 50), "Good")
                   .when(col("PM2_5").between(51, 100), "Satisfactory")
                   .when(col("PM2_5").between(101, 200), "Moderate")
                   .when(col("PM2_5").between(201, 300), "Poor")
                   .when(col("PM2_5").between(301, 400), "Very poor")
                   .when(col("PM2_5").between(401, 500), "Severe")
                   .otherwise("Unknown"))


StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 26, Finished, Available, Finished)

In [25]:
# Select and reorder columns according to the assignment output fields
output_df = df.select(
    col("City"),
    col("Center"),
    col("Date"),
    col("Year_Key"),
    col("Month_Key"),
    col("Day_Key"),
    col("Season"),
    col("PM1_0"),
    col("PM2_5"),
    col("PM10"),
    col("Temperature_F"),
    col("Humidity_%"),
    col("AQI"),
    col("Health_Impact")
)

# Show the DataFrame with the correct output schema
output_df.show(5)

StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 27, Finished, Available, Finished)

+------+-----------------+----+--------+---------+-------+------------+-----+-----+-----+-------------+----------+------+-------------+
|  City|           Center|Date|Year_Key|Month_Key|Day_Key|      Season|PM1_0|PM2_5| PM10|Temperature_F|Humidity_%|   AQI|Health_Impact|
+------+-----------------+----+--------+---------+-------+------------+-----+-----+-----+-------------+----------+------+-------------+
|Yangon|7 Miles Mayangone|null|    null|     null|   null|Rainy Season| 29.6|44.27|52.87|        95.67|     49.64|122.59|         Good|
|Yangon|7 Miles Mayangone|null|    null|     null|   null|Rainy Season|25.22|37.49|45.21|        94.42|     51.67| 105.9|         Good|
|Yangon|7 Miles Mayangone|null|    null|     null|   null|Rainy Season|24.46|35.84|42.32|         95.4|     49.26|101.84|         Good|
|Yangon|7 Miles Mayangone|null|    null|     null|   null|Rainy Season|23.42|34.53|41.37|        96.39|     48.06| 98.17|         Good|
|Yangon|7 Miles Mayangone|null|    null|     nul

In [26]:
# Output path for the parquet files
output_path = "Files/output_files/myanmar_air_quality_parquet"

# Write the DataFrame as parquet files
df.write.mode("overwrite").parquet(output_path)

print(f"Data successfully written to {output_path}")


StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 28, Finished, Available, Finished)

Data successfully written to Files/output_files/myanmar_air_quality_parquet


In [27]:
# Drop duplicate rows based on specific columns (e.g., City, Date)
df = df.dropDuplicates(["City", "Date"])

df.show(5)

StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 29, Finished, Available, Finished)

+--------+-----------------+----+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+--------+---------+-------+-------------+
|    City|           Center|Date|Year|  Month|      Season|PM1_0|PM2_5| PM10|Temperature_F|Humidity_%|   AQI|New_cases|Cumulative_cases|New_deaths|Cumulative_deaths|Year_Key|Month_Key|Day_Key|Health_Impact|
+--------+-----------------+----+----+-------+------------+-----+-----+-----+-------------+----------+------+---------+----------------+----------+-----------------+--------+---------+-------+-------------+
|Mandalay|      19th Street|null|2019|October|Rainy Season|19.18|28.92| 33.4|        91.19|     53.36| 86.37|        0|               0|         0|                0|    null|     null|   null|         Good|
|  Yangon|7 Miles Mayangone|null|2019|October|Rainy Season| 29.6|44.27|52.87|        95.67|     49.64|122.59|        0|               0|         0|                0|    nul

In [28]:
# Example of creating a summary table with average PM2_5 by City and Month
summary_df = df.groupBy("City", "Month").agg({"PM2_5": "avg"}).withColumnRenamed("avg(PM2_5)", "Average_PM2_5")

summary_df.show(5)

StatementMeta(, 7ea1964f-1260-45db-a312-ea211fac4f81, 30, Finished, Available, Finished)

+--------+-------+------------------+
|    City|  Month|     Average_PM2_5|
+--------+-------+------------------+
|  Yangon|October| 44.27000045776367|
|Mandalay|October|28.920000076293945|
+--------+-------+------------------+

