In [81]:
# Display the active SparkSession.
# NOTE(review): `spark` is only assigned in the following cell, so this cell
# presumably relies on a kernel that pre-defines it — confirm, otherwise a
# fresh "Restart & Run All" raises NameError here.
spark

## Read Data from HDFS using PySpark

In [82]:
from pyspark.sql import SparkSession

# Location of the combined raw CSV file on HDFS
file_path = "hdfs:///user/student/combined_raw_data.csv"

# Reuse the running Spark session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .appName("Read CSV from HDFS")
    .getOrCreate()
)

# Load the CSV: the first line supplies the column names and the column
# types are inferred from the data
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Display the resulting column names and types
df.printSchema()

[Stage 117:>                                                        (0 + 4) / 4]

root
 |-- Line#: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)





## Check Null Values with PySpark DataFrame

In [83]:
from pyspark.sql.functions import col, count, when, isnan

# Count the missing values of every column in a single pass over the data.
# NULL and NaN are different things in Spark: isNull() does not match NaN, and
# isnan() is only meaningful for floating-point columns, so it is applied to
# those columns only.
missing_exprs = []
for name, dtype in df.dtypes:
    is_missing = col(name).isNull()
    if dtype in ("float", "double"):
        is_missing = is_missing | isnan(col(name))
    # when() without otherwise() yields NULL for non-matching rows, and
    # count() skips NULLs, so this counts exactly the matching rows.
    missing_exprs.append(count(when(is_missing, name)).alias(name))

null_counts = df.select(missing_exprs).collect()[0]

# Print the number of missing values for each column
for column, null_count in zip(df.columns, null_counts):
    print(f"{column}: {null_count} null/NaN values")

                                                                                

Line#: 0 null values
Date: 0 null values
Time: 0 null values
Water Content (m3/m3): 0 null values
Solar Radiation (W/m2): 0 null values
Rain (mm): 0 null values
Temperature (Celcius): 0 null values
RH (%): 0 null values
Wind Speed (m/s): 0 null values
Gust Speed (m/s): 0 null values
Wind Direction (Degree): 0 null values
Dew Point (Celcius): 0 null values


## Count Dataset Rows

In [84]:
# Total number of rows loaded; this is the baseline for the duplicate check below
df.count()

[Stage 120:>                                                        (0 + 4) / 4]                                                                                

160418

## Drop Duplicate Rows

In [85]:
# Keep a single copy of rows that are identical in every column
df = df.dropDuplicates()

## Count Dataset Rows After Dropping Duplicates

In [86]:
# The row count is the same as before deduplication, so there were no duplicate rows.
df.count()

                                                                                

160418

## Convert Date Column from String (yy/MM/dd) to Date Type

In [90]:
# Import the column functions used in this cell
from pyspark.sql.functions import col, date_format, to_date

# Parse the raw "Date" strings with the pattern yy/MM/dd. The result is already
# a DateType column, so formatting it back to a string and parsing it a second
# time is unnecessary.
parsed_date = to_date(col("Date"), "yy/MM/dd")

# Outside ANSI mode to_date() returns NULL for values that do not match the
# pattern instead of failing, and the null check earlier in the notebook ran
# before this conversion. Fail loudly rather than carry silent NULL dates forward.
unparsed_rows = df.filter(col("Date").isNotNull() & parsed_date.isNull()).count()
if unparsed_rows:
    raise ValueError(
        f"{unparsed_rows} 'Date' value(s) do not match the expected pattern yy/MM/dd"
    )

df = df.withColumn("Date", parsed_date)

In [94]:
# Print the schema to confirm that "Date" is now a date column
df.printSchema()

root
 |-- Line#: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



## Drop Line# Column

In [98]:
# Columns removed from the cleaned dataset.
# NOTE(review): "Line#" looks like a row counter from the raw export — confirm
# it carries no information needed downstream.
columns_to_drop = ["Line#"]
df = df.drop(*columns_to_drop)

In [99]:
# Confirm that "Line#" is no longer part of the schema
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



## Save as CSV File in Local Directory

In [100]:
import os
import shutil

# Destination of the final single CSV file
output_file = "/home/student/Data/weather-datasets/phnom_penh/Big Data Weather Prediction Capstone Project/cleaned.csv"

# Intermediate directory: Spark always writes a directory of part files, so the
# data is written here first and the single part file is moved afterwards.
# NOTE(review): this assumes the write happens on the machine running the
# notebook (e.g. local mode) — confirm for cluster deployments.
temp_dir = "/tmp/temp_csv"

# coalesce(1) puts all rows into one partition so exactly one part file is written
df.coalesce(1).write.option("header", True).mode("overwrite").csv(f"file://{temp_dir}")

try:
    # Find the part file written by Spark
    part_files = [name for name in os.listdir(temp_dir) if name.startswith("part-")]
    if not part_files:
        raise FileNotFoundError(f"Spark wrote no part file into {temp_dir}")

    # Make sure the destination directory exists, then move the part file
    # to its final name `cleaned.csv`
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    shutil.move(os.path.join(temp_dir, part_files[0]), output_file)
finally:
    # Clean up the intermediate directory even when the move failed
    shutil.rmtree(temp_dir, ignore_errors=True)

print(f"DataFrame saved as a single CSV file at {output_file}")

[Stage 138:>                                                        (0 + 1) / 1]

DataFrame saved as a single CSV file at /home/student/Data/weather-datasets/phnom_penh/Big Data Weather Prediction Capstone Project/cleaned.csv


                                                                                

## Save as CSV File in HDFS

In [103]:
# Spark treats this path as an output directory and places the part file(s)
# inside it, even though the name ends in .csv.
output_path = "hdfs:///user/student/cleaned.csv"

# Write the DataFrame to HDFS as a single CSV part file:
#   - coalesce(1) merges all rows into one partition so only one part file is written
#   - mode("overwrite") replaces the output of an earlier run, so the cell can
#     be re-run instead of failing because the path already exists
df.coalesce(1).write.option("header", "true").mode("overwrite").csv(output_path)
print('Successfuly save as CSV file in HDFS!')

                                                                                

Successfuly save as CSV file in HDFS!
