In [2]:
spark

## Read Data from HDFS using PySpark

In [28]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = "hdfs:///user/student/combined_raw_data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the DataFrame Schema
df.printSchema()

[Stage 43:>                                                         (0 + 4) / 4]

root
 |-- Line#: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



                                                                                

## Check Null Values with PySpark DataFrame

In [29]:
from pyspark.sql.functions import col, count, when, isnan

null_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0]

# Print null counts for each column
for column, null_count in zip(df.columns, null_counts):
    print(f"{column}: {null_count} null values")

[Stage 44:>                                                         (0 + 4) / 4]

Line#: 0 null values
Date: 0 null values
Time: 0 null values
Water Content (m3/m3): 0 null values
Solar Radiation (W/m2): 0 null values
Rain (mm): 0 null values
Temperature (Celcius): 0 null values
RH (%): 0 null values
Wind Speed (m/s): 0 null values
Gust Speed (m/s): 0 null values
Wind Direction (Degree): 0 null values
Dew Point (Celcius): 0 null values


                                                                                

## Count Datasets rows

In [30]:
df.count()

160418

## Drop Duplicate Datas

In [31]:
df = df.select("*").distinct()

## Count Datasets rows after dropping Duplicated Datas

In [32]:
# There isn't any duplicate datas.
df.count()

                                                                                

160418

## Convert Date Datatype from String to Date in (dd-MM-yyyy)

In [35]:
#import library
from pyspark.sql.functions import to_date, concat, lpad

df = df.withColumn("Date", to_date(concat(
    lpad(substring("Date", 1, 2), 2, "20"),
    "-",
    substring("Date", 4, 2),
    "-",
    substring("Date", 7, 2)
), "yyyy-MM-dd"))

AnalysisException: cannot resolve '`-`' given input columns: [Date, Dew Point (Celcius), Gust Speed (m/s), Line#, RH (%), Rain (mm), Solar Radiation (W/m2), Temperature (Celcius), Time, Water Content (m3/m3), Wind Direction (Degree), Wind Speed (m/s)];
'Project [Line##971, to_date(concat(lpad(substring('Date, 1, 2), 2, 20), '-, substring('Date, 4, 2), '-, substring('Date, 7, 2)), Some(yyyy-MM-dd)) AS Date#1186, Time#973, Water Content (m3/m3)#974, Solar Radiation (W/m2)#975, Rain (mm)#976, Temperature (Celcius)#977, RH (%)#978, Wind Speed (m/s)#979, Gust Speed (m/s)#980, Wind Direction (Degree)#981, Dew Point (Celcius)#982]
+- Deduplicate [Solar Radiation (W/m2)#975, Water Content (m3/m3)#974, Dew Point (Celcius)#982, Gust Speed (m/s)#980, RH (%)#978, Wind Speed (m/s)#979, Temperature (Celcius)#977, Date#972, Line##971, Wind Direction (Degree)#981, Rain (mm)#976, Time#973]
   +- Project [Line##971, Date#972, Time#973, Water Content (m3/m3)#974, Solar Radiation (W/m2)#975, Rain (mm)#976, Temperature (Celcius)#977, RH (%)#978, Wind Speed (m/s)#979, Gust Speed (m/s)#980, Wind Direction (Degree)#981, Dew Point (Celcius)#982]
      +- Relation[Line##971,Date#972,Time#973,Water Content (m3/m3)#974,Solar Radiation (W/m2)#975,Rain (mm)#976,Temperature (Celcius)#977,RH (%)#978,Wind Speed (m/s)#979,Gust Speed (m/s)#980,Wind Direction (Degree)#981,Dew Point (Celcius)#982] csv


In [34]:
selected_columns = [
    "Line#", "Date", "Time", "Water Content (m3/m3)", 
    "Solar Radiation (W/m2)", "Rain (mm)", 
    "Temperature (Celcius)", "RH (%)"
]
df.select(*selected_columns).show(5)

# Print the schema of the selected columns
df.select(*selected_columns).printSchema()



+-----+--------+--------+---------------------+----------------------+---------+---------------------+------+
|Line#|    Date|    Time|Water Content (m3/m3)|Solar Radiation (W/m2)|Rain (mm)|Temperature (Celcius)|RH (%)|
+-----+--------+--------+---------------------+----------------------+---------+---------------------+------+
| 2419|21/05/09|09:30:00|               0.2837|                 326.0|      0.0|                31.61|  75.7|
| 2454|21/05/09|12:25:00|               0.2837|                 411.0|      0.0|                33.47|  66.4|
| 2513|21/05/09|17:20:00|               0.2834|                  57.0|      0.0|                32.61|  69.8|
| 2537|21/05/09|19:20:00|               0.2834|                   1.0|      0.0|                29.41|  85.5|
| 3002|21/05/11|10:05:00|               0.2813|                 791.0|      0.0|                33.84|  66.0|
+-----+--------+--------+---------------------+----------------------+---------+---------------------+------+
only showi

                                                                                