In [1]:
spark

## Read Data from HDFS using PySpark

In [4]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = "hdfs:///user/student/combined_raw_data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the DataFrame content
df.head(5)

df.printSchema()

                                                                                

root
 |-- Line#: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



## Check Null Values with PySpark DataFrame

In [3]:
from pyspark.sql.functions import col, count, when, isnan

# Count null values for each column
null_counts = df.select([count(when(col(c).isNull(), c)).alias(c + '_null_count') for c in df.columns])
null_counts.show()

# Get percentage of null values for each column
total_rows = df.count()
null_percentages = df.select([
    (count(when(col(c).isNull(), c)) / total_rows * 100).alias(c + '_null_percentage') 
    for c in df.columns
])
null_percentages.show()

# Identify columns with null values
columns_with_nulls = [c for c in df.columns if df.filter(col(c).isNull()).count() > 0]
print("Columns with null values:", columns_with_nulls)

                                                                                

+----------------+---------------+---------------+--------------------------------+---------------------------------+--------------------+--------------------------------+-----------------+---------------------------+---------------------------+----------------------------------+------------------------------+
|Line#_null_count|Date_null_count|Time_null_count|Water Content (m3/m3)_null_count|Solar Radiation (W/m2)_null_count|Rain (mm)_null_count|Temperature (Celcius)_null_count|RH (%)_null_count|Wind Speed (m/s)_null_count|Gust Speed (m/s)_null_count|Wind Direction (Degree)_null_count|Dew Point (Celcius)_null_count|
+----------------+---------------+---------------+--------------------------------+---------------------------------+--------------------+--------------------------------+-----------------+---------------------------+---------------------------+----------------------------------+------------------------------+
|               0|              0|              0|              

                                                                                

+---------------------+--------------------+--------------------+-------------------------------------+--------------------------------------+-------------------------+-------------------------------------+----------------------+--------------------------------+--------------------------------+---------------------------------------+-----------------------------------+
|Line#_null_percentage|Date_null_percentage|Time_null_percentage|Water Content (m3/m3)_null_percentage|Solar Radiation (W/m2)_null_percentage|Rain (mm)_null_percentage|Temperature (Celcius)_null_percentage|RH (%)_null_percentage|Wind Speed (m/s)_null_percentage|Gust Speed (m/s)_null_percentage|Wind Direction (Degree)_null_percentage|Dew Point (Celcius)_null_percentage|
+---------------------+--------------------+--------------------+-------------------------------------+--------------------------------------+-------------------------+-------------------------------------+----------------------+---------------------------



Columns with null values: []


