In [1]:
spark

## Read Data from HDFS using PySpark

In [4]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read CSV from HDFS") \
    .getOrCreate()

# Path to the CSV file on HDFS
file_path = "hdfs:///user/student/combined_raw_data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the DataFrame content
df.head(5)

df.printSchema()

                                                                                

root
 |-- Line#: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Water Content (m3/m3): double (nullable = true)
 |-- Solar Radiation (W/m2): double (nullable = true)
 |-- Rain (mm): double (nullable = true)
 |-- Temperature (Celcius): double (nullable = true)
 |-- RH (%): double (nullable = true)
 |-- Wind Speed (m/s): double (nullable = true)
 |-- Gust Speed (m/s): double (nullable = true)
 |-- Wind Direction (Degree): double (nullable = true)
 |-- Dew Point (Celcius): double (nullable = true)



## Check Null Values with PySpark DataFrame

In [8]:
from pyspark.sql.functions import col, count, when, isnan

# Assuming 'df' is your PySpark DataFrame from the previous example
# Method 1: Using PySpark SQL functions
null_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0]

# Print null counts for each column
for column, null_count in zip(df.columns, null_counts):
    print(f"{column}: {null_count} null values")



Line#: 0 null values
Date: 0 null values
Time: 0 null values
Water Content (m3/m3): 0 null values
Solar Radiation (W/m2): 0 null values
Rain (mm): 0 null values
Temperature (Celcius): 0 null values
RH (%): 0 null values
Wind Speed (m/s): 0 null values
Gust Speed (m/s): 0 null values
Wind Direction (Degree): 0 null values
Dew Point (Celcius): 0 null values


