In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Get the active Spark session, or create one named "DataWriteExample".
spark = (
    SparkSession.builder
    .appName("DataWriteExample")
    .getOrCreate()
)

# Three-row sample dataset; each Row supplies the id, name and age columns.
data = [
    Row(id=1, name="Alice", age=25),
    Row(id=2, name="Bob", age=30),
    Row(id=3, name="Charlie", age=35),
]
df = spark.createDataFrame(data)

# Print the DataFrame contents to the cell output.
df.show()


+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [0]:
# Save df as CSV with a header row, replacing any output already at this path.
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("/FileStore/tables/sample_data_csv")
)


In [0]:
# Save df as JSON, replacing any output already at this path.
(
    df.write
    .mode("overwrite")
    .json("/FileStore/tables/sample_data_json")
)


In [0]:
# Save df as Parquet, replacing any output already at this path.
(
    df.write
    .mode("overwrite")
    .parquet("/FileStore/tables/sample_data_parquet")
)


In [0]:
# Write df to a Delta table at the given path.
# The save mode is set to "overwrite", matching the CSV/JSON/Parquet cells:
# without an explicit mode the writer uses the default error-if-exists
# behaviour, so re-running this cell (e.g. Restart & Run All) fails once the
# target path already exists.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/FileStore/tables/newsample_data_delta")
)


In [0]:
# List the entries under /FileStore/tables/ (the directory the write cells target).
# As the cell's last expression, the returned listing is shown as the cell output.
dbutils.fs.ls("/FileStore/tables/")


[FileInfo(path='dbfs:/FileStore/tables/loan.csv', name='loan.csv', size=43967, modificationTime=1732796327000),
 FileInfo(path='dbfs:/FileStore/tables/newsample_data_delta/', name='newsample_data_delta/', size=0, modificationTime=1732800124000),
 FileInfo(path='dbfs:/FileStore/tables/sample_data_csv/', name='sample_data_csv/', size=0, modificationTime=1732794820000),
 FileInfo(path='dbfs:/FileStore/tables/sample_data_delta/', name='sample_data_delta/', size=0, modificationTime=1732794863000),
 FileInfo(path='dbfs:/FileStore/tables/sample_data_json/', name='sample_data_json/', size=0, modificationTime=1732794835000),
 FileInfo(path='dbfs:/FileStore/tables/sample_data_parquet/', name='sample_data_parquet/', size=0, modificationTime=1732794847000)]

In [0]:
# Exploratory Data Analysis (EDA) in Databricks
# Column helpers used by the null-count summary at the end of this cell.
from pyspark.sql.functions import col, isnan, when, count

# Read back the CSV output written earlier, using the header row for column
# names and letting Spark infer the column types.
# NOTE(review): this rebinds `df`, replacing the in-memory sample DataFrame with
# one backed by the CSV path; re-running an earlier write cell afterwards would
# operate on this DataFrame instead. Confirm that is intended.
file_path = "/FileStore/tables/sample_data_csv"
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)
df.show()

# Structure and a preview of the first rows.
df.printSchema()
df.show(5)

# Dataset dimensions.
print(f"Row count: {df.count()}")
print(f"Column count: {len(df.columns)}")

# Summary statistics (count, mean, stddev, min, max) per column.
df.describe().show()

# Number of null values in each column: `when` yields a non-null value only
# for null cells, and `count` tallies the non-null results.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(*null_counts).show()



+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+

Row count: 3
Column count: 3
+-------+---+-------+----+
|summary| id|   name| age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   NULL|30.0|
| stddev|1.0|   NULL| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+

+---+----+---+
| id|name|age|
+---+----+---+
|  0|   0|  0|
+---+----+---+

