In [1]:
! pip install python-dotenv



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Authority part of the hdfs:// URIs built later in the notebook.
HDFS_ENDPOINT = os.getenv("HDFS_ENDPOINT")
if not HDFS_ENDPOINT:
    # Fail fast: without this check the notebook would go on to build the
    # location "hdfs://None/..." and only break at write time.
    raise RuntimeError(
        "HDFS_ENDPOINT is not set; define it in the environment or in a .env file"
    )

# Spark session with Delta Lake enabled:
#   - spark.jars.packages fetches the delta-core artifact at start-up,
#   - spark.sql.extensions / spark.sql.catalog.spark_catalog register the
#     Delta session extension and catalog.
# NOTE(review): delta-core 2.0.0 (Scala 2.12) must match the cluster's Spark
# version — confirm against the Delta Lake compatibility table.
spark = (
    SparkSession.builder
    .appName("DeltaLakeHDFS")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [3]:
# Name of the Delta table and the semicolon-delimited CSV it is loaded from
# (path is relative to the notebook's working directory).
table_name = "apartment"
csv_file_path = "./apartments.csv"

# The table lives on HDFS under /delta-lake/<table_name>.
table_location = f"hdfs://{HDFS_ENDPOINT}/delta-lake/{table_name}"

In [6]:
# Read the source CSV: fields are separated by ';', the first row holds the
# column names, and column types are inferred from the data.
df = (
    spark.read
    .option("delimiter", ";")
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)

In [7]:
# Preview the first rows (show()'s defaults, spelled out: 20 rows, long cell
# values truncated), then print the inferred column types.
df.show(n=20, truncate=True)

df.printSchema()

+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+--------------------+-------------+-----+--------+---------+------------+----------+
|        id|            category|               title|                body|           amenities|bathrooms|bedrooms|currency|fee|has_photo|pets_allowed|price|price_display|price_type|square_feet|             address|     cityname|state|latitude|longitude|      source|      time|
+----------+--------------------+--------------------+--------------------+--------------------+---------+--------+--------+---+---------+------------+-----+-------------+----------+-----------+--------------------+-------------+-----+--------+---------+------------+----------+
|5668626895|housing/rent/apar...|Studio apartment ...|This unit is loca...|                null|     null|       0|     USD| No|Thumbnail|        None|  790|      

In [8]:
# Persist the DataFrame as a Delta table at the HDFS location.
# No save mode is set, so Spark's default applies (error if the target already
# exists, per the DataFrameWriter docs) — re-running this cell against an
# existing table is expected to fail.
delta_writer = df.write.format("delta")
delta_writer.save(table_location)