In [3]:
from pyspark.sql.functions import current_timestamp

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session against the standalone cluster master.
# `spark.jars.packages` asks Spark to fetch the PostgreSQL JDBC driver so the
# JDBC reads and writes in this notebook can load `org.postgresql.Driver`.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("JupyterSparkCluster")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.5.4")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [5]:
# PostgreSQL connection settings.
# Values are taken from the environment so real credentials do not have to be
# committed with the notebook. The fallbacks are the previous hard-coded
# local-development values, so an unconfigured environment behaves as before.
import os

jdbc_url = os.environ.get(
    "POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/postgres"
)
connection_properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "postgres"),
    # JDBC driver class; must match the driver package loaded by the Spark session.
    "driver": "org.postgresql.Driver",
}


In [6]:
# Load the source table source3.craft_market_craftsmans over JDBC.
craftsmans_reader = spark.read
df_craftsmans = craftsmans_reader.jdbc(
    jdbc_url,
    "source3.craft_market_craftsmans",
    properties=connection_properties,
)

In [7]:
# Preview the first five rows as a list of Row objects.
df_craftsmans.take(5)

[Row(craftsman_id=1, craftsman_name='Griffith MacVay', craftsman_address='3370 Laurel Point', craftsman_birthday=datetime.date(1994, 2, 15), craftsman_email='drickwood1@soundcloud.com'),
 Row(craftsman_id=2, craftsman_name='Deeyn Jaggs', craftsman_address='85 Stone Corner Center', craftsman_birthday=datetime.date(1996, 12, 26), craftsman_email='clanghorn2@cnbc.com'),
 Row(craftsman_id=3, craftsman_name='Shawn Alentyev', craftsman_address='36752 Lyons Terrace', craftsman_birthday=datetime.date(1993, 1, 18), craftsman_email='lmcdougall3@trellian.com'),
 Row(craftsman_id=4, craftsman_name='Jorrie Brigginshaw', craftsman_address='5693 Boyd Junction', craftsman_birthday=datetime.date(2001, 12, 28), craftsman_email='dfettes4@youtube.com'),
 Row(craftsman_id=5, craftsman_name='Fae Winscomb', craftsman_address='2705 Elgar Trail', craftsman_birthday=datetime.date(2004, 7, 15), craftsman_email='vlapree5@360.cn')]

In [13]:
# Remove the 'craftsman_id' column from the frame.
columns_to_drop = ("craftsman_id",)
df_craftsmans = df_craftsmans.drop(*columns_to_drop)

In [14]:
# Preview the first five rows again to check the frame's current columns.
df_craftsmans.take(5)

[Row(craftsman_name='Griffith MacVay', craftsman_address='3370 Laurel Point', craftsman_birthday=datetime.date(1994, 2, 15), craftsman_email='drickwood1@soundcloud.com', load_dttm=datetime.datetime(2024, 12, 26, 3, 20, 18, 486212)),
 Row(craftsman_name='Deeyn Jaggs', craftsman_address='85 Stone Corner Center', craftsman_birthday=datetime.date(1996, 12, 26), craftsman_email='clanghorn2@cnbc.com', load_dttm=datetime.datetime(2024, 12, 26, 3, 20, 18, 486212)),
 Row(craftsman_name='Shawn Alentyev', craftsman_address='36752 Lyons Terrace', craftsman_birthday=datetime.date(1993, 1, 18), craftsman_email='lmcdougall3@trellian.com', load_dttm=datetime.datetime(2024, 12, 26, 3, 20, 18, 486212)),
 Row(craftsman_name='Jorrie Brigginshaw', craftsman_address='5693 Boyd Junction', craftsman_birthday=datetime.date(2001, 12, 28), craftsman_email='dfettes4@youtube.com', load_dttm=datetime.datetime(2024, 12, 26, 3, 20, 18, 486212)),
 Row(craftsman_name='Fae Winscomb', craftsman_address='2705 Elgar Trail'

In [15]:
# Stamp every row with the load time.
load_time_column = current_timestamp()
df_craftsmans = df_craftsmans.withColumn("load_dttm", load_time_column)

# Write the rows to dwh.d_craftsmans in append mode: existing rows are kept,
# so re-running this cell inserts the same data again.
craftsmans_writer = df_craftsmans.write.mode("append")
craftsmans_writer.jdbc(
    url=jdbc_url,
    table="dwh.d_craftsmans",
    properties=connection_properties,
)

print("Таблица dwh.d_craftsmans успешно загружена.")

Таблица dwh.d_craftsmans успешно загружена.
