In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, max as spark_max
import datetime

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DataIntegration")
    .getOrCreate()
)

# Enable Spark's ignoreMissingFiles option for file-based reads in this session.
spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")


# Back up the main Parquet dataset under a name that embeds the current
# timestamp, so successive runs never overwrite an earlier backup.
current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
source_path = "hdfs://localhost:9080/user/anthonycormeaux/data/dfparquet"
backup_dir = "hdfs://localhost:9080/user/anthonycormeaux/data/databackup"
backup_path = f"{backup_dir}/dfparquet_backup_{current_timestamp}"

# The copy is done with the Hadoop FileSystem API, reached through the JVM
# gateway of the running Spark session.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

# java.net.URI class, taken from the JVM.
URI = spark._jvm.java.net.URI

# Get the HDFS FileSystem by passing its URI explicitly.
hdfs_uri = URI("hdfs://localhost:9080")
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hdfs_uri, hadoop_conf)

# Hadoop Path objects for the source, the backup target and the backup directory.
source = spark._jvm.org.apache.hadoop.fs.Path(source_path)
destination = spark._jvm.org.apache.hadoop.fs.Path(backup_path)
backup_dir_path = spark._jvm.org.apache.hadoop.fs.Path(backup_dir)

# Fail early with a readable message when there is nothing to back up,
# instead of surfacing a Java exception from the copy call.
if not fs.exists(source):
    raise FileNotFoundError(f"Backup source does not exist: {source_path}")

# Create the backup directory if it is missing. mkdirs() reports failure
# through its boolean result, so that result must not be ignored.
if not fs.exists(backup_dir_path) and not fs.mkdirs(backup_dir_path):
    raise OSError(f"Could not create backup directory: {backup_dir}")

# Copy within HDFS; the fifth argument (deleteSource=False) keeps the original.
# copy() also returns a boolean, which is checked so a failed backup is not
# mistaken for a successful one.
FileUtil = spark._jvm.org.apache.hadoop.fs.FileUtil
if not FileUtil.copy(fs, source, fs, destination, False, hadoop_conf):
    raise OSError(f"Backup copy failed: {source_path} -> {backup_path}")


In [None]:
# Load the main Parquet dataset.
df_parquet = spark.read.parquet(source_path)

# Add the Timestamp column converted to a Unix timestamp, for comparison.
df_parquet = df_parquet.withColumn("Timestamp_unix", unix_timestamp(col("Timestamp")))

# Maximum timestamp found in the main dataset.
# NOTE(review): this value is not used in the visible cells, yet collect()
# triggers a Spark job over the dataset — confirm it is still needed.
dfparquet_max_timestamp = df_parquet.select(spark_max("Timestamp_unix")).collect()[0][0]

# Read the Parquet files from the "dataframes" directory.
dataframes_path = "hdfs://localhost:9080/user/anthonycormeaux/data/dataframes"
df_dataframes = spark.read.parquet(dataframes_path)

# Keep only the address columns and the join key.
df_filtered_selected = df_dataframes.select('Address', 'City', 'StateCode', 'SchoolCode')

# Cast both join keys to string so they are compared with the same type.
df_parquet = df_parquet.withColumn("OPE ID", col("OPE ID").cast("string"))
df_filtered_selected = df_filtered_selected.withColumn("SchoolCode", col("SchoolCode").cast("string"))

# Left join on "OPE ID" == "SchoolCode".
df_joined = df_parquet.join(
    df_filtered_selected,
    df_parquet["OPE ID"] == df_filtered_selected["SchoolCode"],
    "left"
)

# show() prints the preview itself and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line to the output.
df_joined.show(10)



In [None]:
# Destination of the join result. Whatever a previous run left here is backed
# up first, because the write below overwrites it.
output_path = "hdfs://localhost:9080/user/anthonycormeaux/data/result/joined_data"

# This backup uses its own result_* names so that source_path, backup_dir and
# backup_path (the main dataset's locations, read by the load cell) are not
# overwritten; otherwise re-running the load cell after a failure here would
# read the wrong dataset.
current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
result_backup_dir = "hdfs://localhost:9080/user/anthonycormeaux/data/backup"
# The "dfparquet_backup_" file-name prefix is kept unchanged so new backups
# follow the same naming as those already present in this directory.
result_backup_path = f"{result_backup_dir}/dfparquet_backup_{current_timestamp}"

# Hadoop FileSystem handle, obtained through the Spark JVM gateway.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

URI = spark._jvm.java.net.URI

hdfs_uri = URI("hdfs://localhost:9080")
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hdfs_uri, hadoop_conf)

result_source = spark._jvm.org.apache.hadoop.fs.Path(output_path)
result_destination = spark._jvm.org.apache.hadoop.fs.Path(result_backup_path)
result_backup_dir_path = spark._jvm.org.apache.hadoop.fs.Path(result_backup_dir)

FileUtil = spark._jvm.org.apache.hadoop.fs.FileUtil

# On a first run there is no previous result yet: skip the backup rather than
# letting the copy fail and prevent the result from ever being written.
if fs.exists(result_source):
    # mkdirs() and copy() both report failure through their boolean result.
    if not fs.exists(result_backup_dir_path) and not fs.mkdirs(result_backup_dir_path):
        raise OSError(f"Could not create backup directory: {result_backup_dir}")
    # deleteSource=False: the previous result stays in place until overwritten below.
    if not FileUtil.copy(fs, result_source, fs, result_destination, False, hadoop_conf):
        raise OSError(f"Backup copy failed: {output_path} -> {result_backup_path}")

# Save the join result as Parquet, replacing any previous result.
# NOTE(review): df_joined carries a column named "OPE ID"; some Spark releases
# reject Parquet column names containing spaces — confirm for the version in use.
df_joined.write.mode("overwrite").parquet(output_path)

spark.stop()