# 1. Connexion Spark à PostgreSQL et Chargement des Données

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PostgresConnection") \
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.postgresql:postgresql:42.2.25") \
    .getOrCreate()

jdbc_url = "jdbc:postgresql://192.168.1.10:5432/postgres"

jdbc_properties = {
    "user": "postgres",
    "password": "test2021",
    "driver": "org.postgresql.Driver"
}

try:
    df_toots = spark.read.jdbc(url=jdbc_url, table="toots", properties=jdbc_properties)
    df_toots.show(5) 
except Exception as e:
    print("Erreur rencontrée:", e)

try:
    df_hashtags = spark.read.jdbc(url=jdbc_url, table="hashtags", properties=jdbc_properties)
    df_hashtags.show(5) 
except Exception as e:
    print("Erreur rencontrée:", e)

+---------+--------------------+-------------------+--------+----------+-----+
|     user|             content|          timestamp|language|      date| time|
+---------+--------------------+-------------------+--------+----------+-----+
| uavideos|<p>Video : <a hre...|2024-10-13 08:15:50|      en|2024-10-13|08:15|
|   guigui|<p>the certificat...|2024-10-13 08:15:54|      en|2024-10-13|08:15|
|i8FunInfo|<p><a href="https...|2024-10-13 08:15:31|      en|2024-10-13|08:15|
|    f1bot|<p>Autosport (@au...|2024-10-13 08:15:53|      en|2024-10-13|08:15|
|    f1bot|<p>Autosport (@au...|2024-10-13 08:15:53|      en|2024-10-13|08:15|
+---------+--------------------+-------------------+--------+----------+-----+
only showing top 5 rows

+--------------+------------------------+--------------------+--------+----+-----+-----------+------------+
|          user|                 content|           timestamp|language|date| time|toot_length|     hashtag|
+--------------+------------------------+-------

# 2. Filtrer les toots en fonction de l'activité des utilisateurs

In [2]:
from pyspark.sql import functions as F

df_user_activity = df_toots.groupBy("user").agg(F.count("*").alias("toot_count"))

df_active_users = df_user_activity.filter(F.col("toot_count") > 10)
df_active_users = df_active_users.orderBy(F.col("toot_count").desc())
df_active_users.show()


+---------------+----------+
|           user|toot_count|
+---------------+----------+
|NewsDailyArabic|        55|
|            344|        36|
|  italy24_press|        29|
|            rff|        24|
| AboveMaidstone|        16|
|          Kahte|        16|
| digitalfoundry|        15|
|       takenoko|        13|
|       htTweets|        12|
|     iembot_mtr|        11|
|     iembot_ffc|        11|
|   onlinereport|        11|
|       rawchili|        11|
+---------------+----------+



# 3. Regrouper par Temps et Hashtags, Calculer le Nombre de Toots, et Identifier le Hashtag le Plus Fréquent

In [3]:
df_hashtags.printSchema()

df_grouped = df_hashtags.groupBy("hashtag").agg(
    F.count("*").alias("toot_count")
)

df_top_hashtag = df_grouped.orderBy(F.desc("toot_count")).limit(10)
df_top_hashtag.show()

root
 |-- user: string (nullable = true)
 |-- content: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- language: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- toot_length: integer (nullable = true)
 |-- hashtag: string (nullable = true)

+------------+----------+
|     hashtag|toot_count|
+------------+----------+
|          39|        56|
|lovefighters|         8|
|  nowplaying|         6|
|           3|         2|
|           m|         2|
|           2|         2|
|   msg413713|         1|
|         map|         1|
|         169|         1|
|         gik|         1|
+------------+----------+



# 4. Calculer les Agrégations (Nombre de Toots par Jour, Longueur Moyenne des Toots)

In [4]:
df_with_length = df_toots.withColumn("toot_length", F.length("content"))

df_toots_per_day = df_toots.groupBy(F.to_date("timestamp").alias("date")).agg(  # Utilisez "timestamp" ici
    F.count("*").alias("total_toots")
)

df_avg_toot_length = df_with_length.groupBy(F.to_date("timestamp").alias("date")).agg(  # Utilisez "timestamp" ici
    F.avg("toot_length").alias("avg_toot_length")
)

df_toots_per_day.show()
df_avg_toot_length.show()


+----------+-----------+
|      date|total_toots|
+----------+-----------+
|2024-10-10|          4|
|2024-10-12|          4|
|2024-10-09|          1|
|2024-10-13|       1999|
+----------+-----------+

+----------+-----------------+
|      date|  avg_toot_length|
+----------+-----------------+
|2024-10-10|            131.5|
|2024-10-12|           759.25|
|2024-10-09|            229.0|
|2024-10-13|646.1744127936032|
+----------+-----------------+

