In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from pyspark.sql.functions import *

spark = SparkSession.builder.master("local[*]").getOrCreate()
sparkContext = spark.sparkContext

## Création d'un DataFrame d'étudiants

In [2]:
# Définition du schéma
schema_etudiants = StructType([
    StructField("etudiant_id", IntegerType(), True),
    StructField("nom", StringType(), True),
    StructField("prenom", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("niveau", StringType(), True),
    StructField("filiere", StringType(), True),
    StructField("ville_origine", StringType(), True)
])

# Données d'exemple
donnees_etudiants = [
    (1001, "Dupont", "Marie", 20, "L2", "Informatique", "Paris"),
    (1002, "Martin", "Pierre", 22, "M1", "Mathématiques", "Lyon"),
    (1003, "Bernard", "Sophie", 19, "L1", "Physique", "Marseille"),
    (1004, "Dubois", "Thomas", 21, "L3", "Informatique", "Toulouse"),
    (1005, "Moreau", "Emma", 23, "M2", "Mathématiques", "Nice"),
    (1006, "Petit", "Lucas", 20, "L2", "Physique", "Bordeaux"),
    (1007, "Garcia", "Léa", 19, "L1", "Informatique", "Strasbourg"),
    (1008, "Rodriguez", "Antoine", 24, "M2", "Physique", "Lille")
]

# Création du DataFrame
df_etudiants = spark.createDataFrame(donnees_etudiants, schema_etudiants)
df_etudiants.createOrReplaceTempView("etudiants")

## Création du DataFrame de notes

In [3]:
schema_notes = StructType([
    StructField("note_id", IntegerType(), True),
    StructField("etudiant_id", IntegerType(), True),
    StructField("matiere", StringType(), True),
    StructField("type_evaluation", StringType(), True),
    StructField("note", DoubleType(), True),
    StructField("coefficient", IntegerType(), True),
    StructField("semestre", StringType(), True),
    StructField("annee", IntegerType(), True)
])

donnees_notes = [
    (1, 1001, "Algorithmique", "Examen", 16.5, 3, "S1", 2024),
    (2, 1001, "Base de Données", "TP", 14.0, 2, "S1", 2024),
    (3, 1001, "Mathématiques", "Examen", 18.5, 4, "S1", 2024),
    (4, 1002, "Analyse", "Examen", 17.0, 4, "S1", 2024),
    (5, 1002, "Algèbre", "Contrôle", 15.5, 2, "S1", 2024),
    (6, 1003, "Physique Quantique", "Examen", 13.0, 4, "S1", 2024),
    (7, 1003, "Thermodynamique", "TP", 16.0, 2, "S1", 2024),
    (8, 1004, "Programmation", "Projet", 17.5, 3, "S1", 2024),
    (9, 1004, "Réseaux", "Examen", 14.5, 3, "S1", 2024),
    (10, 1005, "Statistiques", "Examen", 19.0, 4, "S1", 2024),
    (11, 1005, "Probabilités", "Contrôle", 16.5, 2, "S1", 2024),
    (12, 1001, "Algorithmique", "Projet", 15.0, 2, "S2", 2024),
    (13, 1002, "Topologie", "Examen", 18.0, 3, "S2", 2024),
    (14, 1003, "Mécanique", "TP", 12.5, 2, "S2", 2024)
]

df_notes = spark.createDataFrame(donnees_notes, schema_notes)
df_notes.createOrReplaceTempView("notes")

In [4]:
# Calculez la moyenne pondérée par coefficient pour chaque étudiant
# Formule: SUM(note * coefficient) / SUM(coefficient)

requete_q1_1 = """
SELECT 
    e.nom,
    e.prenom,
    e.filiere,
    ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) as moyenne_ponderee
FROM notes n
JOIN etudiants e ON n.etudiant_id = e.etudiant_id
GROUP BY e.etudiant_id, e.nom, e.prenom, e.filiere
ORDER BY moyenne_ponderee DESC
"""

q1_1 = spark.sql(requete_q1_1)
q1_1.show()

+-------+------+-------------+----------------+
|    nom|prenom|      filiere|moyenne_ponderee|
+-------+------+-------------+----------------+
| Moreau|  Emma|Mathématiques|           18.17|
| Martin|Pierre|Mathématiques|            17.0|
| Dupont| Marie| Informatique|            16.5|
| Dubois|Thomas| Informatique|            16.0|
|Bernard|Sophie|     Physique|           13.63|
+-------+------+-------------+----------------+



In [5]:
# Pour chaque matière, trouvez :
# - La note moyenne
# - La note la plus haute
# - La note la plus basse
# - Le nombre d'évaluations

requete_q1_2 = """
SELECT 
    matiere,
    ROUND(AVG(note), 2) as note_moyenne,
    MAX(note) as note_max,
    MIN(note) as note_min,
    COUNT(*) as nombre_evaluations,
    COUNT(DISTINCT etudiant_id) as nombre_etudiants
FROM notes
GROUP BY matiere
ORDER BY note_moyenne DESC
"""

q1_2 = spark.sql(requete_q1_2)
q1_2.show()

+------------------+------------+--------+--------+------------------+----------------+
|           matiere|note_moyenne|note_max|note_min|nombre_evaluations|nombre_etudiants|
+------------------+------------+--------+--------+------------------+----------------+
|      Statistiques|        19.0|    19.0|    19.0|                 1|               1|
|     Mathématiques|        18.5|    18.5|    18.5|                 1|               1|
|         Topologie|        18.0|    18.0|    18.0|                 1|               1|
|     Programmation|        17.5|    17.5|    17.5|                 1|               1|
|           Analyse|        17.0|    17.0|    17.0|                 1|               1|
|      Probabilités|        16.5|    16.5|    16.5|                 1|               1|
|   Thermodynamique|        16.0|    16.0|    16.0|                 1|               1|
|     Algorithmique|       15.75|    16.5|    15.0|                 2|               1|
|           Algèbre|        15.5

In [6]:
# Classez les étudiants par leur moyenne générale
# Affichez: rang, nom, prénom, moyenne, mention
# Mentions: >16 "Très Bien", >14 "Bien", >12 "Assez Bien", >10 "Passable", <=10 "Insuffisant"

# Votre code ici :
requete_q1_3 = """
SELECT 
    e.nom,
    e.prenom,
    ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) as moyenne_generale,
    CASE 
        WHEN ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) > 16 THEN 'Très Bien'
        WHEN ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) > 14 THEN 'Bien'
        WHEN ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) > 12 THEN 'Assez Bien'
        WHEN ROUND(SUM(n.note * n.coefficient) / SUM(n.coefficient), 2) > 10 THEN 'Passable'
        ELSE 'Insuffisant'
    END as mention
FROM notes n
JOIN etudiants e ON n.etudiant_id = e.etudiant_id
GROUP BY e.etudiant_id, e.nom, e.prenom
ORDER BY moyenne_generale DESC
"""

q1_3 = spark.sql(requete_q1_3)
q1_3.show()

+-------+------+----------------+----------+
|    nom|prenom|moyenne_generale|   mention|
+-------+------+----------------+----------+
| Moreau|  Emma|           18.17| Très Bien|
| Martin|Pierre|            17.0| Très Bien|
| Dupont| Marie|            16.5| Très Bien|
| Dubois|Thomas|            16.0|      Bien|
|Bernard|Sophie|           13.63|Assez Bien|
+-------+------+----------------+----------+



## Exercice 2 : Analyse de la fréquentation des cours

In [7]:
schema_presences = StructType([
    StructField("presence_id", IntegerType(), True),
    StructField("etudiant_id", IntegerType(), True),
    StructField("cours", StringType(), True),
    StructField("date_cours", StringType(), True),
    StructField("present", StringType(), True),
    StructField("duree_cours", IntegerType(), True)
])

schema_cours = StructType([
    StructField("cours_id", StringType(), True),
    StructField("nom_cours", StringType(), True),
    StructField("enseignant", StringType(), True),
    StructField("salle", StringType(), True),
    StructField("capacite_salle", IntegerType(), True),
    StructField("filiere", StringType(), True)
])

donnees_presences = [
    (1, 1001, "ALGO101", "2024-01-15", "Oui", 120),
    (2, 1001, "BDD101", "2024-01-15", "Retard", 90),
    (3, 1001, "MATH101", "2024-01-16", "Oui", 120),
    (4, 1002, "ANA201", "2024-01-15", "Oui", 120),
    (5, 1002, "ALG201", "2024-01-16", "Non", 90),
    (6, 1003, "PHY101", "2024-01-15", "Oui", 120),
    (7, 1004, "PROG201", "2024-01-15", "Oui", 180),
    (8, 1004, "RES201", "2024-01-16", "Retard", 90),
    (9, 1005, "STAT301", "2024-01-15", "Oui", 120),
    (10, 1001, "ALGO101", "2024-01-22", "Non", 120),
    (11, 1002, "ANA201", "2024-01-22", "Oui", 120),
    (12, 1003, "PHY101", "2024-01-22", "Retard", 120)
]

donnees_cours = [
    ("ALGO101", "Algorithmique Niveau 1", "Prof. Dupont", "A101", 50, "Informatique"),
    ("BDD101", "Base de Données", "Prof. Martin", "B205", 30, "Informatique"),
    ("MATH101", "Mathématiques Générales", "Prof. Bernard", "C301", 80, "Mathématiques"),
    ("ANA201", "Analyse Avancée", "Prof. Dubois", "C302", 40, "Mathématiques"),
    ("ALG201", "Algèbre Linéaire", "Prof. Moreau", "C303", 35, "Mathématiques"),
    ("PHY101", "Physique Générale", "Prof. Petit", "D101", 60, "Physique"),
    ("PROG201", "Programmation Avancée", "Prof. Garcia", "A102", 25, "Informatique"),
    ("RES201", "Réseaux Informatiques", "Prof. Lopez", "A103", 30, "Informatique"),
    ("STAT301", "Statistiques Avancées", "Prof. Silva", "C304", 45, "Mathématiques")
]

df_presences = spark.createDataFrame(donnees_presences, schema_presences)
df_cours = spark.createDataFrame(donnees_cours, schema_cours)

df_presences.createOrReplaceTempView("presences")
df_cours.createOrReplaceTempView("cours")

In [8]:
# Calculez le taux de présence par étudiant
# Présent = "Oui" ou "Retard", Absent = "Non"
# Affichez: nom, prénom, taux_presence, nombre_cours_suivis

requete_q2_1 = """
WITH stats_presence AS (
    SELECT 
        p.etudiant_id,
        COUNT(*) as total_cours,
        SUM(CASE WHEN p.present IN ('Oui', 'Retard') THEN 1 ELSE 0 END) as cours_presents
    FROM presences p
    GROUP BY p.etudiant_id
)
SELECT 
    e.nom,
    e.prenom,
    sp.total_cours as nombre_cours_suivis,
    ROUND((sp.cours_presents * 100.0 / sp.total_cours), 2) as taux_presence
FROM stats_presence sp
JOIN etudiants e ON sp.etudiant_id = e.etudiant_id
ORDER BY taux_presence DESC
"""

q2_1 = spark.sql(requete_q2_1)
q2_1.show()

+-------+------+-------------------+-------------+
|    nom|prenom|nombre_cours_suivis|taux_presence|
+-------+------+-------------------+-------------+
|Bernard|Sophie|                  2|       100.00|
| Dubois|Thomas|                  2|       100.00|
| Moreau|  Emma|                  1|       100.00|
| Dupont| Marie|                  4|        75.00|
| Martin|Pierre|                  3|        66.67|
+-------+------+-------------------+-------------+



In [9]:
# Pour chaque cours, calculez :
# - Taux de présence moyen
# - Nombre d'étudiants inscrits
# - Taux d'occupation de la salle (étudiants/capacité)

requete_q2_2 = """
SELECT 
    c.nom_cours,
    c.enseignant,
    c.salle,
    c.capacite_salle,
    COUNT(DISTINCT p.etudiant_id) as etudiants_inscrits,
    ROUND((COUNT(DISTINCT p.etudiant_id) * 100.0 / c.capacite_salle), 2) as taux_occupation_salle,
    ROUND((SUM(CASE WHEN p.present IN ('Oui', 'Retard') THEN 1 ELSE 0 END) * 100.0 / COUNT(*)), 2) as taux_presence_moyen
FROM presences p
JOIN cours c ON p.cours = c.cours_id
GROUP BY c.cours_id, c.nom_cours, c.enseignant, c.salle, c.capacite_salle
ORDER BY taux_presence_moyen ASC
"""

q2_2 = spark.sql(requete_q2_2)
q2_2.show()

+--------------------+-------------+-----+--------------+------------------+---------------------+-------------------+
|           nom_cours|   enseignant|salle|capacite_salle|etudiants_inscrits|taux_occupation_salle|taux_presence_moyen|
+--------------------+-------------+-----+--------------+------------------+---------------------+-------------------+
|    Algèbre Linéaire| Prof. Moreau| C303|            35|                 1|                 2.86|               0.00|
|Algorithmique Niv...| Prof. Dupont| A101|            50|                 1|                 2.00|              50.00|
|     Analyse Avancée| Prof. Dubois| C302|            40|                 1|                 2.50|             100.00|
|     Base de Données| Prof. Martin| B205|            30|                 1|                 3.33|             100.00|
|Mathématiques Gén...|Prof. Bernard| C301|            80|                 1|                 1.25|             100.00|
|   Physique Générale|  Prof. Petit| D101|      

In [10]:
# Identifiez les étudiants avec un taux de présence < 75%
# ET une moyenne générale < 12

requete_q2_3 = """
WITH stats_presence AS (
    SELECT 
        p.etudiant_id,
        COUNT(*) as total_cours,
        SUM(CASE WHEN p.present IN ('Oui', 'Retard') THEN 1 ELSE 0 END) as cours_presents
    FROM presences p
    GROUP BY p.etudiant_id
),
moyennes_etudiants AS (
    SELECT 
        etudiant_id,
        ROUND(SUM(note * coefficient) / SUM(coefficient), 2) as moyenne_generale
    FROM notes
    GROUP BY etudiant_id
)
SELECT 
    e.nom,
    e.prenom,
    e.filiere,
    ROUND((sp.cours_presents * 100.0 / sp.total_cours), 2) as taux_presence,
    me.moyenne_generale,
    'À RISQUE' as statut
FROM stats_presence sp
JOIN etudiants e ON sp.etudiant_id = e.etudiant_id
JOIN moyennes_etudiants me ON sp.etudiant_id = me.etudiant_id
WHERE (sp.cours_presents * 100.0 / sp.total_cours) < 75
   AND me.moyenne_generale < 12
ORDER BY taux_presence ASC, moyenne_generale ASC
"""

q2_3 = spark.sql(requete_q2_3)
q2_3.show()

+---+------+-------+-------------+----------------+------+
|nom|prenom|filiere|taux_presence|moyenne_generale|statut|
+---+------+-------+-------------+----------------+------+
+---+------+-------+-------------+----------------+------+

