In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

In [2]:
movies_df = spark.read.options(header=True, inferSchema=True, sep=",").csv("../Data/movies.csv")
ratings_df = spark.read.options(header=True, inferSchema=True, sep=",").csv("../Data/ratings.csv")

print(movies_df.count())
print(ratings_df.count())

9742
100836


In [3]:
# Inner join
df = ratings_df.join(movies_df, "movieId", "inner")

df.show()

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timestamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|      1|     1|   4.0|964982703|    Toy Story (1995)|Adventure|Animati...|
|      3|     1|   4.0|964981247|Grumpier Old Men ...|      Comedy|Romance|
|      6|     1|   4.0|964982224|         Heat (1995)|Action|Crime|Thri...|
|     47|     1|   5.0|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   5.0|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
|     70|     1|   3.0|964982400|From Dusk Till Da...|Action|Comedy|Hor...|
|    101|     1|   5.0|964980868|Bottle Rocket (1996)|Adventure|Comedy|...|
|    110|     1|   4.0|964982176|   Braveheart (1995)|    Action|Drama|War|
|    151|     1|   5.0|964984041|      Rob Roy (1995)|Action|Drama|Roma...|
|    157|     1|   5.0|964984100|Canadian Bacon (1...|          Comedy|War|
|    163|   

In [None]:
# Calculer le nombre de notes et la moyenne des notes par film
# Trier par nombre de notes décroissant
ratings_df.groupBy("movieId").agg(
    count("rating").alias("nb_rating"),
    avg("rating").alias("avg_rating")
).join(movies_df, "movieId", "inner").orderBy(desc("nb_rating")).show()

+-------+---------+------------------+--------------------+--------------------+
|movieId|nb_rating|        avg_rating|               title|              genres|
+-------+---------+------------------+--------------------+--------------------+
|    356|      329| 4.164133738601824| Forrest Gump (1994)|Comedy|Drama|Roma...|
|    318|      317| 4.429022082018927|Shawshank Redempt...|         Crime|Drama|
|    296|      307| 4.197068403908795| Pulp Fiction (1994)|Comedy|Crime|Dram...|
|    593|      279| 4.161290322580645|Silence of the La...|Crime|Horror|Thri...|
|   2571|      278| 4.192446043165468|  Matrix, The (1999)|Action|Sci-Fi|Thr...|
|    260|      251| 4.231075697211155|Star Wars: Episod...|Action|Adventure|...|
|    480|      238|              3.75|Jurassic Park (1993)|Action|Adventure|...|
|    110|      237| 4.031645569620253|   Braveheart (1995)|    Action|Drama|War|
|    589|      224| 3.970982142857143|Terminator 2: Jud...|       Action|Sci-Fi|
|    527|      220|         

In [6]:
# **LEFT SEMI** : films qui ont au moins une note
ratings_df.join(movies_df, "movieId", "left_semi").show()

+-------+------+------+---------+
|movieId|userId|rating|timestamp|
+-------+------+------+---------+
|      1|     1|   4.0|964982703|
|      3|     1|   4.0|964981247|
|      6|     1|   4.0|964982224|
|     47|     1|   5.0|964983815|
|     50|     1|   5.0|964982931|
|     70|     1|   3.0|964982400|
|    101|     1|   5.0|964980868|
|    110|     1|   4.0|964982176|
|    151|     1|   5.0|964984041|
|    157|     1|   5.0|964984100|
|    163|     1|   5.0|964983650|
|    216|     1|   5.0|964981208|
|    223|     1|   3.0|964980985|
|    231|     1|   5.0|964981179|
|    235|     1|   4.0|964980908|
|    260|     1|   5.0|964981680|
|    296|     1|   3.0|964982967|
|    316|     1|   3.0|964982310|
|    333|     1|   5.0|964981179|
|    349|     1|   4.0|964982563|
+-------+------+------+---------+
only showing top 20 rows



In [8]:
# **LEFT ANTI** : films sans aucune note
movies_df.join(ratings_df, "movieId", "left_anti").show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|   1076|Innocents, The (1...|Drama|Horror|Thri...|
|   2939|      Niagara (1953)|      Drama|Thriller|
|   3338|For All Mankind (...|         Documentary|
|   3456|Color of Paradise...|               Drama|
|   4194|I Know Where I'm ...|   Drama|Romance|War|
|   5721|  Chosen, The (1981)|               Drama|
|   6668|Road Home, The (W...|       Drama|Romance|
|   6849|      Scrooge (1970)|Drama|Fantasy|Mus...|
|   7020|        Proof (1991)|Comedy|Drama|Romance|
|   7792|Parallax View, Th...|            Thriller|
|   8765|This Gun for Hire...|Crime|Film-Noir|T...|
|  25855|Roaring Twenties,...|Crime|Drama|Thriller|
|  26085|Mutiny on the Bou...|Adventure|Drama|R...|
|  30892|In the Realms of ...|Animation|Documen...|
|  32160|Twentieth Century...|              Comedy|
|  32371|Call Northside 77...|Crime|Drama|Film-...|
|  34482|Bro

In [10]:
# **LEFT OUTER** : liste complète des films avec stats si disponibles
movies_df.join(ratings_df, "movieId", "left_outer").groupBy("rating").agg(
    count("rating").alias("rating_count"),
    avg("rating").alias("rating_avg"),
).show()

+------+------------+----------+
|rating|rating_count|rating_avg|
+------+------------+----------+
|   3.5|       13136|       3.5|
|   4.5|        8551|       4.5|
|  NULL|           0|      NULL|
|   2.5|        5550|       2.5|
|   1.0|        2811|       1.0|
|   4.0|       26818|       4.0|
|   0.5|        1370|       0.5|
|   3.0|       20047|       3.0|
|   2.0|        7551|       2.0|
|   1.5|        1791|       1.5|
|   5.0|       13211|       5.0|
+------+------------+----------+



In [14]:
diagnostic = movies_df.join(ratings_df, "movieId", "full")
diagnostic.show()

# MovieId présent uniquement dans ratings
movie_without_ratings = diagnostic.filter(col("title").isNull())
movie_without_ratings.count()


+-------+----------------+--------------------+------+------+----------+
|movieId|           title|              genres|userId|rating| timestamp|
+-------+----------------+--------------------+------+------+----------+
|      1|Toy Story (1995)|Adventure|Animati...|     1|   4.0| 964982703|
|      1|Toy Story (1995)|Adventure|Animati...|     5|   4.0| 847434962|
|      1|Toy Story (1995)|Adventure|Animati...|     7|   4.5|1106635946|
|      1|Toy Story (1995)|Adventure|Animati...|    15|   2.5|1510577970|
|      1|Toy Story (1995)|Adventure|Animati...|    17|   4.5|1305696483|
|      1|Toy Story (1995)|Adventure|Animati...|    18|   3.5|1455209816|
|      1|Toy Story (1995)|Adventure|Animati...|    19|   4.0| 965705637|
|      1|Toy Story (1995)|Adventure|Animati...|    21|   3.5|1407618878|
|      1|Toy Story (1995)|Adventure|Animati...|    27|   3.0| 962685262|
|      1|Toy Story (1995)|Adventure|Animati...|    31|   5.0| 850466616|
|      1|Toy Story (1995)|Adventure|Animati...|    

0

In [None]:
from pyspark.sql.window import Window

# A. **Top 5 global** : films les mieux notés
ratings_df.groupBy('movieId').agg(
    count("rating").alias("rating_nb"),
    avg("rating").alias("avg_rating")
).filter(col("rating_nb") >= 50).join(movies_df, "movieId").orderBy(desc("avg_rating")).show(5)
# B. **Top 10 par genre** : films les mieux notés par genre
movies_exploded = movies_df.select("movieId", "title", explode(split(col("genres"), "\\|")).alias("genre"))
window = Window.partitionBy("genre").orderBy(desc("avg_rating"))

df_with_genres = ratings_df.groupBy("movieId").agg(
    count("rating").alias("nb_ratings"),
    avg("rating").alias("avg_rating")
).join(movies_exploded, "movieId").filter(col("nb_ratings") >= 50)

df_with_genres.withColumn("rank", row_number().over(window)).orderBy("genre", "rank").filter(col("rank") <= 10).show(100)
# C. **Moyenne annuelle** : moyenne des notes par année de sortie  
movies_year = movies_df.withColumn("year", regexp_extract(col("title"), r"\((\d{4})\)", 1).cast("int"))
ratings_df.join(movies_year, "movieId").groupBy("year").agg(
    count("rating").alias("nb_rating"),
    avg("rating").alias("avg_rating")
).filter(col("nb_rating") >= 50).orderBy("year").show()


+-------+---------+-----------------+--------------------+--------------------+
|movieId|rating_nb|       avg_rating|               title|              genres|
+-------+---------+-----------------+--------------------+--------------------+
|    318|      317|4.429022082018927|Shawshank Redempt...|         Crime|Drama|
|    858|      192|        4.2890625|Godfather, The (1...|         Crime|Drama|
|   2959|      218|4.272935779816514|   Fight Club (1999)|Action|Crime|Dram...|
|   1276|       57|4.271929824561403|Cool Hand Luke (1...|               Drama|
|    750|       97|4.268041237113402|Dr. Strangelove o...|          Comedy|War|
+-------+---------+-----------------+--------------------+--------------------+
only showing top 5 rows

+-------+----------+------------------+--------------------+-----------+----+
|movieId|nb_ratings|        avg_rating|               title|      genre|rank|
+-------+----------+------------------+--------------------+-----------+----+
|   2959|       218| 

In [40]:
# D. **Profil utilisateur** : genre préféré de chaque utilisateur
user_genre_df = ratings_df.join(movies_exploded, "movieId").groupBy("userId", "genre").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("nb_ratings")
).filter(col("nb_ratings") >= 5)

window_user = Window.partitionBy("userId").orderBy(desc("avg_rating"))

user_genre_df.withColumn("rank", row_number().over(window_user)).filter(col("rank") == 1).show()

+------+---------+------------------+----------+----+
|userId|    genre|        avg_rating|nb_ratings|rank|
+------+---------+------------------+----------+----+
|     1|Animation| 4.689655172413793|        29|   1|
|     2|   Comedy|               4.0|         7|   1|
|     3|   Horror|            4.6875|         8|   1|
|     4|Animation|               4.0|         6|   1|
|     5|  Musical|               4.4|         5|   1|
|     6|  Musical| 4.166666666666667|        12|   1|
|     7|   Horror|               4.0|         5|   1|
|     8|    Crime| 3.888888888888889|         9|   1|
|     9|Adventure|               3.8|        10|   1|
|    10|Animation|3.8666666666666667|        15|   1|
|    11|      War| 4.333333333333333|         6|   1|
|    12|  Romance| 4.571428571428571|        21|   1|
|    13|   Action|               4.2|        10|   1|
|    14|    Drama|3.7083333333333335|        24|   1|
|    15|      War|               4.1|         5|   1|
|    16|Film-Noir|          

In [41]:

# E. **Films sans note** : lister 20 films sans aucune note  
movie_without_ratings.show(20)

+-------+-----+------+------+------+---------+
|movieId|title|genres|userId|rating|timestamp|
+-------+-----+------+------+------+---------+
+-------+-----+------+------+------+---------+



In [43]:
# F. **Validation** : vérifier qu’aucun `movieId` n’est dupliqué dans `movies`

total_movie = movies_df.count()
distinct_movieId = movies_df.select("movieId").distinct().count()

print(f"Nombre de films : {total_movie}")
print(f"Nombre d'id : {distinct_movieId}")

Nombre de films : 9742
Nombre d'id : 9742
