## Visualisation des données

In [1]:
# Importation des librairies nécessaires
import pandas as pd
from collections import Counter


import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns

from filmsapisdk import MovieClient, MovieConfig

import time

import json

from pathlib import Path


In [2]:
# Dossiers
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)


In [3]:
# Connexion à l'API via l'ADK

config = MovieConfig(movie_base_url="https://datatech.onrender.com")
client = MovieClient(config=config)

client.health_check()

MOVIE_API_BASE_URL in MovieConfig init: https://datatech.onrender.com


{'Message': 'API MovieLens est opérationnelle!'}

In [4]:
# Utilitaire : sauvegarder les figures Plotly pour la consultation sur GitHub
# - HTML toujours, PNG si kaleido est installé

def save_fig(fig, name: str):
    output_dir.mkdir(exist_ok=True)
    html_path = output_dir / f"{name}.html"
    fig.write_html(html_path)
    print(f"HTML sauvegardé: {name}.html")
    
    try:
        fig.write_image(output_dir / f"{name}.png")
        print(f"PNG sauvegardé: {name}.png")
    except Exception as e:
        print(f"Export PNG ignoré : {e}")


In [5]:
# Récuperation des statistiques analytiques de l'API
analytics = client.get_analytics()
analytics

AnalyticsResponse(movie_count=9742, rating_count=100836, tag_count=3683, link_count=9742)

## Top 10 des genres par nombre de films

In [6]:
# Systeme sans mise en cache
"""# Initialisation du compteur de genres


genre_counter = Counter() 

# parametres pour le batching

limit = 200

skip = 0

while True:

    # Récupération des films par lots
    movies_batch = client.list_movies(limit=limit, skip=skip, output_format='dict')

    if not movies_batch:
        break


    # On extrait les genres de chaque film et on met à jour le compteur
    for movie in movies_batch:
        genres = movie.get("genres", "")
        genre_list = genres.split('|') if genres else []
        genre_counter.update(genre_list)

    skip += limit
    time.sleep(0.5)  # Petite pause pour éviter de surcharger l'API

# Conversion du compteur en DataFrame pour une analyse plus facile
genre_counts_df = pd.DataFrame(genre_counter.items(), columns=['genre', 'count'])   
genre_counts_df = genre_counts_df.sort_values(by='count', ascending=False).head(10)
genre_counts_df



fig = px.bar(
    genre_counts_df,
    x='count',
    y='genre',
    orientation='h',
    title='Distribution des genres de films',
    labels={'count': 'Nombre de films', 'genre': 'Genre'},
    color='count',
    color_continuous_scale='Viridis'
)

fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    height=500
)

fig.show()"""

'# Initialisation du compteur de genres\n\n\ngenre_counter = Counter() \n\n# parametres pour le batching\n\nlimit = 200\n\nskip = 0\n\nwhile True:\n\n    # Récupération des films par lots\n    movies_batch = client.list_movies(limit=limit, skip=skip, output_format=\'dict\')\n\n    if not movies_batch:\n        break\n\n\n    # On extrait les genres de chaque film et on met à jour le compteur\n    for movie in movies_batch:\n        genres = movie.get("genres", "")\n        genre_list = genres.split(\'|\') if genres else []\n        genre_counter.update(genre_list)\n\n    skip += limit\n    time.sleep(0.5)  # Petite pause pour éviter de surcharger l\'API\n\n# Conversion du compteur en DataFrame pour une analyse plus facile\ngenre_counts_df = pd.DataFrame(genre_counter.items(), columns=[\'genre\', \'count\'])   \ngenre_counts_df = genre_counts_df.sort_values(by=\'count\', ascending=False).head(10)\ngenre_counts_df\n\n\n\nfig = px.bar(\n    genre_counts_df,\n    x=\'count\',\n    y=\'genr

In [7]:

# Systeme de mise en cache(evite de faire les meme requetes a l'API si les donnees n'ont pas changé, de consommer les ressources inutilement


api_movie_count = analytics.movie_count
print(f"Nombre total de films dans l'API: {api_movie_count}")

genre_data_file = output_dir / "genre_counts.parquet"
meta_file = output_dir / "genre_counts_meta.json"

# Lecture du fichier meta s'il existe
if meta_file.exists():
    with open(meta_file, 'r') as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    meta = {}

    cached_movie_count = 0

# Décision : utiliser le cache ou recalculer
if genre_data_file.exists() and cached_movie_count == api_movie_count:
    print("Chargegement des données mises en cache.")
    genre_counts_df = pd.read_parquet(genre_data_file)
else:
    print("Mise à jour des données depuis l'API...")
    
    # Initialisation du compteur de genres
    genre_counter = Counter()

    # Parametres pour le batching

    limit = 200
    skip = 0
    while True:
        movies_batch = client.list_movies(limit=limit, skip=skip, output_format='dict')
        if not movies_batch:
            break
        for movie in movies_batch:
            genres = movie.get("genres", "")
            genre_list = genres.split('|') if genres else []
            genre_counter.update(genre_list)
        skip += limit
        time.sleep(0.5)

    # Conversion du compteur en DataFrame
    genre_counts_df = pd.DataFrame(genre_counter.items(), columns=['genre', 'count'])   
    genre_counts_df = genre_counts_df.sort_values(by='count', ascending=False).head(10)

    # Sauvegarde des données mises à jour
    genre_counts_df.to_parquet(genre_data_file,index=False)
    
    with open(meta_file, 'w') as f:
        json.dump({"movie_count": api_movie_count}, f)
    
    # Affichage plotly


fig = px.bar(
        genre_counts_df,
        x='count',
        y='genre',
        orientation='h',
        title='Distribution des genres de films',
        labels={'count': 'Nombre de films', 'genre': 'Genre'},
        color='count',
        color_continuous_scale='Viridis'
    )

fig.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        height=500
    )

fig.show()
save_fig(fig, "genre_counts")
        


Nombre total de films dans l'API: 9742
Chargegement des données mises en cache.


HTML sauvegardé: genre_counts.html
PNG sauvegardé: genre_counts.png
PNG sauvegardé: genre_counts.png


## Nombre total de films par année (basée sur le titre)

In [8]:
import re

# === Dossiers ===

#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

yearly_data_file = output_dir / "movies_by_year.parquet"
meta_file = output_dir / "movies_by_year_meta.json"

# === Lecture du nombre total de films via anlytics ===

# analytics = client.get_analytics()

api_movie_count = analytics.movie_count

# === Lecture du cache s'il existe ===

if meta_file.exists():
    with open(meta_file, 'r') as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
else:
    cached_movie_count = 0   

# === Utilisation du cache ou recalcul ===

if yearly_data_file.exists() and cached_movie_count == api_movie_count:
    print("chargement des données depuis le cache ...")
    df_yearly = pd.read_parquet(yearly_data_file)
else:
    print("Extraction des données depuis l'API...")
   

    
    # Initialisation 
    year_counter = Counter()
    limit = 200
    skip = 0
    year_pattern = re.compile(r"\((\d{4})\)$")
    while True:
        batch = client.list_movies(limit=limit, skip=skip, output_format='dict')
        if not batch:
            break
        for movie in batch:
            title = movie.get("title", "")
            match = year_pattern.search(title)
            if match:
                year = int(match.group(1))
                year_counter[year] += 1
        skip += limit
        time.sleep(0.5)

    # === Construction du DataFrame ===

    df_yearly = pd.DataFrame(
    sorted(year_counter.items()),
    columns=["year", "movie_count"]
)

  # === Sauvegarde du cache ===

    df_yearly.to_parquet(yearly_data_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count}, f)

 # Affichage plotly


fig = px.bar(
        df_yearly,
        x='year',
        y='movie_count',
        title="Nombre total de films par année (basé sur le titre)",
        labels={"year": "Année", "movie_count": "Nombre de films"},
       
    )

fig.update_layout(
        xaxis_title="Année",
        yaxis_title="Nombre de films",
        height=500
    )

fig.show()
save_fig(fig, "movies_by_year")


chargement des données depuis le cache ...


HTML sauvegardé: movies_by_year.html
PNG sauvegardé: movies_by_year.png
PNG sauvegardé: movies_by_year.png


## Top 20 des films par nombre d'évaluations

In [9]:
# === Dossiers ===

#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

from collections import defaultdict


top_movie_file = output_dir / "top_movies_by_ratings.parquet"
meta_file = output_dir / "meta_top_movies.json"

# === Récuperation des métriques API ===

# analytics = client.get_analytics()

api_movie_count = analytics.movie_count
api_rating_count = analytics.rating_count

# === Vérification du cache ===

if meta_file.exists():
    with open(meta_file, 'r') as f:
        meta = json.load(f)
    cached_movie_count = meta.get("movie_count", 0)
    cached_rating_count = meta.get("rating_count", 0)
else:
    cached_movie_count = 0
    cached_rating_count = 0

# === Utilisation du cache ou recalcul ===

if (top_movie_file.exists()
    and cached_movie_count == api_movie_count
    and cached_rating_count == api_rating_count
    ):
    print("Chargement des données depuis le cache...")
    top_movies_df = pd.read_parquet(top_movie_file)
   
else:
    print("Récuperation des évaluations depuis l'API...")
    
    # Initialisation des compteurs ===
    movie_rating_count = defaultdict(int)
    movie_rating_sum = defaultdict(float)

    # === Batching des ratings ===
    
    limit = 200
    skip = 0
    
    while True:
        batch = client.list_ratings(limit=limit, skip=skip, output_format='dict')
        if not batch:
            break
        for rating in batch:
            movie_id = rating["movieId"]
            score = rating["rating"]
            movie_rating_count[movie_id] += 1
            movie_rating_sum[movie_id] += score
        skip += limit
        time.sleep(0.5)


    # === Construction du DataFrame ===

    stats = [
        {
            "movieId": movie_id,
            "rating_count": movie_rating_count[movie_id],
            "avg_rating": movie_rating_sum[movie_id] / movie_rating_count[movie_id]
        }
        for movie_id in movie_rating_count
    ]
    stats_df = pd.DataFrame(stats)
    top_movies_df = stats_df.sort_values("rating_count", ascending=False).head(20)

    # === Ajout des titre de films via l'API ===

    movie_titles = {}

    for movie_id in top_movies_df["movieId"]:
        try:
            movie_data = client.get_movie(movie_id)
            movie_titles[movie_id] = movie_data.title
        except Exception as e:
            print(f"Erreur récuperation titre movieId {movie_id} : {e}")
            movie_titles[movie_id] = f"Movie {movie_id}"
        
    top_movies_df["title"] = top_movies_df["movieId"].map(movie_titles)

    # === Sauvegarde dans le cache ===

    top_movies_df.to_parquet(top_movie_file, index=False)
    with open(meta_file, "w") as f:
        json.dump({"movie_count": api_movie_count,
                  "rating_count": api_rating_count
                  },
                f)

 # === Affichage avec plotly ===


fig = px.bar(
        top_movies_df.sort_values("rating_count", ascending=True), # Pour affichage de bas en haut
        x="rating_count",
        y="title",
        color="avg_rating",
        orientation="h",
        title="Top 20 des films par nombre d'évaluations",
        labels={"title": "Titre du film", "rating_count": "Nombre d'évaluations", "avg_rating": "Note moyenne"},
        color_continuous_scale="viridis"
       
)

fig.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        height=700
    )

fig.show()
save_fig(fig, "top_movies_by_ratings")


Chargement des données depuis le cache...


HTML sauvegardé: top_movies_by_ratings.html
PNG sauvegardé: top_movies_by_ratings.png
PNG sauvegardé: top_movies_by_ratings.png


In [10]:
top_movies_df

Unnamed: 0,movieId,rating_count,avg_rating,title
0,356,329,4.164134,Forrest Gump (1994)
1,318,317,4.429022,"Shawshank Redemption, The (1994)"
2,296,307,4.197068,Pulp Fiction (1994)
3,593,279,4.16129,"Silence of the Lambs, The (1991)"
4,2571,278,4.192446,"Matrix, The (1999)"
5,260,251,4.231076,Star Wars: Episode IV - A New Hope (1977)
6,480,238,3.75,Jurassic Park (1993)
7,110,237,4.031646,Braveheart (1995)
8,589,224,3.970982,Terminator 2: Judgment Day (1991)
9,527,220,4.225,Schindler's List (1993)


## Top tags utilisés par les utilisateurs de la plateforme

In [11]:
#output_dir = Path("output")
#output_dir.mkdir(exist_ok=True)

tag_usage_file = output_dir / "user_tag_stats.parquet"
meta_file = output_dir / "meta_users_behavior.json"


# Recupération des métriques d'API pour surveiller les changements :

# analytics = client.get_analytics()

api_rating_count = analytics.rating_count
api_tag_count = analytics.tag_count

if meta_file.exists():
    with open(meta_file, 'r') as f:
        meta = json.load(f)
else:
    meta = {}

cached_rating_count = meta.get("rating_count", 0)
cached_tag_count = meta.get("tag_count", 0)

# tags souvent utilisés par certains utilisateurs

if tag_usage_file.exists() and cached_tag_count == api_tag_count:
    print("Chargement des données depuis le cache : tag utilsés")
    tag_usage_df = pd.read_parquet(tag_usage_file)
else:
    print("Récuperation des tags depuis l'API...")
    
    tag_counter = Counter()
    limit = 200
    skip = 0
    while True:
        batch = client.list_tags(limit=limit, skip=skip, output_format='dict')
        if not batch:
            break
        for tag in batch:
            tag_text = tag.get("tag", "")
            tag_counter[tag_text] += 1
        skip += limit
        time.sleep(0.5)

    # Construction du DataFrame

    tag_usage_data = [
        {"tag": tag, "count": count}
        for tag, count in tag_counter.items()
    ]
    tag_usage_df = pd.DataFrame(tag_usage_data)
    tag_usage_df = tag_usage_df[tag_usage_df['tag'] != '']  # Exclure les tags vides
    tag_usage_df = tag_usage_df.sort_values(by='count', ascending=False).head(20)

    # Sauvegarde du cache
    tag_usage_df.to_parquet(tag_usage_file, index=False)
    
fig4 = px.bar(
        tag_usage_df,
        x='count',
        y='tag',
        orientation='h',
        title='Top tags les plus utilisés par les utilisateurs',
        labels={'count': 'Nombre d\'utilisations', 'tag': 'Tag'},
        color='count',
        color_continuous_scale='Viridis'
    )
fig4.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        height=500
)

fig4.show()
save_fig(fig4, "top_tags")

with open(meta_file, 'w') as f:
    json.dump({"rating_count": api_rating_count,
               "tag_count": api_tag_count
              }, f)


Chargement des données depuis le cache : tag utilsés


HTML sauvegardé: top_tags.html
PNG sauvegardé: top_tags.png
PNG sauvegardé: top_tags.png


In [12]:
tag_usage_df

Unnamed: 0,tag,count
0,In Netflix queue,122
1,atmospheric,24
2,Disney,22
3,aliens,14
4,superhero,12
5,religion,12
6,crime,12
7,boxing,11
8,black comedy,11
9,action,11


## Autres insights sur les tags

In [13]:
import pickle
import os

analytics_path = os.path.join(output_dir, "analytics.pkl")
tags_by_genre_path = os.path.join(output_dir, "tags_by_genre.pkl")
tags_good_rating_path = os.path.join(output_dir, "tags_good_ratings.pkl")
tags_compare_path = os.path.join(output_dir, "tags_compare.pkl")

# Recupération les statistiques actuelles de l'API

currents_stats = client.get_analytics().__dict__

# Fonction utilitaire pour charger ou recalculer les données avec mise en cache
def use_or_generate(path, current_stats, compute_func): 
    if os.path.exists(path) and os.path.exists(analytics_path):
        with open(analytics_path, 'rb') as f:
            saved_stats = pickle.load(f)
        if saved_stats == current_stats:
            return pd.read_parquet(path)
    df = compute_func()
    df.to_parquet(path, index=False)
    with open(analytics_path, 'wb') as f:
        pickle.dump(current_stats, f)
    return df


# 1. Tags les plus utilisés par genre

def compute_tags_by_genre():
    tag_genre_counter = defaultdict(Counter)
    #Chargement par lots
    limit = 200
    skip = 0
    while True:
        movies = client.list_movies(limit=limit, skip=skip, output_format='dict')
        if not movies:
            break
        movie_dict = {m["movieId"] : m["genres"].split('|') if m["genres"] else [] for m in movies}
        tags = client.list_tags(limit=limit, skip=skip, output_format='dict')
        for tag in tags:
            genres = movie_dict.get(tag["movieId"], [])
            for genre in genres:
                tag_genre_counter[genre][tag["tag"]] += 1
        skip += limit
        time.sleep(0.5)
    records = []
    for genre, tag_counter in tag_genre_counter.items():
        for tag, count in tag_counter.items():
            records.append({"genre": genre, "tag": tag, "count": count})
    df = pd.DataFrame(records)
    df = df.sort_values(["genre", "count"], ascending=[True, False])
    return df
    


In [14]:
tags_by_genre_df = use_or_generate(tags_by_genre_path, currents_stats, compute_tags_by_genre)

# Top 3 tags par genre
top_tags_by_genre = tags_by_genre_df.groupby('genre').apply(lambda g: g.nlargest(3, 'count')).reset_index(drop=True)

# Concatène genre + tag pour libellé
top_tags_by_genre['tag_label'] = top_tags_by_genre["tag"] + " (" + top_tags_by_genre["genre"] + ")"
tag_usage_df





Unnamed: 0,tag,count
0,In Netflix queue,122
1,atmospheric,24
2,Disney,22
3,aliens,14
4,superhero,12
5,religion,12
6,crime,12
7,boxing,11
8,black comedy,11
9,action,11


In [15]:
fig = px.bar(
    top_tags_by_genre.sort_values("count"),
    x="count",
    y="tag_label",
    color="genre",
    orientation="h",
    title="Top 3 des tags les plus utilisés par genre",
    labels={"count": "Nombre d'utilisations", "tag_label": "Tag (Genre)"},
    height=800
)
fig.update_layout(
        yaxis=dict(categoryorder='total ascending')
    )
fig.show()

In [16]:
# --------------------------------------------------------
# 2. Tags les plus frequents associés aux films avec de bonnes notes (>=4.0)

# -----------------------------------------------

def compute_tags_for_good_ratings():
    good_ratings = []
    tags_by_movie = defaultdict(list)

    # Charger les ratings >= 4
    skip = 0
    limit = 200
    while True:
        ratings = client.list_ratings(limit=limit, skip=skip, output_format='dict')
        if not ratings:
            break
        good_ratings += [ r for r in ratings if r["rating"] >= 4.0 ]
        skip += limit
        time.sleep(0.5)
    
    # Associer les tags aux films bien notés (Movieid bien noté)

    movie_ids = set([r["movieId"] for r in good_ratings])
    skip = 0
    limit = 200
    while True:
        tags = client.list_tags(limit=limit, skip=skip, output_format='dict')
        if not tags:
            break
        for tag in tags:
            if tag["movieId"] in movie_ids:
                tags_by_movie[tag["movieId"]].append(tag["tag"])
        skip += limit
        time.sleep(0.5)
    df = pd.DataFrame([(tag, len(movie)) for tag, movie in tags_by_movie.items()], columns=["tag", "count"])
    df = df.sort_values("count", ascending=False).head(20)
    return df

tags_good_ratings_df = use_or_generate(tags_good_rating_path, currents_stats, compute_tags_for_good_ratings)

tags_good_ratings_df


Unnamed: 0,tag,count
0,260,10
1,4878,6
2,296,5
3,750,5
4,4226,5
5,7361,5
6,79132,4
7,2959,4
8,541,4
9,8641,4


In [17]:
# Visulalisation : Tags les plus frquents dans films bien notés

fig2 = px.bar(
    tags_good_ratings_df,
    x='count',
    y='tag',
    orientation='h',
    title='Tags les plus fréquents dans les films bien notés (>=4.0)',
    labels={'count': 'Nombre de films bien notés', 'tag': 'Tag'},
    color='count',
    color_continuous_scale='Viridis'
)
fig2.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    height=500
)
fig2.show()

In [18]:
# ------------------------------------------------
# 3. Comparaison des tags entre films bien notés (>=4.0) et mal notés (<4.0)
# -----------------------------------------------

def compute_tags_compaison():
    tags_counter_good = Counter()
    tags_counter_bad = Counter()

    # Chargement par lots
    limit = 200
    skip = 0
    rating_map = {}
    while True:
        ratings = client.list_ratings(limit=limit, skip=skip, output_format='dict')
        if not ratings:
            break
        for r in ratings:
            rating_map[r["movieId"]] = rating_map.get(r["movieId"], []) + [r["rating"]]
        skip += limit
        time.sleep(0.5)
    
    # Moyenne par film
    avg_rating_map = {mid: sum(ratings)/len(ratings) for mid, ratings in rating_map.items()}

    # Tags par lots
    skip = 0

    while True:
        tags = client.list_tags(limit=limit, skip=skip, output_format='dict')
        if not tags:
            break
        for tag in tags:
            avg_rating = avg_rating_map.get(tag["movieId"])
            if avg_rating is not None:
                if avg_rating >= 4.0:
                    tags_counter_good[tag["tag"]] += 1
                elif avg_rating < 4.0:
                    tags_counter_bad[tag["tag"]] += 1
            else:
                tags_counter_bad[tag["tag"]] += 1
        skip += limit
        time.sleep(0.5)
    
    tags = set(tags_counter_good.keys()).union(set(tags_counter_bad.keys()))
    data = []
    for tag in tags:
        data.append({
            "tag": tag,
            "good_count": tags_counter_good.get(tag, 0),
            "bad_count": tags_counter_bad.get(tag, 0)
        })
    df = pd.DataFrame(data)
    df["total_count"] = df["good_count"] + df["bad_count"]
    df = df[df["total_count"] > 5].sort_values("total_count", ascending=False).head(20)

    return df
tags_compare_df = use_or_generate(tags_compare_path, currents_stats, compute_tags_compaison)
tags_compare_df

Unnamed: 0,tag,good_count,bad_count,total_count
599,In Netflix queue,50,72,122
233,atmospheric,11,13,24
639,Disney,0,22,22
97,aliens,0,14,14
443,superhero,0,12,12
591,crime,2,10,12
735,religion,3,9,12
394,boxing,4,7,11
426,black comedy,7,4,11
473,politics,1,10,11


In [19]:
# Graphique de comparaison des tags : Films bien notés vs mal notés

fig3 = px.bar(
    tags_compare_df.melt(id_vars='tag', value_vars=['good_count', 'bad_count'],
                          var_name='rating_category', value_name='count').sort_values('count'),
    x='count',
    y='tag',
    barmode='group',
    title='Comparaison des tags : Films bien notés vs mal notés',
    labels={'count': "Nombre d'occurrences", 'tag': 'Tag'}
)
fig3.update_layout(
    yaxis={'categoryorder': 'total ascending'}, height=600
)
fig3.show()
