In [1]:
import pandas as pd
import difflib
import unicodedata
import re
import requests
import time
import os

In [2]:
# Load the IMDb base table and the larger title/description dataset.
base, large = (
    pd.read_csv(csv_name)
    for csv_name in ("IMDb_All_Genres_etf_clean1.csv", "dataset_titre_description.csv")
)

In [3]:
def normalize_title(s):
    """Build a comparison key from a film title.

    The title is trimmed, lower-cased and stripped of combining accent marks;
    parenthesised parts are removed, every character outside ``a-z``, ``0-9``
    and whitespace becomes a space, and whitespace runs are collapsed.

    Args:
        s: Title value; a missing value (``pd.isna``) yields ``""``.

    Returns:
        The normalized title as a string (possibly empty).
    """
    if pd.isna(s):
        return ""
    decomposed = unicodedata.normalize('NFD', str(s).strip().lower())
    # After NFD decomposition accents are separate 'Mn' code points: drop them.
    cleaned = ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )
    # Order matters: parentheses must be removed before punctuation is blanked.
    for pattern, replacement in (
        (r'\(.*?\)', ''),
        (r'[^a-z0-9\s]', ' '),
        (r'\s+', ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [4]:
# Lower-cased view of the column names so the lookup is case-insensitive.
large_cols = [c.lower() for c in large.columns]

# Find the title column: first known name present in `large` wins.
title_col = None
for opt in ["title","movie_title","name","original_title","primaryTitle"]:
    # Compare in lower case on both sides; a mixed-case option such as
    # "primaryTitle" could otherwise never match the lower-cased names.
    opt_lower = opt.lower()
    if opt_lower in large_cols:
        title_col = large.columns[large_cols.index(opt_lower)]
        break

In [6]:
# Pick the first known description-like column present in `large`
# (names are compared against the lower-cased `large_cols`); None if absent.
desc_col = next(
    (
        large.columns[large_cols.index(candidate)]
        for candidate in ("overview", "description", "plot", "summary", "synopsis")
        if candidate in large_cols
    ),
    None,
)

In [7]:
# Report which columns were detected for the title and the description.
print(f"Titre = {title_col}")
print(f"Description = {desc_col}")

Titre = original_title
Description = overview


In [8]:
# Add the normalized-title key used to join the two datasets.
base["title_norm"] = base["Movie_Title"].map(normalize_title)
large["title_norm"] = large[title_col].map(normalize_title)

In [9]:
# One row per normalized title (first occurrence kept), then a
# normalized-title -> description lookup.
large_small = large.loc[:, ["title_norm", desc_col]].drop_duplicates(subset=["title_norm"])
large_dict = large_small.set_index("title_norm")[desc_col].to_dict()

In [16]:
# Attach a description to every film of `base`: exact lookup on the normalized
# title first, then a fuzzy fallback through difflib.
base["description"] = ""
base["match_type"] = ""

# Empty keys are excluded: a title that normalizes to "" (nothing left after
# the a-z/0-9 filter) carries no information and must not match anything.
candidates = [t for t in large_small["title_norm"] if t]
exact = 0
fuzzy = 0
unmatched = []

for idx, row in base.iterrows():
    tnorm = row["title_norm"]
    matched_key = None
    match_type = ""

    if tnorm and tnorm in large_dict:
        matched_key, match_type = tnorm, "exact"
    elif tnorm:
        matches = difflib.get_close_matches(tnorm, candidates, n=1, cutoff=0.86)
        if matches:
            matched_key = matches[0]
            score = difflib.SequenceMatcher(None, tnorm, matched_key).ratio()
            match_type = f"fuzzy:{score:.2f}"

    description = large_dict.get(matched_key) if matched_key is not None else None
    # A matched title whose description is missing is still a film without
    # description: report it as unmatched instead of storing NaN.
    if description is None or pd.isna(description):
        unmatched.append(row["Movie_Title"])
        continue

    base.at[idx, "description"] = description
    base.at[idx, "match_type"] = match_type
    if match_type == "exact":
        exact += 1
    else:
        fuzzy += 1

In [11]:
# Summary of the matching step.
print(f"Films totaux : {len(base)}")
print(f"Match exact  : {exact}")
print(f"Match fuzzy  : {fuzzy}")
print(f"Non matchés  : {len(unmatched)}")

Films totaux : 5562
Match exact  : 598
Match fuzzy  : 17
Non matchés  : 185


In [12]:
# Persist the enriched base table and the list of titles left without a match.
base.to_csv("IMDb_descriptions.csv", index=False)
pd.DataFrame({"unmatched_titles": unmatched}).to_csv("unmatched_titles.csv", index=False)

In [13]:
# Display the titles for which no description was found.
unmatched

['Kantara',
 'Seppuku',
 'Shichinin no samurai',
 'Kaithi',
 'Asuran',
 'Sita Ramam',
 'Vikram',
 'Spider-Man: Into the Spider-Verse',
 'Avengers: Endgame',
 'Avengers: Infinity War',
 'Top Gun: Maverick',
 'K.G.F: Chapter 2',
 'Shershaah',
 'Oldeuboi',
 'Mononoke-hime',
 'Vikram Vedha',
 'Dangal',
 'Spider-Man: No Way Home',
 'Star Wars: Episode VI - Return of the Jedi',
 'Major',
 '1917',
 'Uri: The Surgical Strike',
 'K.G.F: Chapter 1',
 'Dag II',
 'Gangs of Wasseypur',
 'Kimetsu no Yaiba: Mugen Ressha-Hen',
 'Ran',
 'Yôjinbô',
 'Everything Everywhere All at Once',
 'Ford v Ferrari',
 'Sholay',
 'Kakushi-toride no san-akunin',
 'RRR (Rise Roar Revolt)',
 'Arjun Reddy',
 'Kaththi',
 '1 - Nenokkadine',
 'Thuppakki',
 'Blade Runner 2049',
 'Tropa de Elite 2: O Inimigo Agora é Outro',
 'Karthikeya 2',
 "Zack Snyder's Justice League",
 'Ip Man',
 'Dune: Part One',
 'Taegukgi hwinalrimyeo',
 'Mou gaan dou',
 'Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni',
 'Akira',
 'Sanjuro

### Appeler l’API OMDb pour chaque titre manquant

In [14]:
# Missing descriptions are completed through the OMDb API (Open Movie Database).
# NOTE(review): the matching step earlier in this notebook writes
# "IMDb_descriptions.csv", not the file named here; confirm the input file.
INPUT_CSV  = "IMDB_with_descriptions.csv"
OUTPUT_CSV = "dataset_plot_final.csv"
# The key must not live in the notebook: it is read from the OMDB_API_KEY
# environment variable (empty string when the variable is not set).
API_KEY = os.environ.get("OMDB_API_KEY", "")
SLEEP = 0.3       # seconds passed to time.sleep between two requests
SAVE_EVERY = 50   # write an intermediate CSV every SAVE_EVERY requests

In [15]:
def get_key():
    """Return the OMDb API key with surrounding whitespace removed.

    Sources are tried in order: the module-level ``API_KEY``, then the
    ``OMDB_API_KEY`` environment variable, then an interactive prompt.
    """
    configured = API_KEY or os.environ.get("OMDB_API_KEY")
    if configured:
        return configured.strip()
    return input("Colle ta clé OMDb ici : ").strip()

def fetch_plot(title, year, key):
    """Fetch the short plot of a film from the OMDb API.

    Args:
        title: Film title sent as the ``t`` query parameter. A missing or
            empty title returns ``None`` without issuing a request.
        year: Optional release year; when it is not missing and converts to
            an integer it is sent as the ``y`` parameter.
        key: OMDb API key sent as the ``apikey`` parameter.

    Returns:
        The plot text, or ``None`` when the title is missing, the request or
        JSON decoding fails, the API does not answer ``Response == "True"``,
        or the plot is empty / ``"N/A"``.
    """
    if pd.isna(title) or not title:
        return None
    params = {"t": title, "apikey": key, "plot": "short"}
    if pd.notna(year):
        try:
            params["y"] = str(int(year))
        except (TypeError, ValueError, OverflowError):
            # Year cannot be converted to an integer: query by title only.
            pass
    try:
        # HTTPS so that the API key does not travel in clear text.
        r = requests.get("https://www.omdbapi.com/", params=params, timeout=6)
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Best effort: report and carry on. Only the exception type is shown
        # because request error messages can embed the URL, hence the API key.
        print(f"OMDb request failed for {title!r}: {type(exc).__name__}")
        return None
    if isinstance(data, dict) and data.get("Response") == "True":
        plot = data.get("Plot")
        if plot and plot != "N/A":
            return plot
    return None

In [None]:
# Resolve the API key first and stop before any file access if none is given.
key = get_key()
if not key:
    raise SystemExit("Aucune clé fournie.Définit API_KEY ou OMDB_API_KEY.")

# Table whose empty descriptions will be completed through OMDb.
df = pd.read_csv(INPUT_CSV)

In [None]:
# S'assurer que la colonne 'description' existe
if "description" not in df.columns:
    if "description_found" in df.columns:
        df.rename(columns={"description_found":"description"}, inplace=True)
    else:
        df["description"] = ""

total = len(df)
to_fill = df["description"].astype(str).str.strip().apply(lambda x: x == "").sum()
print(f"Total lignes: {total} — descriptions à compléter: {to_fill}")

count = 0
for idx in df.index:
    if str(df.at[idx, "description"]).strip():
        continue  # déjà rempli -> on saute
    title = df.at[idx, "Movie_Title"] if "Movie_Title" in df.columns else df.at[idx, "movie_title"]
    year = df.at[idx, "Year"] if "Year" in df.columns else None
    plot = fetch_plot(title, year, key)
    if plot:
        df.at[idx, "description"] = plot
        print(f"[{idx}] trouvé: {title}")
    else:
        df.at[idx,"description"] = ""  # on laisse vide si non trouvé
        print(f"[{idx}] non trouvé: {title}")
    count += 1
    if count % SAVE_EVERY == 0:
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"  sauvegarde intermédiaire après {count} requêtes -> {OUTPUT_CSV}")
    time.sleep(SLEEP)

# Fusion et Complétion des Données Manquantes

In [19]:
# Main dataset and the complementary file used to fill its gaps.
df_principal, df_complement = (
    pd.read_csv(csv_name)
    for csv_name in ("movies_dataset+img_and_synopsis.csv", "missing_part.csv")
)

# Missing poster/plot counts in the main dataset before completion.
print("Valeurs manquantes AVANT :")
print(df_principal.loc[:, ['poster_url', 'plot']].isna().sum())


Valeurs manquantes AVANT :
poster_url    417
plot          431
dtype: int64


In [20]:
# Missing poster/plot counts in the complementary dataset.
print("Valeurs manquantes AVANT :")
print(df_complement.loc[:, ['poster_url', 'plot']].isna().sum())

Valeurs manquantes AVANT :
poster_url    11
plot           9
dtype: int64


In [21]:
# Fill missing poster_url / plot values of df_principal from df_complement,
# matching films on Movie_Title. As before, the first complement row found for
# a title is the one used; a per-title lookup replaces the former scan of
# df_complement for every incomplete row.
complement_by_title = (
    df_complement
    .dropna(subset=['Movie_Title'])
    .drop_duplicates(subset=['Movie_Title'])
    .set_index('Movie_Title')
)

for column in ('poster_url', 'plot'):
    missing = df_principal[column].isnull()
    replacement = df_principal.loc[missing, 'Movie_Title'].map(complement_by_title[column])
    # Only overwrite where the complement actually provides a value.
    found = replacement.dropna()
    df_principal.loc[found.index, column] = found

In [22]:
# Missing poster/plot counts in the main dataset after completion.
print()
print("Valeurs manquantes APRES :")
print(df_principal.loc[:, ['poster_url', 'plot']].isna().sum())


Valeurs manquantes APRES :
poster_url    86
plot          98
dtype: int64


In [23]:
# Drop the imdb_id column and keep the column counts before and after.
colonnes_avant = df_principal.shape[1]
# errors='ignore' keeps the cell re-runnable once 'imdb_id' is already gone.
df_principal = df_principal.drop(columns='imdb_id', errors='ignore')
colonnes_apres = df_principal.shape[1]

In [24]:
# Save the completed dataset (without the index column).
df_principal.to_csv("dataset_complet.csv",index=False)

# 1: NETTOYAGE DES COLONNES TEXTE

In [25]:
# Remove leading/trailing whitespace from the titles.
df_principal['Movie_Title'] = df_principal['Movie_Title'].str.strip()

In [26]:
# Remove the literal 'Directors:' label, then trim surrounding whitespace.
df_principal['Director'] = (
    df_principal['Director']
    .str.replace('Directors:', '', regex=False)
    .str.strip()
)

In [27]:
# Remove leading/trailing whitespace from the actors column.
df_principal['Actors'] = df_principal['Actors'].str.strip()

In [28]:
# Trim the plot text, then remove every double-quote character.
df_principal['plot'] = (
    df_principal['plot']
    .str.strip()
    .str.replace('"', '', regex=False)
)

# Traitement de la colonne Total_Gross

In [29]:
# Both spellings found in the data (including the 'Unkown' typo) map to 'Unknown'.
df_principal['Total_Gross'] = df_principal['Total_Gross'].replace(
    ['Gross Unkown', 'Gross Unknown'], 'Unknown'
)

In [30]:
def convert_gross_value(value):
    """Convert a gross value such as ``'$534.86M'`` to a float.

    Args:
        value: Raw cell value. Strings have ``'$'`` and ``'M'`` removed before
            conversion. Values that are already numeric are passed through,
            so applying the function twice does not erase the data.

    Returns:
        The numeric value as a float, or ``None`` for ``'Unknown'``, missing
        values, non-string/non-numeric inputs and unparseable strings.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        # Already converted (e.g. the cell was run a second time).
        return None if pd.isna(value) else float(value)
    if not isinstance(value, str) or value == 'Unknown':
        return None
    try:
        return float(value.replace('$', '').replace('M', ''))
    except ValueError:
        return None

# Convert every Total_Gross cell with convert_gross_value.
df_principal['Total_Gross'] = df_principal['Total_Gross'].apply(convert_gross_value)


In [31]:
# Show the first five converted gross values.
print("Exemple: {}".format(df_principal['Total_Gross'].iloc[:5].tolist()))

Exemple: [nan, 534.86, 377.85, 292.58, 342.55]


In [32]:
#nettoyer les espaces
df_principal['main_genre'] = df_principal['main_genre'].str.strip()

df_principal['side_genre'] = df_principal['side_genre'].str.strip()
df_principal['side_genre'] = df_principal['side_genre'].str.replace(', ', ',', regex=False)
df_principal['side_genre'] = df_principal['side_genre'].str.replace(' ,', ',', regex=False)

In [33]:
# Range of the ratings and number of missing ratings.
print("Plage des ratings : {} - {}".format(df_principal['Rating'].min(), df_principal['Rating'].max()))
print("Ratings manquants : {}".format(df_principal['Rating'].isna().sum()))

Plage des ratings : 1.0 - 9.3
Ratings manquants : 0


In [34]:
# Convert Year to numeric; values that cannot be parsed become NaN.
df_principal['Year'] = pd.to_numeric(df_principal['Year'], errors='coerce')

In [35]:
# Convert the runtime to numeric; values that cannot be parsed become NaN.
df_principal['Runtime(Mins)'] = pd.to_numeric(df_principal['Runtime(Mins)'], errors='coerce')


In [36]:
# Number of missing values in Year, then in Runtime(Mins).
print("Valeurs problématiques : {}".format(df_principal['Year'].isna().sum()))
print("Valeurs problématiques : {}".format(df_principal['Runtime(Mins)'].isna().sum()))

Valeurs problématiques : 0
Valeurs problématiques : 0


In [37]:
# List every column that still has missing values, with count and percentage.
print("Valeurs manquantes par colonne :")
for colonne in df_principal.columns:
    manquants = df_principal[colonne].isna().sum()
    if manquants <= 0:
        continue
    pourcentage = (manquants / len(df_principal)) * 100
    print("  {}: {} ({:.1f}%)".format(colonne, manquants, pourcentage))

Valeurs manquantes par colonne :
  Total_Gross: 861 (15.5%)
  poster_url: 86 (1.5%)
  plot: 98 (1.8%)


In [38]:
# Save the cleaned dataset (without the index column).
df_principal.to_csv("data_complet.csv",index=False)

In [39]:
# Drop the rows lacking any of the critical fields; keep the row counts
# before and after for reporting.
colonnes_critiques = ['Movie_Title','Year','Rating']
lignes_avant = df_principal.shape[0]
df_principal = df_principal.dropna(axis=0, how='any', subset=colonnes_critiques)
lignes_apres = df_principal.shape[0]

##### Supprimer les lignes sans titre, année ou rating.


In [40]:
# Report how many rows were dropped and how many films remain.
print("Lignes supprimées (données critiques manquantes) : {}".format(lignes_avant - lignes_apres))
print("Films conservés pour l'application : {}".format(lignes_apres))

Lignes supprimées (données critiques manquantes) : 0
Films conservés pour l'application : 5562


In [43]:
# Display the final dataset.
df_principal

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre,poster_url,plot
0,Kantara,2022,Rishab Shetty,"Rishab Shetty, Sapthami Gowda, Kishore Kumar G...",9.3,148,UA,,Action,"Adventure, Drama",https://m.media-amazon.com/images/M/MV5BY2VkZj...,"When greed paves the way for betrayal, schemin..."
1,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",9.0,152,UA,534.86,Action,"Crime, Drama",https://m.media-amazon.com/images/M/MV5BMTMxNT...,When a menace known as the Joker wreaks havoc ...
2,The Lord of the Rings: The Return of the King,2003,Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen, Or...",9.0,201,U,377.85,Action,"Adventure, Drama",https://m.media-amazon.com/images/M/MV5BMTZkMj...,Gandalf and Aragorn lead the World of Men agai...
3,Inception,2010,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",8.8,148,UA,292.58,Action,"Adventure, Sci-Fi",https://m.media-amazon.com/images/M/MV5BMjAxMz...,A thief who steals corporate secrets through t...
4,The Lord of the Rings: The Two Towers,2002,Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Or...",8.8,179,UA,342.55,Action,"Adventure, Drama",https://m.media-amazon.com/images/M/MV5BMGQxMD...,While Frodo and Sam edge closer to Mordor with...
...,...,...,...,...,...,...,...,...,...,...,...,...
5557,Disaster Movie,2008,"Jason Friedberg, Aaron Seltzer","Carmen Electra, Vanessa Lachey, Nicole Parker,...",1.9,87,PG-13,14.19,Comedy,Sci-Fi,https://m.media-amazon.com/images/M/MV5BMTIzMD...,"Over the course of one evening, an unsuspectin..."
5558,The Hottie & the Nottie,2008,Tom Putnam,"Paris Hilton, Joel David Moore, Christine Laki...",1.9,91,PG-13,0.03,Comedy,Romance,https://m.media-amazon.com/images/M/MV5BMTA2ND...,A woman agrees to go on a date with a man only...
5559,From Justin to Kelly,2003,Robert Iscove,"Kelly Clarkson, Justin Guarini, Katherine Bail...",1.9,81,PG,4.92,Comedy,"Musical, Romance",https://m.media-amazon.com/images/M/MV5BOTUzM2...,A waitress from Texas and a college student fr...
5560,Superbabies: Baby Geniuses 2,2004,Bob Clark,"Jon Voight, Scott Baio, Vanessa Angel, Skyler ...",1.5,88,PG,9.11,Comedy,"Family, Sci-Fi",https://m.media-amazon.com/images/M/MV5BNjY4Nj...,A group of smart-talking toddlers find themsel...
