In [252]:
import pandas as pd
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # ou un autre modèle selon votre cas
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime



In [253]:
mlflow.set_experiment("Film Box Office Prediction")
df1 = pd.read_csv("films.csv")

In [254]:
print(df1.head())

   film_id                                              titre  \
0    20156                  L'Extraordinaire Voyage de Marona   
1    19778                        Les Filles du Docteur March   
2    18940  Birds of Prey et la fabuleuse histoire de Harl...   
3    18815                                 Sonic, le hérisson   
4    21042                                          Uncharted   

   genre_principale date_sortie_france date_sortie_usa  \
0         Animation         08/01/2020      00/00/0000   
1  Adaptation Livre         01/01/2020      25/12/2019   
2     Adaptation BD         05/02/2020      07/02/2020   
3  Adapt. Jeu Vidéo         12/02/2020      14/02/2020   
4  Adapt. Jeu Vidéo         16/02/2022      18/02/2022   

                                           image_url  \
0  https://www.jpbox-office.com/cinema/images/pos...   
1  https://www.jpbox-office.com/cinema/images/pos...   
2  https://www.jpbox-office.com/cinema/images/pos...   
3  https://www.jpbox-office.com/cine

In [255]:
print(f"Nombre total de films dans le CSV: {len(df1)}")

Nombre total de films dans le CSV: 2062


Création d un df a part pour les lignes ou la terget est null

In [256]:
mask_invalid = df1['entrees_demarrage_france'].isna()
invalid_entries = df1[mask_invalid]

In [257]:
# Nombre de lignes où la variable cible est vide (NaN)
nb_lignes_target_vide = df1['entrees_demarrage_france'].isna().sum()
print(f"Nombre de lignes où entrees_demarrage_france est vide : {nb_lignes_target_vide}")

# Pourcentage de valeurs manquantes
pourcentage_manquant = (nb_lignes_target_vide / len(df1)) * 100
print(f"Pourcentage de valeurs manquantes : {pourcentage_manquant:.2f}%")

Nombre de lignes où entrees_demarrage_france est vide : 327
Pourcentage de valeurs manquantes : 15.86%


In [258]:
print(f"\nNombre de lignes avec entrées au démarrage manquants: {mask_invalid.sum()} sur {len(df1)}")


Nombre de lignes avec entrées au démarrage manquants: 327 sur 2062


Retire les lignées ou la target est null du df

In [259]:
df = df1[~mask_invalid]

In [260]:
print(f"Nombre de valeurs manquantes dans la cible après filtrage: {df['entrees_demarrage_france'].isna().sum()}")

Nombre de valeurs manquantes dans la cible après filtrage: 0


In [261]:
print(f"Nombre total de films dans le CSV: {len(df)}")

Nombre total de films dans le CSV: 1735


Nettoyage des colonnes 

In [262]:
def clean_monetary_value(value):
    if isinstance(value, str):
        
        value = value.replace('$', '').replace(' ', '').replace(',', '')
        
        if value in ['?', '-']:
            return np.nan
        try:
            return float(value)
        except ValueError:
            return np.nan
    return value


monetary_columns = ['budget', 'recette_usa', 'recette_reste_du_monde', 'recette_monde']
for col in monetary_columns:
    df[col] = df[col].apply(clean_monetary_value)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(clean_monetary_value)


In [263]:
df['entrees_demarrage_france'] = pd.to_numeric(df['entrees_demarrage_france'], errors='coerce')
df['entrees_totales_france'] = pd.to_numeric(df['entrees_totales_france'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['entrees_demarrage_france'] = pd.to_numeric(df['entrees_demarrage_france'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['entrees_totales_france'] = pd.to_numeric(df['entrees_totales_france'], errors='coerce')


Convertir durée du film en minutes

In [264]:
def convert_duration_to_minutes(duration):
    if isinstance(duration, str):
        hours = 0
        minutes = 0
        if 'h' in duration:
            hours_part = duration.split('h')[0].strip()
            try:
                hours = int(hours_part)
            except ValueError:
                return np.nan
        
        if 'min' in duration:
            minutes_part = duration.split('h')[-1].split('min')[0].strip()
            try:
                minutes = int(minutes_part)
            except ValueError:
                pass
        
        return hours * 60 + minutes
    return np.nan

df['duree_minutes'] = df['duree'].apply(convert_duration_to_minutes)

print(df['duree_minutes'])

0        92.0
1       135.0
2       109.0
3       100.0
4       115.0
        ...  
2053     77.0
2056    128.0
2057    195.0
2059    144.0
2061     98.0
Name: duree_minutes, Length: 1735, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duree_minutes'] = df['duree'].apply(convert_duration_to_minutes)


In [265]:
print(df['duree_minutes'])

0        92.0
1       135.0
2       109.0
3       100.0
4       115.0
        ...  
2053     77.0
2056    128.0
2057    195.0
2059    144.0
2061     98.0
Name: duree_minutes, Length: 1735, dtype: float64


Mettre la date dans le bon format 

In [266]:
def extract_date(date_str):
    if isinstance(date_str, str) and date_str != '00/00/0000':
        try:
            return datetime.strptime(date_str, '%d/%m/%Y')
        except ValueError:
            return np.nan
    return np.nan

df['date_sortie_france'] = df['date_sortie_france'].apply(extract_date)
df['date_sortie_usa'] = df['date_sortie_usa'].apply(extract_date)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_sortie_france'] = df['date_sortie_france'].apply(extract_date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_sortie_usa'] = df['date_sortie_usa'].apply(extract_date)


Mettre les colonnes dans le bon type

In [267]:
numeric_columns = [
    'entrees_demarrage_france', 'entrees_totales_france', 'budget',
    'recette_usa', 'recette_reste_du_monde', 'recette_monde'
]

for col in numeric_columns:
    # Nettoyage: supprimer $, espaces, etc.
    df[col] = df[col].astype(str).str.replace('$', '', regex=False)
    df[col] = df[col].str.replace(' ', '', regex=False)
    df[col] = df[col].str.replace(',', '', regex=False)
    
    # Remplacer les caractères spéciaux par une chaîne vide
    df[col] = df[col].str.replace('?', '', regex=False)
    df[col] = df[col].str.replace('-', '', regex=False)
    
    df[col] = pd.to_numeric(df[col], errors='coerce')

    # Nettoyage en profondeur avant conversion numérique
df['entrees_demarrage_france'] = df['entrees_demarrage_france'].astype(str)
df['entrees_demarrage_france'] = df['entrees_demarrage_france'].str.replace(' ', '')
df['entrees_demarrage_france'] = df['entrees_demarrage_france'].str.replace(',', '')
df['entrees_demarrage_france'] = pd.to_numeric(df['entrees_demarrage_france'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).str.replace('$', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].str.replace(' ', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].str.replace(',', '', regex=False)
A value is trying to be set on a copy

In [268]:
# Nombre de lignes où la variable cible est vide (NaN)
nb_lignes_target_vide = df1['entrees_demarrage_france'].isna().sum()
print(f"Nombre de lignes où entrees_demarrage_france est vide : {nb_lignes_target_vide}")

# Pourcentage de valeurs manquantes
pourcentage_manquant = (nb_lignes_target_vide / len(df1)) * 100
print(f"Pourcentage de valeurs manquantes : {pourcentage_manquant:.2f}%")

Nombre de lignes où entrees_demarrage_france est vide : 327
Pourcentage de valeurs manquantes : 15.86%


In [269]:
print(df.dtypes)

film_id                              int64
titre                               object
genre_principale                    object
date_sortie_france          datetime64[ns]
date_sortie_usa             datetime64[ns]
image_url                           object
synopsis                            object
duree                               object
note_moyenne                        object
acteurs                             object
entrees_demarrage_france           float64
entrees_totales_france             float64
budget                             float64
recette_usa                        float64
recette_reste_du_monde             float64
recette_monde                      float64
duree_minutes                      float64
dtype: object


Création du nouveau csv 

In [270]:
df.to_csv('film_v2.csv', index=False)