## Importation des packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Importation de la base de données
df = pd.read_csv('movies.csv')  
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


## Nettoyage et préparation des données

### Nettoyage des données

In [4]:
df.shape

(7668, 15)

#### Recherche et traitement des valeurs manquantes

In [5]:
# Recherche des valeurs manquantes
df.isnull().sum()

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

- Pour la variable **budget** nous allons remplacer les valeurs manquantes par 0. En effet, la présence de valeurs manquantes est du à un non financement du film.
- Pour les  variables quantitatives nous allons remplacer les valeurs manquantes par la moyenne
- Et pour les variables catégorielles nous allons remplacer les valeurs manquantes par le mode.

In [6]:
# Traitement des valeurs manquantes dans le budget
df["budget"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["budget"].fillna(0, inplace=True)


In [9]:
def traitement_valeurs_manquantes(df, variable):
    if variable not in df.columns:
        raise ValueError(f"La variable '{variable}' n'est pas présente dans le DataFrame.")
    
    if df[variable].dtype == 'object' or df[variable].nunique() < 10:  # Catégorielle
        mode_value = df[variable].mode()[0]  # Calcul du mode
        df[variable].fillna(mode_value, inplace=True)
    else:  # Quantitative
        mean_value = df[variable].mean()  # Calcul de la moyenne
        df[variable].fillna(mean_value, inplace=True)
    return df

In [10]:
# Imputation des valeurs manquantes
df = traitement_valeurs_manquantes(df, 'rating')
df = traitement_valeurs_manquantes(df, 'released')
df = traitement_valeurs_manquantes(df, 'score')
df = traitement_valeurs_manquantes(df, 'writer')
df = traitement_valeurs_manquantes(df, 'star')
df = traitement_valeurs_manquantes(df, 'country')
df = traitement_valeurs_manquantes(df, 'gross')
df = traitement_valeurs_manquantes(df, 'company')
df = traitement_valeurs_manquantes(df, 'runtime')
df = traitement_valeurs_manquantes(df, 'votes')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7668 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7668 non-null   object 
 5   score     7668 non-null   float64
 6   votes     7668 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7668 non-null   object 
 9   star      7668 non-null   object 
 10  country   7668 non-null   object 
 11  budget    7668 non-null   float64
 12  gross     7668 non-null   float64
 13  company   7668 non-null   object 
 14  runtime   7668 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[variable].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[variable].fillna(mean_value, inplace=True)


#### Traitement du type des données

In [11]:
df.dtypes

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object

In [12]:
# Recherche des doublons
df.duplicated(keep=False).sum()

0

### Préparation des données

In [None]:
# Création de la colonne reputation(Catégorisation du réalisateur réputé)
moyenne_par_director = df_0.groupby('director')['movie_averageRating'].mean().reset_index()
realisateurs_reputes = moyenne_par_director[moyenne_par_director['movie_averageRating'] >= 7.5]['director_name'].tolist()
df_0['director_repute'] = df_0['director_name'].apply(lambda x: '1' if x in realisateurs_reputes else '0')