In [1]:
# Importation des modules
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re

In [None]:
# Display configuration: show every row and the full content of each cell
# (no truncation), and render floats with two decimal places.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
pd.options.display.float_format = lambda value: '%.2f' % value
# BDD title_principals_actor

## Bdd title_principals_actor

In [2]:
# Load the title_principals_actor CSV and print its structure summary.
# DataFrame.info() prints by itself and returns None, so print() also emits "None".
df_title_principals_actor = pd.read_csv('BDD_CSV/df_title_principals_actor.csv')
print(df_title_principals_actor.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36254354 entries, 0 to 36254353
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tconst      object
 1   ordering    int64 
 2   nconst      object
 3   category    object
 4   job         object
 5   characters  object
dtypes: int64(1), object(5)
memory usage: 1.6+ GB
None


In [3]:
# Preview the first rows of the raw table
df_title_principals_actor.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000005,1,nm0443482,actor,\N,"[""Blacksmith""]"
1,tt0000005,2,nm0653042,actor,\N,"[""Assistant""]"
2,tt0000007,1,nm0179163,actor,\N,\N
3,tt0000007,2,nm0183947,actor,\N,\N
4,tt0000008,1,nm0653028,actor,\N,"[""Sneezing Man""]"


In [None]:
# Turn the literal "\N" placeholders into real missing values (NaN)
df_title_principals_actor = df_title_principals_actor.replace(to_replace="\\N", value=np.nan)

In [None]:
# Count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: df_title_principals_actor[column].isna().sum()
    for column in df_title_principals_actor.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")

# Nettoyage de la BDD

In [None]:
# Work on an independent copy so later clean-up steps cannot affect
# df_title_principals_actor. pd.DataFrame(existing_frame) does not copy the
# underlying data by default; .copy() does (at the cost of extra memory).
titlePrincipalsActor_clean = df_title_principals_actor.copy()

In [None]:
# Remove the 'job', 'characters' and 'ordering' columns.
# Reassigning the result (instead of inplace=True) keeps the step explicit
# and avoids mutating an object another name may still reference.
titlePrincipalsActor_clean = titlePrincipalsActor_clean.drop(columns=["job", "characters", "ordering"])

In [None]:
# List the distinct values found in the 'category' column
titlePrincipalsActor_clean["category"].unique()

#### Renommer les colonnes

In [None]:
# Mapping from the original identifier column names to the project's names
dictionnaire_colonne = dict(nconst="ID_name", tconst="ID_title")

# Apply the mapping to the column labels
titlePrincipalsActor_clean = titlePrincipalsActor_clean.rename(columns=dictionnaire_colonne)

# Show two random rows to check the new labels
titlePrincipalsActor_clean.sample(2)

In [None]:
# Print the column/dtype summary of the cleaned table
titlePrincipalsActor_clean.info()

#### Dichotomisation de la colonne category

In [None]:
# # Split et dichotomisation des valeurs de la colonne 'profession'
# profession_dummies = titlePrincipalsActor_clean['category'].str.get_dummies(',')

# # Concaténer les colonnes dichotomisées avec le DataFrame original
# titlePrincipalsActorClean_dummi = pd.concat([titlePrincipalsActor_clean, profession_dummies], axis=1)

# # Afficher le DataFrame avec la colonne profession dichotomisée
# titlePrincipalsActorClean_dummi.sample()

In [None]:
# # On sélectionne uniquement les colonnes qui nous interesse
# columns_to_keep = ['ID_name', 'ID_title', 'actor', 'actress']

# titlePrincipalsActorClean_Final = titlePrincipalsActorClean_dummi[columns_to_keep]
# titlePrincipalsActorClean_Final.sample()

In [None]:
# # 1-Sauvegarder en format CSV
# titlePrincipalsActorClean_Final.to_csv('titlePrincipalsActorClean_Final.csv', index=False)

# ***FUSION DES DF***

### Merge title_basic et title_rate

In [None]:
# Load the cleaned title_basic table and print its structure summary
titleBasic_clean = pd.read_csv('BDD_CSV/last_BDD/titleBasic_clean.csv')
print(titleBasic_clean.info())

In [None]:
# Load the cleaned title_rate table and print its structure summary
titleRate_clean = pd.read_csv('BDD_CSV/last_BDD/titleRate_clean.csv')
print(titleRate_clean.info())

#### merge 1

In [None]:
# Left-join the ratings onto the basic title table, matching rows on ID_title
mergeTitle_basic_rate = pd.merge(
    left=titleBasic_clean,
    right=titleRate_clean,
    how='left',
    on='ID_title',
)

In [None]:
# Print the structure summary of the merged table, then show three random rows
print(mergeTitle_basic_rate.info())
mergeTitle_basic_rate.sample(3)

#### Merge 2

In [None]:
# Load the cleaned TMDB table and print its structure summary.
# The read must be active: with it commented out, tmdb_clean is undefined on a
# fresh kernel and both this cell and the following merge fail with NameError.
tmdb_clean = pd.read_csv('BDD_CSV/last_BDD/tmdb_clean.csv')
print(tmdb_clean.info())

In [None]:
# Left-join the TMDB table onto the title/rating table, matching rows on ID_title
merge_TMDB = pd.merge(
    left=mergeTitle_basic_rate,
    right=tmdb_clean,
    how='left',
    on='ID_title',
)

In [None]:
# Print the structure summary of the merged table, then show one random row
print(merge_TMDB.info())
merge_TMDB.sample()

### création du df merge_TMDB_clean

In [None]:
# Work on an independent copy of merge_TMDB so the clean-up below leaves it intact.
# pd.DataFrame(existing_frame) does not copy the underlying data by default.
merge_TMDB_clean = merge_TMDB.copy()

##### ***Réordonner les colonnes***

In [None]:
# Target column order: columns of the left table first ("_x" suffix where a
# name clashes), followed by the columns contributed by the TMDB table.
columns_order = [
    'ID_title',
    'title_x',
    'original_title_x',
    'release_year',
    'runtime_minutes_x',
    'genres_x',
    'average_rating',
    'numVotes_x',
    'ID_TMDB',
    'title_y',
    'original_title_y',
    'year',
    'runtime_minutes_y',
    'original_language',
    'spoken_languages',
    'genres_y',
    'averageRating',
    'numVotes_y',
    'popularity',
    'overview',
    'poster_path',
    'production_companies_name',
]

# Select the columns in that order (raises KeyError if one is missing)
merge_TMDB_clean = merge_TMDB_clean.loc[:, columns_order]

In [None]:
# Print two random rows of the reordered DataFrame as a left-aligned Markdown table

print(merge_TMDB_clean.sample(2).to_markdown(index=False, numalign="left", stralign="left"))


### Gestion des ***NaN***

In [None]:
# Count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: merge_TMDB_clean[column].isna().sum()
    for column in merge_TMDB_clean.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")

In [None]:
# Keep only the rows that have a value in 'title_x'
merge_TMDB_clean = merge_TMDB_clean[merge_TMDB_clean['title_x'].notna()]

In [None]:
# Where 'genres_x' is missing, fall back to the value of 'genres_y' on the same row
merge_TMDB_clean = merge_TMDB_clean.assign(
    genres_x=lambda frame: frame['genres_x'].fillna(frame['genres_y'])
)

In [None]:
# Remove the 'genres_y' column.
# Reassigning the result (instead of inplace=True) avoids mutating a frame
# that was itself produced by a filtering step.
merge_TMDB_clean = merge_TMDB_clean.drop(columns=["genres_y"])

In [None]:
# Where the IMDb 'average_rating' is missing, fall back to the TMDB 'averageRating'
merge_TMDB_clean = merge_TMDB_clean.assign(
    average_rating=lambda frame: frame['average_rating'].fillna(frame['averageRating'])
)

In [None]:
# Where 'numVotes_x' is missing, fall back to the TMDB 'numVotes_y'
merge_TMDB_clean = merge_TMDB_clean.assign(
    numVotes_x=lambda frame: frame['numVotes_x'].fillna(frame['numVotes_y'])
)

In [None]:
# Where 'runtime_minutes_x' is missing, fall back to the TMDB 'runtime_minutes_y'.
# (The fallback used to be 'average_rating', which wrote ratings into the
# runtime column and contradicted the stated intent of this step.)
merge_TMDB_clean['runtime_minutes_x'] = merge_TMDB_clean['runtime_minutes_x'].fillna(merge_TMDB_clean['runtime_minutes_y'])

In [None]:
# Columns retained once the TMDB fallbacks have been folded into the "_x" columns
columns_to_keep = [
    'ID_title',
    'title_x',
    'original_title_x',
    'original_language',
    'release_year',
    'runtime_minutes_x',
    'genres_x',
    'average_rating',
    'numVotes_x',
    'popularity',
    'overview',
    'poster_path',
    'production_companies_name',
]

merge_TMDB_clean = merge_TMDB_clean.loc[:, columns_to_keep]

In [None]:
# NaN check: count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: merge_TMDB_clean[column].isna().sum()
    for column in merge_TMDB_clean.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")


In [None]:
# Keep only the rows that have a value in 'average_rating'
merge_TMDB_clean = merge_TMDB_clean[merge_TMDB_clean['average_rating'].notna()]

In [None]:
# NaN check: count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: merge_TMDB_clean[column].isna().sum()
    for column in merge_TMDB_clean.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")

In [None]:
# Keep only the rows that have a value in 'genres_x'
merge_TMDB_clean = merge_TMDB_clean[merge_TMDB_clean['genres_x'].notna()]

In [None]:
# NaN check: count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: merge_TMDB_clean[column].isna().sum()
    for column in merge_TMDB_clean.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")

In [None]:
# Strip the "_x" merge suffix from the retained columns

merge_TMDB_clean = merge_TMDB_clean.rename(columns={
    f'{base}_x': base
    for base in ('title', 'original_title', 'runtime_minutes', 'genres', 'numVotes')
})

In [None]:
# Print the column/dtype summary after the renaming
merge_TMDB_clean.info()

In [None]:
# # les différentes modalites de la colonne "genres"
# merge_TMDB_clean[("genres")].value_counts()

In [None]:
# Normalise both spellings ('Sci-Fi' and 'Science Fiction') to 'ScienceFiction'
merge_TMDB_clean['genres'] = (
    merge_TMDB_clean['genres']
    .str.replace('Sci-Fi', 'ScienceFiction')
    .str.replace('Science Fiction', 'ScienceFiction')
)

#### ***Colonnes numériques***

In [None]:
# Descriptive statistics of the numeric columns
merge_TMDB_clean.describe()

In [None]:
# Discard the rows whose average_rating is exactly 0
merge_TMDB_clean = merge_TMDB_clean.loc[merge_TMDB_clean["average_rating"].ne(0)]

In [None]:
# Cast release_year to 64-bit integers. The cast itself drops any fractional
# part; the trailing round(0) is applied to values that are already integers.
merge_TMDB_clean["release_year"] = merge_TMDB_clean["release_year"].astype('int64').round(0)

In [None]:
# Print the earliest and latest release years
print(merge_TMDB_clean["release_year"].min())
print(merge_TMDB_clean["release_year"].max())

In [None]:
# Helper for converting runtime_minutes to integers:
# missing values are passed through, every other value is rounded.
def convert_runtime(value):
    """Round ``value`` to the nearest integer, leaving missing values untouched.

    Parameters
    ----------
    value : scalar
        A runtime in minutes, or a missing value (None / NaN).

    Returns
    -------
    The result of the built-in ``round(value)`` when ``value`` is not null,
    otherwise ``value`` itself.
    """
    if pd.isnull(value):
        return value
    return round(value)

# Round each runtime with convert_runtime, then cast to the nullable 'Int64'
# dtype so that missing values can coexist with integers
merge_TMDB_clean['runtime_minutes'] = (
    merge_TMDB_clean['runtime_minutes']
    .apply(convert_runtime)
    .astype('Int64')
    .round(0)
)


In [None]:
# Descriptive statistics of the numeric columns after the type conversions
merge_TMDB_clean.describe()

#### ***Sélection de la durée des films***

In [None]:
# Keep only the rows whose runtime lies between 60 and 210 minutes (both bounds included)
merge_TMDB_clean = merge_TMDB_clean[merge_TMDB_clean['runtime_minutes'].between(60, 210)]

In [None]:
# Smallest value of "runtime_minutes"
print(merge_TMDB_clean["runtime_minutes"].min())

# Largest value of "runtime_minutes"
print(merge_TMDB_clean["runtime_minutes"].max())

#### ***Colonnes avec données qualitatives***

In [None]:
# Print the distinct raw values of the 'genres' column
print(merge_TMDB_clean['genres'].unique())

In [None]:
# Clean the textual form of the genre lists in three chained passes
merge_TMDB_clean['genres'] = (
    merge_TMDB_clean['genres']
    # 1. remove square brackets and single/double quote characters
    .str.replace(r'[\[\]\'\"]', '', regex=True)
    # 2. trim whitespace at both ends
    .str.strip()
    # 3. turn ';' separators (with any surrounding whitespace) into ', '
    .str.replace(r'\s*;\s*', ', ', regex=True)
)

In [None]:
# Print the distinct 'genres' values after the punctuation clean-up
print(merge_TMDB_clean['genres'].unique())

In [None]:
# Replace every run of whitespace in 'genres' with a single comma
merge_TMDB_clean = merge_TMDB_clean.assign(
    genres=lambda frame: frame['genres'].str.replace(r'\s+', ',', regex=True)
)

In [None]:
# Print the distinct 'genres' values after whitespace was replaced by commas
print(merge_TMDB_clean['genres'].unique())

In [None]:
# Normalise the comma-separated genre lists in three chained passes
merge_TMDB_clean['genres'] = (
    merge_TMDB_clean['genres']
    # whitespace runs become a comma
    .str.replace(r'\s+', ',', regex=True)
    # runs of several commas collapse into one
    .str.replace(r',+', ',', regex=True)
    # no comma left at the start or end of the string
    .str.strip(',')
)


In [None]:
# Print the distinct 'genres' values after comma normalisation
print(merge_TMDB_clean['genres'].unique())

In [None]:
# Detect cells that carry no usable genre text
def is_invalid(value):
    """Tell whether a 'genres' cell holds no usable text.

    Parameters
    ----------
    value : object
        The cell content. Expected to be a string; anything else
        (None, NaN, numbers) is treated as invalid instead of raising
        AttributeError on ``.strip()``.

    Returns
    -------
    bool
        True when ``value`` is not a string, or when, once surrounding
        whitespace is removed, it is empty or a lone single/double quote.
    """
    if not isinstance(value, str):
        return True
    return value.strip() in ('', "'", '"')

# Keep the rows whose 'genres' value is not flagged by is_invalid,
# then renumber the index from 0
merge_TMDB_clean = (
    merge_TMDB_clean
    .loc[~merge_TMDB_clean['genres'].apply(is_invalid)]
    .reset_index(drop=True)
)

# Show the remaining genre values and the frame summary
print(merge_TMDB_clean['genres'].unique())
print(merge_TMDB_clean.info())


In [None]:
# Strip leading/trailing whitespace in every text (object-dtype) column.
# Only real strings are touched: casting the column with astype(str) first
# would turn missing values into the literal text 'nan', hiding them from
# isna()/dropna() for the rest of the session.
for col in merge_TMDB_clean.select_dtypes(include='object').columns:
    merge_TMDB_clean[col] = merge_TMDB_clean[col].map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

In [None]:
# Whitespace-trimming helper, safe to apply to cells of any type
def clean_string(s):
    """Return ``s`` without surrounding whitespace when it is a str; otherwise return ``s`` unchanged."""
    return s.strip() if isinstance(s, str) else s

# Apply clean_string to every cell of the DataFrame, then print the distinct genres.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — TODO confirm the target pandas version before switching.
merge_TMDB_clean = merge_TMDB_clean.applymap(clean_string)

print(merge_TMDB_clean['genres'].unique())

In [None]:
# Print the structure summary, then two random rows as a Markdown table
print(merge_TMDB_clean.info())

print(merge_TMDB_clean.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

### ***Dichotomisation de la colonne 'genres'***

In [None]:
# One-hot encode 'genres': the comma-separated values are split and each
# distinct genre becomes its own 0/1 indicator column
dummy_genres = merge_TMDB_clean['genres'].str.get_dummies(sep=',')

# Append the indicator columns to the right of the original columns
mergeTmdb_CleanDum = pd.concat(objs=[merge_TMDB_clean, dummy_genres], axis='columns')

In [None]:
# Print the structure summary of the frame with the genre indicator columns
print(mergeTmdb_CleanDum.info())

# Print three random rows as a Markdown table
print(mergeTmdb_CleanDum.sample(3).to_markdown(index=False, numalign="left", stralign="left"))

In [None]:
# Descriptive statistics of the numeric columns (genre indicators included)
mergeTmdb_CleanDum.describe()

#### Ajout de la colonne : moyenne pondérée 

In [None]:
# Weighted rating: (rating * votes + mean_vote * min_num_vote) / (votes + min_num_vote).
# With few votes the result stays close to mean_vote; with many votes it
# approaches the film's own average_rating.
# NOTE(review): the origin of the two constants is not shown in this notebook —
# presumably a global mean rating and a vote-count threshold; confirm.
mean_vote = 6.24
min_num_vote = 341

mergeTmdb_CleanDum = mergeTmdb_CleanDum.assign(
    weighted_averageRating=lambda frame: (
        frame['average_rating'].mul(frame['numVotes']).add(mean_vote * min_num_vote)
    ).div(frame['numVotes'].add(min_num_vote))
)

## 1-Sauvegarder en format CSV

In [None]:
# Write the frame to 'mergeTmdb_CleanDum.csv' in the working directory, without the index.
# NOTE(review): the following cell reloads 'BDD_CSV/last_BDD/mergeTmdb_CleanDum.csv',
# a different path — presumably the file is moved by hand; confirm.
mergeTmdb_CleanDum.to_csv('mergeTmdb_CleanDum.csv', index=False)

In [4]:
# Reload the saved frame from the BDD_CSV/last_BDD folder
mergeTmdb_CleanDum = pd.read_csv('BDD_CSV/last_BDD/mergeTmdb_CleanDum.csv')

In [5]:
# Print the structure summary, then two random rows as a Markdown table
print(mergeTmdb_CleanDum.info())
print(mergeTmdb_CleanDum.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159960 entries, 0 to 159959
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID_title                   159960 non-null  object 
 1   title                      159960 non-null  object 
 2   original_title             159960 non-null  object 
 3   original_language          11694 non-null   object 
 4   release_year               159960 non-null  int64  
 5   runtime_minutes            159960 non-null  int64  
 6   genres                     159960 non-null  object 
 7   average_rating             159960 non-null  float64
 8   numVotes                   159960 non-null  float64
 9   popularity                 11694 non-null   float64
 10  overview                   11694 non-null   object 
 11  poster_path                11056 non-null   object 
 12  production_companies_name  11415 non-null   object 
 13  Action                     15

#### ***Sélection des films selon la moyenne pondérée***

In [None]:
# Descriptive statistics, used to choose the rating / vote thresholds below
mergeTmdb_CleanDum.describe()

In [None]:
# Keep only the films with a weighted rating above 6.25 AND more than 100 votes
mergeTmdb_CleanDum = mergeTmdb_CleanDum.loc[
    mergeTmdb_CleanDum["weighted_averageRating"].gt(6.25)
    & mergeTmdb_CleanDum["numVotes"].gt(100)
]

In [None]:
# Print the column/dtype summary after the rating filter
mergeTmdb_CleanDum.info()

In [None]:
# For each column, count how many distinct values occur more than once:
# (rows belonging to a duplicated group) minus (rows that repeat an earlier
# row) leaves exactly one unit per duplicated value.
duplicate_counts = {
    col: (
        mergeTmdb_CleanDum[col].duplicated(keep=False).sum()
        - mergeTmdb_CleanDum[col].duplicated(keep='first').sum()
    )
    for col in mergeTmdb_CleanDum.columns
}

# Report the count for every column
for col, count in duplicate_counts.items():
    print(f"Colonne '{col}' a {count} doublons")

In [None]:
# Print the column/dtype summary before saving
mergeTmdb_CleanDum.info()

### 1-Sauvegarder en format CSV

In [None]:
# Write the filtered frame to 'mergeTmdb_CleanDum_Final.csv' in the working directory, without the index.
# NOTE(review): the following cell reloads 'BDD_CSV/last_BDD/mergeTmdb_CleanDum_Final.csv',
# a different path — presumably the file is moved by hand; confirm.
mergeTmdb_CleanDum.to_csv('mergeTmdb_CleanDum_Final.csv', index=False)

In [6]:
# Load the final film table and print its structure summary
mergeTmdb_CleanDum_Final = pd.read_csv('BDD_CSV/last_BDD/mergeTmdb_CleanDum_Final.csv')
print(mergeTmdb_CleanDum_Final.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33553 entries, 0 to 33552
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_title                   33553 non-null  object 
 1   title                      33553 non-null  object 
 2   original_title             33553 non-null  object 
 3   original_language          4627 non-null   object 
 4   release_year               33553 non-null  int64  
 5   runtime_minutes            33553 non-null  int64  
 6   genres                     33553 non-null  object 
 7   average_rating             33553 non-null  float64
 8   numVotes                   33553 non-null  float64
 9   popularity                 4627 non-null   float64
 10  overview                   4627 non-null   object 
 11  poster_path                4483 non-null   object 
 12  production_companies_name  4572 non-null   object 
 13  Action                     33553 non-null  int

In [8]:
# Print two random rows as a left-aligned Markdown table
print(mergeTmdb_CleanDum_Final.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

| ID_title   | title                      | original_title             | original_language   | release_year   | runtime_minutes   | genres                  | average_rating   | numVotes   | popularity   | overview                                                                                                                                                                                                                                                                                                                                                                                                                                          | poster_path                      | production_companies_name   | Action   | Adventure   | Animation   | Biography   | Comedy   | Crime   | Documentary   | Drama   | Family   | Fantasy   | Game-Show   | History   | Horror   | Music   | Musical   | Mystery   | News   | Reality-TV   | Romance   | ScienceFiction   | Sport   | Talk-Show   | Thriller   | War   | 

In [None]:
# Spot-check: rows whose title is exactly 'Fight Club'
mergeTmdb_CleanDum_Final[mergeTmdb_CleanDum_Final['title'] == 'Fight Club']

In [16]:
# Spot-check: row(s) with ID_title 'tt2370248' (the recorded output shows 'Short Term 12')
mergeTmdb_CleanDum_Final[mergeTmdb_CleanDum_Final['ID_title'] == 'tt2370248']

Unnamed: 0,ID_title,title,original_title,original_language,release_year,runtime_minutes,genres,average_rating,numVotes,popularity,...,News,Reality-TV,Romance,ScienceFiction,Sport,Talk-Show,Thriller,War,Western,weighted_averageRating
1503,tt2370248,Short Term 12,Short Term 12,en,2013,96,Drama,7.9,92053.0,8.087,...,0,0,0,0,0,0,0,0,0,7.893873


## Merge 3

###### ***On va associer la table mergeTmdb_CleanDum_Final avec titlePrincipalsActorClean_Final et nameBasic_Final***


In [None]:
# # retirer les lignes dans le df qui n'ont pas de original_language
# mergeTmdb_CleanDum_Final2 = mergeTmdb_CleanDum_Final.dropna(subset= 'original_language')
# mergeTmdb_CleanDum_Final2.info()

##### ***Merge entre name_basic et titlePrincipal***

In [9]:
# Load the name table and print its structure summary
nameBasic_Final = pd.read_csv('BDD_CSV/last_BDD/nameBasic_Final.csv')
print(nameBasic_Final.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20434966 entries, 0 to 20434965
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   ID_name     object
 1   actor_name  object
 2   ID_title    object
 3   actor       int64 
 4   actress     int64 
 5   director    int64 
dtypes: int64(3), object(3)
memory usage: 935.4+ MB
None


In [10]:
# Print two random rows as a left-aligned Markdown table
print(nameBasic_Final.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

| ID_name    | actor_name   | ID_title   | actor   | actress   | director   |
|:-----------|:-------------|:-----------|:--------|:----------|:-----------|
| nm11369758 | Sandy Gervay | tt22780164 | 0       | 0         | 0          |
| nm14527724 | Nerea Alegre | tt26598475 | 0       | 0         | 0          |


In [None]:
# Spot-check: name rows attached to ID_title 'tt0137523'
nameBasic_Final[nameBasic_Final['ID_title'] == 'tt0137523']

In [None]:
# Spot-check: rows for the person with ID_name 'nm12461406'
nameBasic_Final[nameBasic_Final['ID_name'] == 'nm12461406']

In [11]:
# Load the title/principal table (with actor/actress indicators) and print its structure summary
titlePrincipalsActorClean_Final = pd.read_csv('BDD_CSV/last_BDD/titlePrincipalsActorClean_Final.csv')
print(titlePrincipalsActorClean_Final.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36254354 entries, 0 to 36254353
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ID_name   object
 1   ID_title  object
 2   actor     int64 
 3   actress   int64 
dtypes: int64(2), object(2)
memory usage: 1.1+ GB
None


In [12]:
# Print two random rows as a left-aligned Markdown table
print(titlePrincipalsActorClean_Final.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

| ID_name   | ID_title   | actor   | actress   |
|:----------|:-----------|:--------|:----------|
| nm0238546 | tt0830227  | 1       | 0         |
| nm0569996 | tt0978911  | 0       | 1         |


In [None]:
# Spot-check: principals attached to ID_title 'tt0137523'
titlePrincipalsActorClean_Final[titlePrincipalsActorClean_Final['ID_title'] == 'tt0137523']

In [None]:
# Spot-check: titles attached to the person with ID_name 'nm12461406'
titlePrincipalsActorClean_Final[titlePrincipalsActorClean_Final['ID_name'] == 'nm12461406']

##### Merge titlePrincipalsActorClean_Final & mergeTmdb_CleanDum_Final

In [46]:
# Left-join the principals onto the films on ID_title:
# each film row is repeated once per matching principal
TmdbCleanDumActor_Final = pd.merge(
    left=mergeTmdb_CleanDum_Final,
    right=titlePrincipalsActorClean_Final,
    how='left',
    on="ID_title",
)

In [47]:
# Print the structure summary, a blank separator line, then two random rows as a Markdown table
print(TmdbCleanDumActor_Final.info())
print()
print(TmdbCleanDumActor_Final.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267235 entries, 0 to 267234
Data columns (total 42 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID_title                   267235 non-null  object 
 1   title                      267235 non-null  object 
 2   original_title             267235 non-null  object 
 3   original_language          30345 non-null   object 
 4   release_year               267235 non-null  int64  
 5   runtime_minutes            267235 non-null  int64  
 6   genres                     267235 non-null  object 
 7   average_rating             267235 non-null  float64
 8   numVotes                   267235 non-null  float64
 9   popularity                 30345 non-null   float64
 10  overview                   30345 non-null   object 
 11  poster_path                29875 non-null   object 
 12  production_companies_name  29975 non-null   object 
 13  Action                     26

In [49]:
# Spot-check the merge on 'Forrest Gump': the recorded output has one row per principal (ID_name)
TmdbCleanDumActor_Final[TmdbCleanDumActor_Final["title"] == 'Forrest Gump']

Unnamed: 0,ID_title,title,original_title,original_language,release_year,runtime_minutes,genres,average_rating,numVotes,popularity,...,ScienceFiction,Sport,Talk-Show,Thriller,War,Western,weighted_averageRating,ID_name,actor,actress
251146,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0000158,1.0,0.0
251147,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0000705,0.0,1.0
251148,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0000641,1.0,0.0
251149,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0000398,0.0,1.0
251150,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0931508,0.0,1.0
251151,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0402011,1.0,0.0
251152,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0381041,1.0,0.0
251153,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0446404,1.0,0.0
251154,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0672225,1.0,0.0
251155,tt0109830,Forrest Gump,Forrest Gump,,1994,142,"Drama,Romance",8.8,2272161.0,,...,0,0,0,0,0,0,8.799616,nm0709615,1.0,0.0


##### Merge nameBasic_Final & TmdbCleanDumActor_Final

In [50]:
# Left-join the name table onto the film/principal rows on ID_name.
# Columns present on both sides (ID_title, actor, actress) receive the
# "_x" / "_y" suffixes.
# NOTE(review): the recorded outputs grow from 267235 to 958020 rows here, so
# nameBasic_Final apparently holds several rows per ID_name — confirm that this
# multiplication is intended (duplicates are dropped further down).
TmdbCleanDumActor_Final = pd.merge(
    left=TmdbCleanDumActor_Final,
    right=nameBasic_Final,
    how='left',
    on="ID_name",
)

In [51]:
# Print the structure summary, a blank separator line, then two random rows as a Markdown table
print(TmdbCleanDumActor_Final.info())
print()
print(TmdbCleanDumActor_Final.sample(2).to_markdown(index=False, numalign="left", stralign="left"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958020 entries, 0 to 958019
Data columns (total 47 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID_title_x                 958020 non-null  object 
 1   title                      958020 non-null  object 
 2   original_title             958020 non-null  object 
 3   original_language          107031 non-null  object 
 4   release_year               958020 non-null  int64  
 5   runtime_minutes            958020 non-null  int64  
 6   genres                     958020 non-null  object 
 7   average_rating             958020 non-null  float64
 8   numVotes                   958020 non-null  float64
 9   popularity                 107031 non-null  float64
 10  overview                   107031 non-null  object 
 11  poster_path                105716 non-null  object 
 12  production_companies_name  105791 non-null  object 
 13  Action                     95

In [53]:
# df = TmdbCleanDumActor_Final[TmdbCleanDumActor_Final["title"] == 'Forrest Gump']
# df

#### ***Supprimer les colonnes***

In [54]:
# Discard the right-hand ("_y") duplicates created by the merge on ID_name
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.drop(
    ['actor_y', 'actress_y', 'ID_title_y'],
    axis='columns',
)

##### ***Renommer les colonnes***

In [55]:
# Strip the "_x" merge suffix from the remaining left-hand columns
dict_name = {f'{base}_x': base for base in ('actor', 'actress', 'ID_title')}

TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.rename(columns=dict_name)

In [56]:
# Print the column/dtype summary after dropping and renaming the merge columns
TmdbCleanDumActor_Final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958020 entries, 0 to 958019
Data columns (total 44 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID_title                   958020 non-null  object 
 1   title                      958020 non-null  object 
 2   original_title             958020 non-null  object 
 3   original_language          107031 non-null  object 
 4   release_year               958020 non-null  int64  
 5   runtime_minutes            958020 non-null  int64  
 6   genres                     958020 non-null  object 
 7   average_rating             958020 non-null  float64
 8   numVotes                   958020 non-null  float64
 9   popularity                 107031 non-null  float64
 10  overview                   107031 non-null  object 
 11  poster_path                105716 non-null  object 
 12  production_companies_name  105791 non-null  object 
 13  Action                     95

#### ***NaN***

In [63]:
# Count the missing values of every column, plus the grand total
# (the total is computed but not displayed by this cell)
nan_counts_per_column = {
    column: TmdbCleanDumActor_Final[column].isna().sum()
    for column in TmdbCleanDumActor_Final.columns
}
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts
print("Nombre de NaN par colonne:")
for column, count in nan_counts_per_column.items():
    print(f"Colonne {column}: {count} NaN")

Nombre de NaN par colonne:
Colonne ID_title: 0 NaN
Colonne title: 0 NaN
Colonne original_title: 0 NaN
Colonne original_language: 224178 NaN
Colonne release_year: 0 NaN
Colonne runtime_minutes: 0 NaN
Colonne genres: 0 NaN
Colonne average_rating: 0 NaN
Colonne numVotes: 0 NaN
Colonne popularity: 224178 NaN
Colonne overview: 224178 NaN
Colonne poster_path: 224510 NaN
Colonne production_companies_name: 224512 NaN
Colonne Action: 0 NaN
Colonne Adventure: 0 NaN
Colonne Animation: 0 NaN
Colonne Biography: 0 NaN
Colonne Comedy: 0 NaN
Colonne Crime: 0 NaN
Colonne Documentary: 0 NaN
Colonne Drama: 0 NaN
Colonne Family: 0 NaN
Colonne Fantasy: 0 NaN
Colonne Game-Show: 0 NaN
Colonne History: 0 NaN
Colonne Horror: 0 NaN
Colonne Music: 0 NaN
Colonne Musical: 0 NaN
Colonne Mystery: 0 NaN
Colonne News: 0 NaN
Colonne Reality-TV: 0 NaN
Colonne Romance: 0 NaN
Colonne ScienceFiction: 0 NaN
Colonne Sport: 0 NaN
Colonne Talk-Show: 0 NaN
Colonne Thriller: 0 NaN
Colonne War: 0 NaN
Colonne Western: 0 NaN
Colonn

In [62]:
# Drop the rows that have no actor name.
# subset is given as a list: this matches the other dropna calls of the
# notebook and also works on pandas versions that reject a bare string.
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.dropna(subset=['actor_name'])

### ***Doublons***

In [57]:
# Remove the rows that are exact duplicates of an earlier row (all columns compared)
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.drop_duplicates()

In [64]:
# Print the column/dtype summary after removing missing names and duplicates
TmdbCleanDumActor_Final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 251523 entries, 2 to 958016
Data columns (total 44 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID_title                   251523 non-null  object 
 1   title                      251523 non-null  object 
 2   original_title             251523 non-null  object 
 3   original_language          27345 non-null   object 
 4   release_year               251523 non-null  int64  
 5   runtime_minutes            251523 non-null  int64  
 6   genres                     251523 non-null  object 
 7   average_rating             251523 non-null  float64
 8   numVotes                   251523 non-null  float64
 9   popularity                 27345 non-null   float64
 10  overview                   27345 non-null   object 
 11  poster_path                27013 non-null   object 
 12  production_companies_name  27011 non-null   object 
 13  Action                     251523 

In [60]:
#TmdbCleanDumActor_Final[TmdbCleanDumActor_Final["actor_name"] == 'Tom Hanks']

##### ***Merge avec titlePrincipalsActorClean_Final***

In [None]:
# Left join on ID_title: every row of mergeTmdb_CleanDum_Final is kept, and
# rows without a match in nameBasic_TitlePrincipals get NaN in the columns
# that come from it.
TmdbCleanDumActor_Final = pd.merge(
    mergeTmdb_CleanDum_Final,
    nameBasic_TitlePrincipals,
    how='left',
    on="ID_title",
)

In [None]:
# Print the summary of the merged frame (non-null counts and dtypes per column).
TmdbCleanDumActor_Final.info()

In [None]:
# Spot check: display every row whose actor_name is exactly 'Tom Hanks'.
is_tom_hanks = TmdbCleanDumActor_Final["actor_name"].eq('Tom Hanks')
TmdbCleanDumActor_Final.loc[is_tom_hanks]

In [None]:
# Print two randomly drawn rows as a left-aligned Markdown table, without the
# index column. No random_state is set, so the rows differ on every run.
sample_rows = TmdbCleanDumActor_Final.sample(n=2)
print(sample_rows.to_markdown(index=False, numalign="left", stralign="left"))

In [None]:
# Remove the production_companies_name column from the frame.
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.drop(
    labels=['production_companies_name'],
    axis="columns",
)

### 1 - Sauvegarder au format CSV

In [None]:
# Write the frame to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
TmdbCleanDumActor_Final.to_csv('TmdbCleanDumActor_Final.csv', index=False)

In [None]:
# Display the dataset summary.
# Optional reload from disk (kept disabled):
#TmdbCleanDumActor_Final = pd.read_csv('BDD_CSV/last_BDD/TmdbCleanDumActor_Final.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
TmdbCleanDumActor_Final.info()

In [None]:
# Number of missing values in each column, keyed by column label.
nan_counts_per_column = {
    label: TmdbCleanDumActor_Final[label].isna().sum()
    for label in TmdbCleanDumActor_Final.columns
}
# Total number of missing values over all columns (not printed in this cell).
total_nan_counts = sum(nan_counts_per_column.values())

# Report the per-column counts.
print("Nombre de NaN par colonne:")
for label, n_missing in nan_counts_per_column.items():
    print(f"Colonne {label}: {n_missing} NaN")

In [None]:
# Drop the rows whose actor_name is missing; other columns may still hold NaN.
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.dropna(subset=['actor_name'])

In [65]:
# Display the current column labels, in their current order.
TmdbCleanDumActor_Final.columns

Index(['ID_title', 'title', 'original_title', 'original_language',
       'release_year', 'runtime_minutes', 'genres', 'average_rating',
       'numVotes', 'popularity', 'overview', 'poster_path',
       'production_companies_name', 'Action', 'Adventure', 'Animation',
       'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Game-Show', 'History', 'Horror', 'Music', 'Musical',
       'Mystery', 'News', 'Reality-TV', 'Romance', 'ScienceFiction', 'Sport',
       'Talk-Show', 'Thriller', 'War', 'Western', 'weighted_averageRating',
       'ID_name', 'actor', 'actress', 'actor_name', 'director'],
      dtype='object')

In [66]:
# Target column order for TmdbCleanDumActor_Final, assembled group by group.
# Selecting with this list keeps only the listed columns, so any column that
# is not named here (production_companies_name, for instance) is dropped.
columns_order = (
    # Title / person identifiers and the actor, actress, director columns
    ['ID_title', 'title', 'original_title', 'ID_name', 'actor_name',
     'actor', 'actress', 'director']
    # Language, year, runtime, ratings, votes, popularity, raw genres string
    + ['original_language', 'release_year', 'runtime_minutes',
       'weighted_averageRating', 'average_rating', 'numVotes', 'popularity',
       'genres']
    # One column per genre name
    + ['Action', 'Adventure', 'Biography', 'Crime', 'Documentary', 'Fantasy',
       'Game-Show', 'Horror', 'Musical', 'Mystery', 'News', 'Reality-TV',
       'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Animation',
       'Comedy', 'Drama', 'Family', 'History', 'Music', 'Romance',
       'ScienceFiction']
    # Free-text / poster columns
    + ['overview', 'poster_path']
)

# Apply the new column order (raises KeyError if a listed column is absent).
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.loc[:, columns_order]

In [None]:
# Print the frame summary to confirm the new column order and remaining columns.
TmdbCleanDumActor_Final.info()

In [68]:
#TmdbCleanDumActor_Final[TmdbCleanDumActor_Final["actor_name"] == "Tom Hanks"]

In [None]:
# For each column, count how many distinct values occur more than once.
# duplicated(keep=False) flags every occurrence of a repeated value, while
# duplicated(keep='first') flags all occurrences except the first one, so the
# difference of the two sums is exactly one per repeated value.
duplicate_counts = {
    label: (
        TmdbCleanDumActor_Final[label].duplicated(keep=False).sum()
        - TmdbCleanDumActor_Final[label].duplicated(keep='first').sum()
    )
    for label in TmdbCleanDumActor_Final.columns
}

# Report the count obtained for every column.
for label, n_repeated in duplicate_counts.items():
    print(f"Colonne '{label}' a {n_repeated} doublons")


In [None]:
# Remove fully duplicated rows (all columns compared, first occurrence kept),
# then print the frame summary to see the resulting row count.
TmdbCleanDumActor_Final = TmdbCleanDumActor_Final.drop_duplicates()
TmdbCleanDumActor_Final.info()

# 1-Sauvegarder en format CSV

In [72]:
# 1 - Save as CSV in the current working directory; index=False leaves the
# row index out of the file.
TmdbCleanDumActor_Final.to_csv('TmdbCleanDumActor_Final.csv', index=False)

In [None]:
# Print, for each of the three columns, the share of every distinct value as
# a percentage of the non-null rows.
for role_column in ('actor', 'actress', 'director'):
    shares = TmdbCleanDumActor_Final[role_column].value_counts(normalize=True)
    print(shares * 100)

In [None]:
# Print the summary of the final frame (non-null counts and dtypes per column).
TmdbCleanDumActor_Final.info()

In [None]:
# Print five randomly drawn rows as a left-aligned Markdown table, without the
# index column. No random_state is set, so the rows differ on every run.
sample_rows = TmdbCleanDumActor_Final.sample(n=5)
print(sample_rows.to_markdown(index=False, numalign="left", stralign="left"))