In [1]:
import pandas as pd

In [2]:
# Chargement des fichiers
tmdb_df = pd.read_csv("datasets_tmdb_10kmovies.csv")  # contient imdb_id
principals_df = pd.read_csv("title.principals.tsv", sep='\t', na_values='\\N')
name_basics_df = pd.read_csv("name.basics.tsv", sep='\t', na_values='\\N')


In [3]:
# filtrer les acteurs dans principals
principals_df = principals_df[principals_df['category'].isin(['actor', 'actress'])]

In [4]:
# joindre principals avec name_basics pour récupérer les noms
merged_df = principals_df.merge(name_basics_df[['nconst', 'primaryName']], on='nconst', how='left')

In [5]:
# garder que les films présents dans tmdb
merged_df = merged_df[merged_df['tconst'].isin(tmdb_df['imdb_id'])]

In [6]:
# grouper les noms d’acteurs par film
actors_by_film = merged_df.groupby('tconst')['primaryName'].apply(list).reset_index()

In [7]:
# renommer tconst en imdb_id pour join final
actors_by_film = actors_by_film.rename(columns={'tconst': 'imdb_id'})

In [8]:
# merge final
final_df = tmdb_df.merge(actors_by_film, on='imdb_id', how='left')

In [9]:
# renommer la colonne
final_df = final_df.rename(columns={'primaryName': 'actors'})

In [10]:
# suppression des colonnes inutiles
final_df.drop(columns=['Unnamed: 0', 'budget','backdrop_path','adult', 'homepage', 'original_language', 'original_title','revenue','spoken_languages','status','tagline','video','production_companies_name','production_companies_country'], inplace=True)

In [11]:
# suppression des lignes avec des valeurs manquantes
final_df_cleaned = final_df.dropna()

In [12]:
# on garde uniquement l'année de sortie et non pas la date exacte

final_df_cleaned['release_date'] = pd.to_datetime(final_df_cleaned['release_date'])

final_df_cleaned['release_year'] = final_df_cleaned['release_date'].dt.year

final_df_cleaned = final_df_cleaned.drop(columns=['release_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_cleaned['release_date'] = pd.to_datetime(final_df_cleaned['release_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_cleaned['release_year'] = final_df_cleaned['release_date'].dt.year


In [13]:
# réordonner les colonnes
colonnes = [
    'title',
    'release_year',
    'genres',
    'runtime',
    'production_countries',
    'actors',
    'vote_average',
    'vote_count',
    'popularity',
    'poster_path',
    'imdb_id',
    'overview',
    'id'
]

final_df_cleaned = final_df_cleaned[colonnes]


In [14]:
final_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8910 entries, 0 to 10020
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 8910 non-null   object 
 1   release_year          8910 non-null   int32  
 2   genres                8910 non-null   object 
 3   runtime               8910 non-null   int64  
 4   production_countries  8910 non-null   object 
 5   actors                8910 non-null   object 
 6   vote_average          8910 non-null   float64
 7   vote_count            8910 non-null   int64  
 8   popularity            8910 non-null   float64
 9   poster_path           8910 non-null   object 
 10  imdb_id               8910 non-null   object 
 11  overview              8910 non-null   object 
 12  id                    8910 non-null   int64  
dtypes: float64(2), int32(1), int64(3), object(7)
memory usage: 939.7+ KB


In [15]:
final_df_cleaned.head()

Unnamed: 0,title,release_year,genres,runtime,production_countries,actors,vote_average,vote_count,popularity,poster_path,imdb_id,overview,id
0,Blondie,1938,['Comedy'],70,['US'],"[Penny Singleton, Arthur Lake, Larry Simms, Da...",7.214,7,2.852,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,tt0029927,Blondie and Dagwood are about to celebrate the...,3924
1,Judgment Night,1993,"['Action', 'Crime', 'Thriller']",109,['US'],"[Emilio Estevez, Cuba Gooding Jr., Denis Leary...",6.6,284,10.797,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,tt0107286,"While racing to a boxing match, Frank, Mike, J...",6
2,Star Wars,1977,"['Adventure', 'Action', 'Science Fiction']",121,['US'],"[Mark Hamill, Harrison Ford, Carrie Fisher, Al...",8.208,18582,90.988,/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg,tt0076759,Princess Leia is captured and held hostage by ...,11
3,Forrest Gump,1994,"['Comedy', 'Drama', 'Romance']",142,['US'],"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...",8.481,24593,52.321,/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg,tt0109830,A man with a low IQ has accomplished great thi...,13
4,The Fifth Element,1997,"['Adventure', 'Fantasy', 'Action', 'Thriller',...",126,"['FR', 'GB']","[Bruce Willis, Milla Jovovich, Gary Oldman, Ia...",7.524,9656,46.823,/fPtlCO1yQtnoLHOwKtWz7db6RGU.jpg,tt0119116,"In 2257, a taxi driver is unintentionally give...",18


In [16]:
final_df_cleaned.to_csv("dataset_mov_act_2.csv")