## Import

In [2]:
import pandas as pd
import numpy as np


## Chargement des données IMDb

In [3]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

basics = pd.read_csv(basics_url, sep="\t", low_memory=False)
ratings = pd.read_csv(ratings_url, sep="\t")


## Filtrer les films uniquement

In [4]:
movies = basics[basics["titleType"] == "movie"].copy()

movies = movies[
    (movies["startYear"] != "\\N") &
    (movies["runtimeMinutes"] != "\\N")
]

movies["startYear"] = movies["startYear"].astype(int)
movies["runtimeMinutes"] = movies["runtimeMinutes"].astype(int)


## Jointure avec les notes

In [5]:
movies = movies.merge(ratings, on="tconst", how="inner")


## Filtres

In [6]:
movies = movies[
    (movies["numVotes"] >= 1000) &
    (movies["startYear"] >= 1980)
]


## Réduction de volumétrie (~10 000 films)

In [7]:
movies = movies.sort_values("numVotes", ascending=False).head(10000)


## Nettoyage final

In [8]:
movies = movies[[
    "tconst",
    "primaryTitle",
    "startYear",
    "runtimeMinutes",
    "genres",
    "averageRating",
    "numVotes"
]]

movies = movies.reset_index(drop=True)


## Export CSV

In [12]:
movies.to_csv("../data/movies_clean.csv", index=False)


OSError: Cannot save file into a non-existent directory: '..\data'

## Vérification rapide

In [11]:
movies.head()


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0111161,The Shawshank Redemption,1994,142,Drama,9.3,3128869
1,tt0468569,The Dark Knight,2008,152,"Action,Crime,Drama",9.1,3104589
2,tt1375666,Inception,2010,148,"Action,Adventure,Sci-Fi",8.8,2758879
3,tt0137523,Fight Club,1999,139,"Crime,Drama,Thriller",8.8,2543645
4,tt0109830,Forrest Gump,1994,142,"Drama,Romance",8.8,2445238
