In [19]:
import pandas as pd

# Load the IMDb TSV dumps used throughout the notebook.
# Every table is read with the literal token "\N" mapped to NaN; title.ratings
# previously used "c\\N" (a typo), which would have left "\N" cells as strings.
names = pd.read_csv("data/name.basics.tsv", sep="\t", na_values="\\N", low_memory=False)
titles = pd.read_csv("data/title.basics.tsv", sep="\t", na_values="\\N", low_memory=False)
ratings = pd.read_csv("data/title.ratings.tsv", sep="\t", na_values="\\N", low_memory=False)
crew = pd.read_csv("data/title.crew.tsv", sep="\t", na_values="\\N", low_memory=False)
episodes = pd.read_csv("data/title.episode.tsv", sep="\t", na_values="\\N", low_memory=False)

In [20]:
# Q1 - Total number of people in the dataset
# The answer is the row count of the name.basics frame.
total_personnes = names.shape[0]
total_personnes


14894780

### Q1 – Total number of people

Total number of people in the dataset: **14,894,780**


In [21]:
import pandas as pd
import datetime as dt

# Step 1: numeric view of birthYear (values that cannot be parsed are coerced)
birth_years = pd.to_numeric(names["birthYear"], errors="coerce")

# Step 2: smallest birth year present in the column
earliest_year = int(birth_years.min())
print("Raw minimum birth year in the file:", earliest_year)

# Step 3: rows whose birth year equals that minimum, with a few descriptive columns
is_earliest = birth_years.eq(earliest_year)
summary_columns = ["primaryName", "birthYear", "deathYear", "primaryProfession"]
personnes_earliest = names.loc[is_earliest, summary_columns]
personnes_earliest.head()


Raw minimum birth year in the file: 4


Unnamed: 0,primaryName,birthYear,deathYear,primaryProfession
737975,Lucio Anneo Seneca,4.0,65.0,writer


In [22]:
# Numeric view of birthYear; anything that cannot be parsed is coerced
birth_years = pd.to_numeric(names["birthYear"], errors="coerce")

# Sort the known years ascending, de-duplicate, and keep the first ten
sorted_known_years = birth_years.dropna().sort_values()
small_years = sorted_known_years.unique()[:10]

small_years



array([ 4., 20., 37., 43., 46., 59., 61., 69., 70., 95.])

In [23]:
# Restrict to birth years from 1800 onwards, then take the smallest remaining one
from_1800_onwards = birth_years.ge(1800)
birth_years_clean = birth_years[from_1800_onwards]
earliest_realistic_year = int(birth_years_clean.min())
earliest_realistic_year


1800

In [24]:
import datetime as dt
# Years elapsed between the earliest realistic birth year and the current year
current_year = dt.date.today().year
age_earliest_realistic = current_year - earliest_realistic_year
(current_year, earliest_realistic_year, age_earliest_realistic)


(2025, 1800, 225)

### Q2 – Earliest year of birth

By directly analyzing the `birthYear` column, the minimum year found is **4**.  
This would mean that someone was born in the year 0004, which is clearly unrealistic for anyone working in the modern film industry.  
For this analysis it is therefore treated as an **outlier**: either an erroneous value or a historical author credited for adaptations of their work.

By restricting the analysis to years of birth **≥ 1800**, the earliest year becomes **1800**.

### Q3 – How many years ago was this person born?

Using the current year (**2025**), this person would have been born approximately **225 years ago**.



In [25]:
import pandas as pd

# Recompute birth years from the raw column (the >= 1800 filter is not applied here)
birth_years_raw = pd.to_numeric(names["birthYear"], errors="coerce")
earliest_raw_year = int(birth_years_raw.min())
print("Raw minimum birth year:", earliest_raw_year)

# Everyone sharing that minimum birth year, with identifying columns
born_earliest = birth_years_raw.eq(earliest_raw_year)
detail_columns = [
    "nconst", "primaryName", "birthYear", "deathYear",
    "primaryProfession", "knownForTitles",
]
people_raw = names.loc[born_earliest, detail_columns]
people_raw


Raw minimum birth year: 4


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
737975,nm0784172,Lucio Anneo Seneca,4.0,65.0,writer,"tt0043802,tt0218822,tt0049203,tt0972562"


In [26]:
# Distinct title IDs listed in knownForTitles for the selected person(s):
# split each comma-separated string, flatten to one ID per row, de-duplicate
known_for_lists = people_raw["knownForTitles"].dropna().str.split(",")
known_ids = known_for_lists.explode().unique()

known_ids


array(['tt0043802', 'tt0218822', 'tt0049203', 'tt0972562'], dtype=object)

In [27]:
# Store startYear as a nullable integer column; unparseable years are coerced to missing
start_year_numeric = pd.to_numeric(titles["startYear"], errors="coerce")
titles["startYear"] = start_year_numeric.astype("Int64")


In [28]:
# Look up the titles referenced in known_ids and keep the descriptive columns
title_columns = ["tconst", "primaryTitle", "titleType", "startYear", "runtimeMinutes", "genres"]
is_known_title = titles["tconst"].isin(known_ids)
titles_known = titles.loc[is_known_title, title_columns]

titles_known.head()


Unnamed: 0,tconst,primaryTitle,titleType,startYear,runtimeMinutes,genres
43026,tt0043802,The Affairs of Messalina,movie,1951,116.0,"Action,Drama,History"
48311,tt0049203,"Fedra, the Devil's Daughter",movie,1956,100.0,Drama
209696,tt0218822,Such Is Life,movie,2000,98.0,"Drama,Romance"
940992,tt0972562,Medea 2,movie,2006,,Drama


In [29]:
# Year of the earliest associated title, and the age it would imply
# for someone born in earliest_raw_year
start_years = pd.to_numeric(titles_known["startYear"], errors="coerce")
first_title_year = int(start_years.min())
age_at_first_title = first_title_year - earliest_raw_year  # earliest_raw_year = 4

(first_title_year, age_at_first_title)


(1951, 1947)

### Q4 – Is the birth date “4” correct?

When analyzing the `name.basics` table, the minimum year of birth found in the `birthYear` column is **4**.  
This value is associated in particular with the following person:

- **nconst:** nm0784172  
- **Name:** Lucio Anneo Seneca  
- **Main profession:** writer  
- **knownForTitles:** tt0043802, tt0218822, tt0049203, tt0972562  

Using these title identifiers, we consulted the `title.basics` table.  
The corresponding works have the following release years:

- *The Affairs of Messalina* – **1951**  
- *Fedra, the Devil's Daughter* – **1956**  
- *Such Is Life* – **2000**  
- *Medea 2* – **2006**

If we consider that this person was born in the year **4**, then at the time of their first listed work in **1951**,  
they would have been approximately **1951 − 4 = 1947 years old**, and over 2000 years old for the works from 2000 and 2006.

Using **only the data from the dataset** (year of birth and years of the associated works),  
we therefore see an obvious contradiction: such longevity is biologically impossible.  
(The same row also lists a `deathYear` of **65**, so these credits can only be posthumous.)  

We conclude that the birth date **“4” is an outlier/incorrect data point** in the dataset,  
and that it is more reasonable, for subsequent analyses, to consider only “realistic” birth years
(for example, from 1800 onwards).


In [30]:
# Q5 - Most recent year of birth
import datetime as dt
current_year = dt.date.today().year

# Discard birth years lying in the future before taking the maximum
not_in_future = birth_years.le(current_year)
latest_year = int(birth_years[not_in_future].max())
latest_year


2025

### Q5 – Most recent year of birth

The most recent year of birth in the dataset (excluding future years) is **2025**.


In [31]:
# Q6 - Percentage of people without a birth year

birth_years = pd.to_numeric(names["birthYear"], errors="coerce")

# Count the missing entries and express them as a share of all rows
missing_birth_year = birth_years.isna()
nb_total = birth_years.shape[0]
nb_sans_naissance = missing_birth_year.sum()
pourcentage_sans_naissance = nb_sans_naissance / nb_total * 100

print("Total number of people:", nb_total)
print("Number of people without a birth year:", nb_sans_naissance)
print(f"Percentage without a birth year: {pourcentage_sans_naissance:.2f} %")



Total number of people: 14894780
Number of people without a birth year: 14235193
Percentage without a birth year: 95.57 %


### Q6 – Percentage of people without a date of birth

From the `birthYear` column of the `name.basics` table:

- Total number of people: **14,894,780**
- Number of people without a year of birth entered: **14,235,193**
- The percentage of people **without a specified year of birth** is therefore approximately **95.57%**


In [35]:
import pandas as pd

# Add numeric helper columns for year and runtime, in this order:
# startYear -> startYear_num, runtimeMinutes -> runtime_num
numeric_sources = {"startYear_num": "startYear", "runtime_num": "runtimeMinutes"}
for new_col, source_col in numeric_sources.items():
    titles[new_col] = pd.to_numeric(titles[source_col], errors="coerce")

print("startYear_num and runtime_num columns created.")


startYear_num and runtime_num columns created.


In [36]:
# Q7 - Length of the longest "short" after 1900 (clean display)

# Row mask: shorts released after 1900 with a known, strictly positive runtime
is_short = titles["titleType"].eq("short")
short_after_1900 = titles["startYear_num"].gt(1900)
short_runtime_known = titles["runtime_num"].notna()
short_runtime_positive = titles["runtime_num"].gt(0)
shorts = titles[is_short & short_after_1900 & short_runtime_known & short_runtime_positive]

# Row carrying the largest runtime among those shorts
idx_longest_short = shorts["runtime_num"].idxmax()
short_columns = ["tconst", "primaryTitle", "startYear_num", "runtime_num", "genres"]
longest_short = shorts.loc[idx_longest_short, short_columns]

# Pull out scalar values; year and runtime are cast to int to drop decimals
tconst_court = longest_short["tconst"]
titre_court = longest_short["primaryTitle"]
annee_court = int(longest_short["startYear_num"])
duree_minutes = int(longest_short["runtime_num"])
genres_court = longest_short["genres"]

# Report
print("=== Q7 - Longest 'short' after 1900 ===")
print(f"ID (tconst): {tconst_court}")
print(f"Title      : {titre_court}")
print(f"Year       : {annee_court}")
print(f"Duration   : {duree_minutes} minutes")
print(f"Genres     : {genres_court}")


=== Q7 - Longest 'short' after 1900 ===
ID (tconst): tt35509411
Title      : Our First Day
Year       : 2025
Duration   : 1311 minutes
Genres     : Drama,Short


### Q7 – Length of the longest “short” after 1900

After filtering on titles of type `short` with `startYear > 1900`, the title with the maximum
`runtimeMinutes` is:

- **Title:** Our First Day  
- **Release year:** 2025  
- **Announced runtime:** 1311 minutes (≈ 21 hours 51 minutes)  
- **Genres:** Drama, Short  

This value is probably an outlier for a short film.  
However, based strictly on the data provided by IMDb, the length of the longest
“short” after 1900 is therefore **1311 minutes**.


In [37]:
# Q8 - Duration of the shortest "movie" after 1900 (clean display)

# Row mask: movies released after 1900 with a known, strictly positive runtime
is_movie_type = titles["titleType"].eq("movie")
movie_after_1900 = titles["startYear_num"].gt(1900)
movie_runtime_known = titles["runtime_num"].notna()
movie_runtime_positive = titles["runtime_num"].gt(0)
movies = titles[is_movie_type & movie_after_1900 & movie_runtime_known & movie_runtime_positive]

# Row carrying the smallest runtime among those movies
idx_shortest_movie = movies["runtime_num"].idxmin()
movie_columns = ["tconst", "primaryTitle", "startYear_num", "runtime_num", "genres"]
shortest_movie = movies.loc[idx_shortest_movie, movie_columns]

# Pull out scalar values; year and runtime are cast to int to drop decimals
tconst_movie = shortest_movie["tconst"]
titre_movie = shortest_movie["primaryTitle"]
annee_movie = int(shortest_movie["startYear_num"])
duree_movie = int(shortest_movie["runtime_num"])
genres_movie = shortest_movie["genres"]

# Report
print("=== Q8 - Shortest 'movie' after 1900 ===")
print(f"ID (tconst): {tconst_movie}")
print(f"Title      : {titre_movie}")
print(f"Year       : {annee_movie}")
print(f"Duration   : {duree_movie} minutes")
print(f"Genres     : {genres_movie}")



=== Q8 - Shortest 'movie' after 1900 ===
ID (tconst): tt0025166
Title      : George White's Scandals
Year       : 1934
Duration   : 1 minutes
Genres     : Comedy,Musical,Romance


### Q8 – Length of the shortest “film” after 1900

By filtering the `title.basics` table on:

- `titleType = “movie”`
- `startYear > 1900`
- a known and strictly positive `runtimeMinutes` duration,

then searching for the minimum duration, we obtain:

- **Title:** George White's Scandals
- **Year of release:** 1934
- **Duration:** 1 min

The duration of the shortest “film” after 1900 is therefore **1 minute**.

In [38]:
# Q9 - List of all genres represented

# Split every non-missing comma-separated genres string into a list
genres_series = titles["genres"].dropna().str.split(",")

# Flatten to one genre per row, drop any missing entries, de-duplicate
exploded_genres = genres_series.explode()
all_genres = exploded_genres.dropna().unique()

# Alphabetical order for display
all_genres_sorted = sorted(all_genres)

print("=== Q9 - Genres represented in the dataset ===")
print("Total number of distinct genres:", len(all_genres_sorted))
print("List of genres:")
print(", ".join(all_genres_sorted))


=== Q9 - Genres represented in the dataset ===
Total number of distinct genres: 28
List of genres:
Action, Adult, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, Game-Show, History, Horror, Music, Musical, Mystery, News, Reality-TV, Romance, Sci-Fi, Short, Sport, Talk-Show, Thriller, War, Western


### Q9 – List of all genres represented

By analyzing the `genres` column of the `title.basics` table and separating multiple values
(e.g., `“Action,Comedy”`), we obtain:

- **Total number of distinct genres:** 28
- **Genres:** Action, Adult, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, Game-Show, History, Horror, Music, Musical, Mystery, News, Reality-TV, Romance, Sci-Fi, Short, Sport, Talk-Show, Thriller, War, Western

In [39]:
# Q10 - Best comedy movie

# Step 1: titles of type "movie" whose genres string contains "Comedy"
is_movie = titles["titleType"] == "movie"
has_genres = titles["genres"].notna()
mentions_comedy = titles["genres"].str.contains("Comedy")
comedy_movies = titles[is_movie & has_genres & mentions_comedy]

# Step 2: attach ratings; the inner join keeps only rated movies
comedy_rated = pd.merge(comedy_movies, ratings, on="tconst", how="inner")

if not comedy_rated.empty:
    # Step 3: all movies sharing the highest average rating
    max_rating = comedy_rated["averageRating"].max()
    best_comedies = comedy_rated[comedy_rated["averageRating"] == max_rating]

    # Step 4: break ties by taking the movie with the most votes
    best_comedy = best_comedies.sort_values("numVotes", ascending=False).iloc[0]

    # Pull out the fields to report
    best_tconst = best_comedy["tconst"]
    best_title = best_comedy["primaryTitle"]
    if pd.notna(best_comedy["startYear"]):
        best_year = int(pd.to_numeric(best_comedy["startYear"], errors="coerce"))
    else:
        best_year = None
    best_genres = best_comedy["genres"]
    best_rating = best_comedy["averageRating"]
    best_votes = int(best_comedy["numVotes"])

    print("=== Q10 - Best comedy movie ===")
    print(f"ID (tconst): {best_tconst}")
    print(f"Title      : {best_title}")
    print(f"Year       : {best_year}")
    print(f"Genres     : {best_genres}")
    print(f"Rating     : {best_rating}")
    print(f"Votes      : {best_votes}")
else:
    # Nothing survived the filter and join
    print("No comedy movie found after filtering.")


=== Q10 - Best comedy movie ===
ID (tconst): tt1423343
Title      : Bob vs. Society
Year       : 2009
Genres     : Comedy
Rating     : 10.0
Votes      : 12


In [41]:
# Q10 - Director(s) of the best comedy movie

# best_tconst is expected to have been set by the "Best comedy movie" cell
crew_best = crew[crew["tconst"] == best_tconst]

directors_names = []

if crew_best.empty:
    print("No entry found in 'crew' for this tconst.")
else:
    directors_field = crew_best.iloc[0]["directors"]  # string of nconst values separated by commas
    if pd.isna(directors_field):
        print("No director is specified for this movie.")
    else:
        director_ids = directors_field.split(",")
        directors = names[names["nconst"].isin(director_ids)]

        # isin() returns rows in name.basics order, not in the order the IDs
        # appear in the crew field. Map ID -> name and walk director_ids so the
        # "Director i" numbering follows the crew listing.
        name_by_id = dict(zip(directors["nconst"], directors["primaryName"]))
        directors_names = [
            name_by_id[nconst] for nconst in director_ids if nconst in name_by_id
        ]

        print("=== Director(s) of the best comedy movie ===")
        if directors_names:
            for i, d in enumerate(directors_names, start=1):
                print(f"- Director {i}: {d}")
        else:
            print("No matching name found in name.basics.")


=== Director(s) of the best comedy movie ===
- Director 1: David Pring-Mill


In [42]:
# Q10 - Alternative titles of the best comedy movie
# title.akas.tsv is read chunk by chunk and only the rows for best_tconst are kept

aka_path = "data/title.akas.tsv"

akas_for_best_list = []

try:
    # chunksize makes read_csv return an iterator of DataFrames;
    # lower it (100000 or 50000) if memory is still an issue
    aka_chunks = pd.read_csv(
        aka_path,
        sep="\t",
        na_values="\\N",
        low_memory=False,
        chunksize=200000,
    )

    for chunk in aka_chunks:
        subset = chunk[chunk["titleId"].eq(best_tconst)]
        if subset.empty:
            continue
        akas_for_best_list.append(subset)

    if not akas_for_best_list:
        print("No alternative titles found for this movie in title.akas.tsv.")
    else:
        akas_for_best = pd.concat(akas_for_best_list, ignore_index=True)

        # Show only the columns of interest that are actually present
        cols = ["titleId", "ordering", "title", "region", "language", "isOriginalTitle"]
        cols_existantes = [c for c in cols if c in akas_for_best.columns]

        print("=== Alternative titles of the best comedy movie ===")
        display(akas_for_best[cols_existantes].head(20))  # first 20 rows at most

except FileNotFoundError:
    print("The file 'data/title.akas.tsv' was not found. Please check the path.")


=== Alternative titles of the best comedy movie ===


Unnamed: 0,titleId,ordering,title,region,language,isOriginalTitle
0,tt1423343,1,Bob vs. Society,,,1
1,tt1423343,2,Bob vs. Society,GB,,0
2,tt1423343,3,Bob vs. Society,US,,0


The main alternative titles for the film **Bob vs. Society** (tt1423343) are:

- Bob vs. Society – original title (no specific region)
- Bob vs. Society – title used for the GB region (United Kingdom)
- Bob vs. Society – title used for the US region (United States)
