In [1]:
import sqlite3
import pandas as pd

# The notebook lives in `notebooks/`, so the database path is relative to it.
DB_PATH = "../data/raw/imdb.sqlite"

# Load every title that has both a rating row and a known start year.
# The inner JOIN drops titles without ratings; the WHERE clause drops rows
# whose startYear is NULL.
# NOTE(review): there is no titleType filter here -- presumably `basics`
# already contains only movies; confirm against the import step.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(
        """
        SELECT
          b.tconst,
          b.primaryTitle,
          b.startYear,
          b.genres,
          r.averageRating,
          r.numVotes
        FROM basics b
        JOIN ratings r USING(tconst)
        WHERE b.startYear IS NOT NULL
        """,
        conn,
    )
finally:
    # Close the connection even if the query fails; otherwise the handle
    # stays open for the lifetime of the kernel.
    conn.close()

print(f"{len(df):,} Filme geladen")
df.head()

330,102 Filme geladen


Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000009,Miss Jerry,1894,Romance,5.4,224
1,tt0000147,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport",5.3,558
2,tt0000502,Bohemios,1905,,3.8,21
3,tt0000574,The Story of the Kelly Gang,1906,"Action,Adventure,Biography",6.0,986
4,tt0000591,The Prodigal Son,1907,Drama,5.6,31


# Filme behalten und fehlende Jahresangaben entfernen

## Duplikate prüfen

In [2]:
# Flag every row whose tconst appears more than once; keep=False marks all
# occurrences, not just the repeats after the first one.
is_repeated_id = df.duplicated(subset="tconst", keep=False)
dupes = df.loc[is_repeated_id]
print("Duplikate:", len(dupes))

Duplikate: 0


## Genres aufsplitten & Missing-Genres markieren

In [3]:
# Number of titles whose genres field is missing.
n_without_genre = df["genres"].isna().sum()
print("ohne Genre:", n_without_genre)

# Long format for later genre analyses: split the comma-separated genres
# string into a list, emit one row per list entry, and drop the rows that
# had no genres to begin with.
df_genres = (
    df.assign(genre=lambda frame: frame["genres"].str.split(","))
      .explode("genre")
      .dropna(subset=["genre"])
)

ohne Genre: 11088


## Typen prüfen & konvertieren

In [4]:
# Show the dtype pandas inferred for each loaded column.
# NOTE: the recorded output lists startYear as object, so it still needs a
# numeric conversion before any year arithmetic or sorting by year.
column_dtypes = df.dtypes
print(column_dtypes)

tconst            object
primaryTitle      object
startYear         object
genres            object
averageRating    float64
numVotes           int64
dtype: object


## Ausreisser-Check

In [5]:
# Ten titles with the most votes. A notebook cell renders only its last bare
# expression, so this table has to be shown explicitly with display();
# otherwise it is computed and silently thrown away.
display(df.nlargest(10, "numVotes")[["primaryTitle", "numVotes"]])

# Titles with an average rating above 9.5. Left as the final expression so
# it is rendered as the cell's output.
df[df["averageRating"] > 9.5]

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
65705,tt0117319,Pepper's Pow Wow,1996,Documentary,9.6,8
70299,tt0128058,Au-delà de 120 ans avec Jeanne Calment,1995,Documentary,9.8,8
77690,tt0156768,Manik Raitong,1984,,9.8,8
82118,tt0173749,Csonka Bereg,1989,Documentary,9.8,6
86869,tt0187784,Chronique paysanne en Gruyère,1991,Documentary,9.6,5
...,...,...,...,...,...,...
328839,tt9686178,The Best Years Ever,1994,Documentary,9.6,108
328903,tt9695398,Six Locked Doors: The Legacy of Cocoanut Grove,2021,"Documentary,History",9.7,11
329062,tt9731750,Die Abrissbirnen,2019,"Comedy,Drama",9.7,8
329129,tt9742160,Finite Water,2019,Documentary,9.7,17
