In [9]:
import sqlite3
import pandas as pd

# The notebook lives in the `notebooks/` folder, so the database path is
# given relative to that folder.
DB_PATH = "../data/raw/imdb.sqlite"

# Join title metadata with ratings, keeping only rows that have a start year.
# NOTE(review): the query has no title-type filter, so the result contains
# every rated title with a start year — confirm that "Filme" in the message
# below is the intended label.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query(
        """
    SELECT
      b.tconst,
      b.primaryTitle,
      b.startYear,
      b.genres,
      r.averageRating,
      r.numVotes
    FROM basics b
    JOIN ratings r USING(tconst)
    WHERE b.startYear IS NOT NULL
    """,
        conn,
    )
finally:
    # Release the connection even when the query raises (e.g. missing table),
    # so a failed run does not leave the database file open in the kernel.
    conn.close()

print(f"{len(df):,} Filme geladen")
df.head()

1,559,610 Filme geladen


Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000001,Carmencita,1894,"Documentary,Short",5.7,2149
1,tt0000002,Le clown et ses chiens,1892,"Animation,Short",5.5,291
2,tt0000003,Poor Pierrot,1892,"Animation,Comedy,Romance",6.5,2186
3,tt0000004,Un bon bock,1892,"Animation,Short",5.3,187
4,tt0000005,Blacksmith Scene,1893,Short,6.2,2932


# Filme behalten und fehlende Jahresangaben entfernen

## Duplikate prüfen

In [4]:
# Flag every row whose tconst occurs more than once; keep=False marks all
# copies of a repeated key, not just the later ones.
is_repeated = df.duplicated(subset="tconst", keep=False)
dupes = df.loc[is_repeated]
print("Duplikate:", dupes.shape[0])

NameError: name 'df' is not defined

## Genres aufsplitten & Missing-Genres markieren

In [None]:
# Count the rows that carry no genre information at all.
missing_genre_count = df["genres"].isna().sum()
print("ohne Genre:", missing_genre_count)

# Long format for later per-genre analyses: split the comma-separated genre
# string into a list, emit one row per list entry, and drop the rows whose
# genre was missing to begin with.
genre_lists = df["genres"].str.split(",")
df_genres = df.assign(genre=genre_lists).explode("genre").dropna(subset=["genre"])

ohne Genre: 22216


## Typen prüfen & konvertieren

In [None]:
# Print the dtype of every column of df for inspection; this cell only
# reports the types and performs no conversion.
print(df.dtypes)

tconst            object
titleType         object
primaryTitle      object
startYear          int64
genres            object
averageRating    float64
numVotes           int64
dtype: object


## Ausreisser-Check

In [7]:
# Outlier check, part 1: the ten titles with the highest vote counts.
# A notebook cell renders only its final expression, so this intermediate
# result has to be shown explicitly; `display` is provided by the IPython
# kernel and needs no import inside a notebook.
top_voted = df.nlargest(10, "numVotes")[["primaryTitle", "numVotes"]]
display(top_voted)

# Part 2: every title rated above 9.5. Left as the final expression so the
# cell renders it as its output.
df[df["averageRating"] > 9.5]

Unnamed: 0,tconst,primaryTitle,startYear,genres,averageRating,numVotes
2931,tt0008863,The Battle Royal,1918,"Comedy,Short",10.0,10
38759,tt0058032,Der doppelte Nikolaus,1964,,9.7,48
38961,tt0058258,Katharina Knie - Ein Seiltänzerstück,1964,Drama,9.7,22
39604,tt0058992,Brooklyn-Ballade,1965,Drama,9.8,15
39741,tt0059144,Die eigenen vier Wände,1965,Comedy,9.6,44
...,...,...,...,...,...,...
1558468,tt9890390,Puppy Honey EP.6,2016,Romance,9.8,7
1558559,tt9892336,The Hollywood Reporter's Official Live Golden ...,2019,Talk-Show,9.9,2827
1558837,tt9898836,Perfect Game,2019,"Action,Adventure,Animation",9.8,93850
1558889,tt9899898,I Do Not!,2022,"Comedy,Short",9.6,6
