## Methode definieren um zu testen

## Dateien laden

In [1]:
import pandas as pd
import requests
from io import BytesIO

def load_csv_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        return pd.read_csv(BytesIO(response.content))
    else:
        raise Exception(f"Download fehlgeschlagen: {url} – Status: {response.status_code}")

# SwitchDrive Download-Links
url_imdb_tmdb   = "https://drive.switch.ch/index.php/s/GknMWjEvz9VhuN4/download"
url_tmdb_credits = "https://drive.switch.ch/index.php/s/j36PM3I1C0FaX3C/download"
url_tmdb_movies  = "https://drive.switch.ch/index.php/s/SgdbbF6MkF0fTly/download"

# Lade die CSVs
df_imdb_tmdb   = load_csv_from_url(url_imdb_tmdb)
df_credits     = load_csv_from_url(url_tmdb_credits)
df_movies      = load_csv_from_url(url_tmdb_movies)

# Übersicht
print("IMDB+TMDB Combined:", df_imdb_tmdb.shape)
print("TMDB Credits:", df_credits.shape)
print("TMDB Movies:", df_movies.shape)

: 

## Merge

In [None]:
# Sicherstellen, dass die IDs als Integer vorliegen
df_movies['id'] = df_movies['id'].astype(int)
df_credits['movie_id'] = df_credits['movie_id'].astype(int)

# Merge über TMDB Movie ID
df_tmdb_merged = pd.merge(df_movies, df_credits, how='inner', left_on='id', right_on='movie_id')

print("TMDB Combined:", df_tmdb_merged.shape)

TMDB Combined: (4803, 24)


In [None]:
print(df_tmdb_merged.columns)
print(df_imdb_tmdb.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')
Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'production_companies',
       'production_countries', 'spoken_languages', 'keywords', 'release_year',
       'Director', 'AverageRating', 'Poster_Link', 'Certificate',
       'IMDB_Rating', 'Meta_score', 'Star1', 'Star2', 'Star3', 'Star4',
       'Writer', 'Director_of_Photography', 'Producers', 'Music_Composer',
       'genres_list', 'Cast_list'

In [None]:
# Auch hier sicherstellen, dass IDs Integer sind
df_tmdb_merged['id'] = df_tmdb_merged['id'].astype(int)
df_imdb_tmdb['id'] = df_imdb_tmdb['id'].astype(int)

# Merge über TMDB id
df_final = pd.merge(df_tmdb_merged, df_imdb_tmdb, how='inner', on='id')
print("Final Shape:", df_final.shape)

Final Shape: (4796, 65)


In [None]:
print(df_final.columns.tolist())

['budget_x', 'genres', 'homepage_x', 'id', 'keywords_x', 'original_language_x', 'original_title_x', 'overview_x', 'popularity_x', 'production_companies_x', 'production_countries_x', 'release_date_x', 'revenue_x', 'runtime_x', 'spoken_languages_x', 'status_x', 'tagline_x', 'title_x', 'vote_average_x', 'vote_count_x', 'movie_id', 'title_y', 'cast', 'crew', 'title', 'vote_average_y', 'vote_count_y', 'status_y', 'release_date_y', 'revenue_y', 'runtime_y', 'adult', 'backdrop_path', 'budget_y', 'homepage_y', 'imdb_id', 'original_language_y', 'original_title_y', 'overview_y', 'popularity_y', 'poster_path', 'tagline_y', 'production_companies_y', 'production_countries_y', 'spoken_languages_y', 'keywords_y', 'release_year', 'Director', 'AverageRating', 'Poster_Link', 'Certificate', 'IMDB_Rating', 'Meta_score', 'Star1', 'Star2', 'Star3', 'Star4', 'Writer', 'Director_of_Photography', 'Producers', 'Music_Composer', 'genres_list', 'Cast_list', 'overview_sentiment', 'all_combined_keywords']


In [None]:
# Alle Spalten mit _y löschen
df_final = df_final.drop(columns=[col for col in df_final.columns if col.endswith('_y')])

# Alle Spalten mit _x in umbenennen
df_final = df_final.rename(columns={col: col.replace('_x', '') for col in df_final.columns if col.endswith('_x')})

In [None]:
print("Spaltenübersicht nach _y/_x Bereinigung:")
print(df_final.columns.tolist())

Spaltenübersicht nach _y/_x Bereinigung:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'movie_id', 'cast', 'crew', 'title', 'adult', 'backdrop_path', 'imdb_id', 'poster_path', 'release_year', 'Director', 'AverageRating', 'Poster_Link', 'Certificate', 'IMDB_Rating', 'Meta_score', 'Star1', 'Star2', 'Star3', 'Star4', 'Writer', 'Director_of_Photography', 'Producers', 'Music_Composer', 'genres_list', 'Cast_list', 'overview_sentiment', 'all_combined_keywords']


In [None]:
features_base = ['budget', 'popularity']
df_results = evaluate_models(df_final, features_base)
print("Ergebnis nach Basis-Features:")
print(df_results)

In [None]:
# Nur Filme mit gültigem Umsatz und Budget
df_final_clean = df_final[(df_final['revenue'] > 0) & (df_final['budget'] > 0)].copy()

# Umbenennen für Klarheit
df_final_clean = df_final_clean.rename(columns={
    'revenue': 'revenue',
    'budget': 'budget'
})

print("Shape nach Bereinigung:", df_final_clean.shape)

Shape nach Bereinigung: (3229, 47)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

def evaluate_models(df, features, target='revenue'):
    df = df[features + [target]].dropna()
    X = df[features]
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = []

    for name, model in [("Linear Regression", LinearRegression()),
                        ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42))]:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        results.append({
            'Model': name,
            'Features': len(features),
            'R²': round(r2, 3),
            'MAE (Mio $)': round(mae / 1_000_000, 2),
            'RMSE (Mio $)': round(rmse / 1_000_000, 2)
        })

    return pd.DataFrame(results)

In [None]:
features_base = ['budget', 'popularity']
df_results = evaluate_models(df_final_clean, features_base)
print("Ergebnis nach Basis-Feature-Bereinigung:")
print(df_results)

In [None]:
df_final_clean = df_final_clean.rename(columns={'title_x': 'title'})
df_final_clean = df_final_clean.drop(columns=['title']) 

## Feature engineering

### Schauspieler extrahieren

In [None]:
import ast

# Konvertiere JSON-ähnliche Strings zu Python-Listen
df_final_clean['cast'] = df_final_clean['cast'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

# Extrahiere den Namen der ersten Person im Cast
df_final_clean['lead_actor'] = df_final_clean['cast'].apply(lambda x: x[0]['name'] if len(x) > 0 and 'name' in x[0] else None)

In [None]:
print(df_final_clean[['original_title', 'lead_actor']].head(10))

                             original_title         lead_actor
0                                    Avatar    Sam Worthington
1  Pirates of the Caribbean: At World's End        Johnny Depp
2                                   Spectre       Daniel Craig
3                     The Dark Knight Rises     Christian Bale
4                               John Carter      Taylor Kitsch
5                              Spider-Man 3      Tobey Maguire
6                                   Tangled       Zachary Levi
7                   Avengers: Age of Ultron  Robert Downey Jr.
8    Harry Potter and the Half-Blood Prince   Daniel Radcliffe
9        Batman v Superman: Dawn of Justice        Ben Affleck


In [None]:
features_with_actor = ['budget', 'popularity', 'lead_actor']
df_results = evaluate_models(df_final_clean, features_with_actor)
print("Ergebnis nach lead_actor:")
print(df_results)

#### Wie oft ist der Schauspieler vorhanden (= lead_actor_freq)

In [None]:
actor_freq = df_final_clean['lead_actor'].value_counts()
df_final_clean['lead_actor_freq'] = df_final_clean['lead_actor'].map(actor_freq)

In [None]:
features_with_actor = ['budget', 'popularity', 'lead_actor', 'lead_actor_freq']
df_results = evaluate_models(df_final_clean, features_with_actor)
print("Ergebnis nach lead_actor_freq:")
print(df_results)

### Genre extrahieren

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# 'genres' von String zu Liste konvertieren
df_final_clean['genres'] = df_final_clean['genres'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

df_final_clean['genre_names'] = df_final_clean['genres'].apply(lambda genre_list: [g['name'] for g in genre_list])

mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df_final_clean['genre_names']),
                              columns=mlb.classes_,
                              index=df_final_clean.index)

# Dummies zum DataFrame hinzufügen
df_final_clean = pd.concat([df_final_clean, genre_dummies], axis=1)

In [None]:
print(genre_dummies.head())
print("Neue Spalten:", genre_dummies.columns.tolist())

   Action  Adventure  Animation  Comedy  Crime  Documentary  Drama  Family  \
0       1          1          0       0      0            0      0       0   
1       1          1          0       0      0            0      0       0   
2       1          1          0       0      1            0      0       0   
3       1          0          0       0      1            0      1       0   
4       1          1          0       0      0            0      0       0   

   Fantasy  Foreign  History  Horror  Music  Mystery  Romance  \
0        1        0        0       0      0        0        0   
1        1        0        0       0      0        0        0   
2        0        0        0       0      0        0        0   
3        0        0        0       0      0        0        0   
4        0        0        0       0      0        0        0   

   Science Fiction  Thriller  War  Western  
0                1         0    0        0  
1                0         0    0        0  
2    

In [None]:
genre_features = ['Action', 'Drama', 'Comedy', 'Thriller']
features_with_genres = ['budget', 'popularity', 'lead_actor', 'lead_actor_freq'] + genre_features
df_results = evaluate_models(df_final_clean, features_with_genres)
print("Ergebnis nach Genre-Encoding:")
print(df_results)

### weitere Features

In [None]:
# Durchschnittlicher Revenue pro Director
director_avg_revenue = df_final_clean.groupby('Director')['revenue'].mean()

# Neue Spalte hinzufügen (nicht ersetzen)
df_final_clean['director_avg_revenue'] = df_final_clean['Director'].map(director_avg_revenue)

In [None]:
features_with_director = features_with_genres + ['director_avg_revenue']
df_results = evaluate_models(df_final_clean, features_with_director)
print("Ergebnis nach Hinzufügen von director_avg_revenue:")
print(df_results)

In [None]:
# Durchschnittlicher Revenue pro Hauptdarsteller
actor_avg_revenue = df_final_clean.groupby('lead_actor')['revenue'].mean()

# Neue Spalte hinzufügen
df_final_clean['lead_actor_avg_revenue'] = df_final_clean['lead_actor'].map(actor_avg_revenue)

In [None]:
features_with_actor = features_with_director + ['lead_actor_avg_revenue']
df_results = evaluate_models(df_final_clean, features_with_actor)
print("Ergebnis nach Hinzufügen von lead_actor_avg_revenue:")
print(df_results)

In [None]:
print(df_final_clean[['original_title', 'Director', 'director_avg_revenue', 'lead_actor', 'lead_actor_avg_revenue']].head())

                             original_title                      Director  \
0                                    Avatar                 James Cameron   
1  Pirates of the Caribbean: At World's End                  George Lucas   
2                                   Spectre  Danny Boyle, Loveleen Tandan   
3                     The Dark Knight Rises             Christopher Nolan   
4                               John Carter                  Wes Anderson   

   director_avg_revenue       lead_actor  lead_actor_avg_revenue  
0          7.447307e+08  Sam Worthington            8.419749e+08  
1          6.374869e+08      Johnny Depp            2.430666e+08  
2          8.806746e+08     Daniel Craig            4.323967e+08  
3          6.307131e+08   Christian Bale            2.447794e+08  
4          1.065078e+08    Taylor Kitsch            2.935823e+08  


In [None]:
features = [
    'budget', 'popularity', 'vote_average', 'vote_count',
    'runtime', 'release_year',
    'director_avg_revenue', 'lead_actor_avg_revenue'
]

# + alle Genre-Spalten
genre_cols = [col for col in df_final_clean.columns if col in [
    'Action', 'Drama', 'Comedy', 'Adventure', 'Thriller', 'Science Fiction',
    'Crime', 'Romance', 'Horror', 'Animation', 'Family', 'Fantasy'
]]
features += genre_cols

# Zielvariable
target = 'revenue'

In [None]:
from sklearn.model_selection import train_test_split

df_model = df_final_clean[features + [target]].dropna()

X = df_model[features]
y = df_model[target]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(model, X_test, y_test, name):
    preds = model.predict(X_test)
    print(f"\n {name}")
    print("R²:", r2_score(y_test, preds))
    print("MAE:", mean_absolute_error(y_test, preds))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))

def evaluate_both_models(X_train, X_test, y_train, y_test):
    # Linear Regression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    evaluate_model(lr, X_test, y_test, "Linear Regression")

    # Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    evaluate_model(rf, X_test, y_test, "Random Forest")


evaluate_both_models(X_train, X_test, y_train, y_test)


 Linear Regression
R²: 0.7386154483555176
MAE: 60509322.32686698
RMSE: 116114972.22562446

 Random Forest
R²: 0.7819831929802866
MAE: 42272932.01041796
RMSE: 106045746.07819472


In [None]:
import joblib

# Speichere das trainierte Random Forest Modell
joblib.dump(rf, 'model.pkl')

# Speichere die verwendeten Feature-Namen (wichtig für Gradio)
joblib.dump(features, 'features.pkl')

print("Modell und Featureliste erfolgreich gespeichert.")

Modell und Featureliste erfolgreich gespeichert.
