### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import ast
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import joblib
import os
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

### Datensatz ML Vorbereitungen
- **Dateneinlesen**
- **Vorschau anzeigen**
- **Target-Definition**
- **Feature Engineering**
- **Feature-Auswahl**
- **Champion-Rollen ergänzen**
- **Summoner-Spells als Features**
- **Rollenkombinationen je Team**
- **One-Hot-Encoding & Skalierung**
- **Train/Test-Split**
- **Leckage-Policy festlegen**

### Daten einlesen

In [2]:
games = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/games2.csv")

In [3]:
champ1 = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/champion1.0.csv")

In [4]:
champ2 = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/champion2.0.csv")

In [5]:
spells = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/Summoner_spell2.csv")

In [6]:
# Dauer in Minuten berechnen (Zielwert erstellen)
games["duration_min"] = games["gameDuration"] / 60

# Eingabe- und Zielvariablen definieren
X = games[[
    "t1_towerKills", "t2_towerKills",
    "t1_dragonKills", "t2_dragonKills",
    "t1_baronKills", "t2_baronKills"
]]
y = games["duration_min"]

### Vorschau anzeigen

In [7]:
print("games shape:", games.shape)
display(games.head(2))

games shape: (51053, 69)


Unnamed: 0,gameId,creationTime,gameDuration,seasonId,winner,firstBlood,firstTower,firstInhibitor,firstBaron,firstDragon,...,t2_ban4,t2_ban5,creationTime_dt,auffällig_duration,auffällig_winner,auffällig_season,konsistenz_winner,t1_minus1_bans,t2_minus1_bans,duration_min
0,3326086514,1504279457970,1949,9,1,2,1,1,1,1,...,16,51,2017-09-01 15:24:17.970,False,False,False,True,0,0,32.483333
1,3229566029,1497848803862,1851,9,1,1,1,1,0,1,...,51,420,2017-06-19 05:06:43.862,False,False,False,True,0,0,30.85


In [8]:
print("champ1 shape:", champ1.shape)
display(champ1.head(2))

champ1 shape: (138, 7)


Unnamed: 0,title,id,key,name,version,name_lower,key_lower
0,the Dark Child,1,Annie,Annie,7.17.2,annie,annie
1,the Berserker,2,Olaf,Olaf,7.17.2,olaf,olaf


In [9]:
print("champ2 shape:", champ2.shape)
display(champ2.head(2))

champ2 shape: (139, 9)


Unnamed: 0,tags,title,id,key,name,version,name_lower,key_lower,anzahl_tags
0,[],Placeholder,0,Placeholder,Placeholder,7.18.1,Placeholder,Placeholder,0
1,"['Fighter', 'Tank']",the Monkey King,62,MonkeyKing,Wukong,7.18.1,wukong,monkeyking,2


In [10]:
print("spells shape:", spells.shape)
display(spells.head(2))

spells shape: (17, 7)


Unnamed: 0,id,summonerLevel,name,key,description,version,event_modus
0,1,6,Cleanse,SummonerBoost,Removes all disables (excluding suppression an...,7.17.2,Standard
1,3,4,Exhaust,SummonerExhaust,"Exhausts target enemy champion, reducing their...",7.17.2,Standard


### Target-Definition

In [11]:
y_winner = games['winner']

In [12]:
games['duration_min'] = games['gameDuration'] / 60
games['kurzesSpiel'] = games['duration_min'] < 25

In [13]:
y_kurz = games['kurzesSpiel']

In [14]:
y_dauer = games['duration_min']

### Gegen Check (Ob alles geklappt hat)

In [15]:
games['kurzesSpiel'].head(10)

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7     True
8     True
9    False
Name: kurzesSpiel, dtype: bool

### Feature Engineering

In [16]:
games['creationTime_dt'] = pd.to_datetime(games['creationTime'])
games['weekday'] = games['creationTime_dt'].dt.weekday
games['hour'] = games['creationTime_dt'].dt.hour

def uhrzeit_kategorie(stunde):
    if 0 <= stunde < 6:
        return 'nacht'
    elif 6 <= stunde < 12:
        return 'morgen'
    elif 12 <= stunde < 18:
        return 'mittag'
    else:
        return 'abend'
games['uhrzeitKategorie'] = games['hour'].apply(uhrzeit_kategorie)

In [17]:
sum_cols = [
    't1_champ1_sum1', 't1_champ1_sum2',
    't1_champ2_sum1', 't1_champ2_sum2',
    't2_champ1_sum1', 't2_champ1_sum2',
    't2_champ2_sum1', 't2_champ2_sum2'
]
games['n_flash'] = games[sum_cols].apply(lambda row: (row == 4).sum(), axis=1)

In [18]:
games['duration_min'] = games['gameDuration'] / 60
games['kurzesSpiel'] = games['duration_min'] < 25

### Gegen Check (Ob alles geklappt hat)

In [19]:
games.columns

Index(['gameId', 'creationTime', 'gameDuration', 'seasonId', 'winner',
       'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
       'firstDragon', 'firstRiftHerald', 't1_champ1id', 't1_champ1_sum1',
       't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
       't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2', 't1_champ4id',
       't1_champ4_sum1', 't1_champ4_sum2', 't1_champ5id', 't1_champ5_sum1',
       't1_champ5_sum2', 't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
       't1_dragonKills', 't1_riftHeraldKills', 't1_ban1', 't1_ban2', 't1_ban3',
       't1_ban4', 't1_ban5', 't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
       't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2', 't2_champ3id',
       't2_champ3_sum1', 't2_champ3_sum2', 't2_champ4id', 't2_champ4_sum1',
       't2_champ4_sum2', 't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
       't2_towerKills', 't2_inhibitorKills', 't2_baronKills', 't2_dragonKills',
       't2_riftHer

### Feature-Auswahl

In [20]:
X_winner = games[[  
    't1_champ1id', 't1_champ2id', 't1_champ3id',  
    't2_champ1id', 't2_champ2id', 't2_champ3id',  
    'seasonId', 'firstBaron', 'firstTower',  
    'duration_min'
]]

In [21]:
X_kurz = games[[
    't1_champ1id', 't2_champ1id',
    'weekday', 'uhrzeitKategorie', 'seasonId'
]]

In [22]:
X_dauer = games[[ 
    't1_champ1id', 
    't2_champ1id', 
    'seasonId', 
    'weekday', 
    'duration_min'
]]

In [23]:
y_kurz = games['kurzesSpiel']

In [24]:
X_tower = games[
    ['t1_champ1id', 't2_champ1id', 'firstBlood', 'firstDragon', 'seasonId', 'weekday']
]

y_tower = (games['firstTower'] == 1).astype(int)

In [25]:
games['firstObjectiveTeam1'] = (
    (games['firstTower'] == 1) |
    (games['firstBaron'] == 1) |
    (games['firstInhibitor'] == 1)
).astype(int)

X_obj = games[
    ['t1_champ1id', 't2_champ1id', 'seasonId', 'weekday', 'firstBlood', 'firstDragon']
]

y_obj = games['firstObjectiveTeam1']

### Gegen Check (Ob alles geklappt hat)

In [26]:
print(X_winner.columns.tolist())
print(X_kurz.columns.tolist())
print(X_dauer.columns.tolist())

['t1_champ1id', 't1_champ2id', 't1_champ3id', 't2_champ1id', 't2_champ2id', 't2_champ3id', 'seasonId', 'firstBaron', 'firstTower', 'duration_min']
['t1_champ1id', 't2_champ1id', 'weekday', 'uhrzeitKategorie', 'seasonId']
['t1_champ1id', 't2_champ1id', 'seasonId', 'weekday', 'duration_min']


### Champion-Rollen

In [27]:
champ2 = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/champion2.0.csv")

In [28]:
champ2[['role1', 'role2']] = champ2['tags'].apply(
    lambda x: pd.Series(ast.literal_eval(x)) if pd.notnull(x) else pd.Series(['Unknown', 'Unknown'])
)

In [29]:
df = games.copy()

for i in range(1, 6):
    df = df.merge(
        champ2[['id', 'role1']],
        left_on=f't1_champ{i}id',
        right_on='id',
        how='left'
    ).rename(columns={'role1': f't1_champ{i}_role'}).drop(columns=['id'])

In [30]:
for i in range(1, 6):
    df = df.merge(
        champ2[['id', 'role1']],
        left_on=f't2_champ{i}id',
        right_on='id',
        how='left'
    ).rename(columns={'role1': f't2_champ{i}_role'}).drop(columns=['id'])

games = df

In [31]:
rolle_spalten = [col for col in df.columns if '_role' in col]

### Gegen Check (Ob alles geklappt hat)

In [32]:
df[rolle_spalten].isnull().sum()

t1_champ1_role    0
t1_champ2_role    0
t1_champ3_role    0
t1_champ4_role    0
t1_champ5_role    0
t2_champ1_role    0
t2_champ2_role    0
t2_champ3_role    0
t2_champ4_role    0
t2_champ5_role    0
dtype: int64

In [33]:
df[rolle_spalten].head(10)

Unnamed: 0,t1_champ1_role,t1_champ2_role,t1_champ3_role,t1_champ4_role,t1_champ5_role,t2_champ1_role,t2_champ2_role,t2_champ3_role,t2_champ4_role,t2_champ5_role
0,Mage,Support,Marksman,Assassin,Mage,Marksman,Marksman,Fighter,Assassin,Support
1,Marksman,Fighter,Assassin,Fighter,Assassin,Tank,Mage,Fighter,Fighter,Fighter
2,Marksman,Fighter,Support,Fighter,Assassin,Mage,Support,Fighter,Fighter,Marksman
3,Tank,Mage,Marksman,Mage,Fighter,Mage,Fighter,Support,Fighter,Marksman
4,Fighter,Marksman,Support,Marksman,Mage,Support,Tank,Fighter,Assassin,Marksman
5,Support,Fighter,Fighter,Mage,Marksman,Fighter,Marksman,Assassin,Fighter,Tank
6,Mage,Marksman,Support,Marksman,Tank,Tank,Marksman,Fighter,Mage,Fighter
7,Fighter,Marksman,Support,Mage,Fighter,Mage,Fighter,Support,Marksman,Tank
8,Tank,Tank,Marksman,Support,Mage,Marksman,Tank,Mage,Fighter,Mage
9,Support,Assassin,Fighter,Marksman,Fighter,Assassin,Marksman,Mage,Mage,Fighter


### Summoner-Spells als Features

In [34]:
spells = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/Summoner_spell2.csv")

In [35]:
anzahl_flash = (
    (games[['t1_champ1_sum1', 't1_champ1_sum2', 't2_champ1_sum1', 't2_champ1_sum2']] == 4)
    .sum(axis=1)
)

In [36]:
tracked_spells = {
    "anzahl_flash": 4,
    "anzahl_ignite": 14,
    "anzahl_exhaust": 3
}

for feature_name, spell_id in tracked_spells.items():
    games[feature_name] = games[[
        't1_champ1_sum1', 't1_champ1_sum2',
        't1_champ2_sum1', 't1_champ2_sum2',
        't1_champ3_sum1', 't1_champ3_sum2',
        't1_champ4_sum1', 't1_champ4_sum2',
        't1_champ5_sum1', 't1_champ5_sum2',
        't2_champ1_sum1', 't2_champ1_sum2',
        't2_champ2_sum1', 't2_champ2_sum2',
        't2_champ3_sum1', 't2_champ3_sum2',
        't2_champ4_sum1', 't2_champ4_sum2',
        't2_champ5_sum1', 't2_champ5_sum2'
    ]].apply(lambda row: (row == spell_id).sum(), axis=1)

### Gegen Check (Ob alles geklappt hat)

In [37]:
[col for col in games.columns if '_sum' in col]

['t1_champ1_sum1',
 't1_champ1_sum2',
 't1_champ2_sum1',
 't1_champ2_sum2',
 't1_champ3_sum1',
 't1_champ3_sum2',
 't1_champ4_sum1',
 't1_champ4_sum2',
 't1_champ5_sum1',
 't1_champ5_sum2',
 't2_champ1_sum1',
 't2_champ1_sum2',
 't2_champ2_sum1',
 't2_champ2_sum2',
 't2_champ3_sum1',
 't2_champ3_sum2',
 't2_champ4_sum1',
 't2_champ4_sum2',
 't2_champ5_sum1',
 't2_champ5_sum2']

In [38]:
[col for col in games.columns if 'flash' in col or 'ignite' in col or 'exhaust' in col]

['n_flash', 'anzahl_flash', 'anzahl_ignite', 'anzahl_exhaust']

In [39]:
games[['n_flash', 'anzahl_flash', 'anzahl_ignite', 'anzahl_exhaust']].head(10)

Unnamed: 0,n_flash,anzahl_flash,anzahl_ignite,anzahl_exhaust
0,4,9,2,2
1,4,10,3,1
2,4,10,1,1
3,4,10,3,1
4,4,10,0,2
5,4,10,1,2
6,4,10,1,2
7,4,10,2,1
8,4,10,1,2
9,4,10,2,2


### Rollenkombinationen je Team

In [40]:
champ2 = pd.read_csv("D:/Python/Projektarbeit/Hauptordner/champion2.0.csv")
champ2 = champ2[['id', 'tags']]

In [41]:
champ2['main_role'] = champ2['tags'].apply(
    lambda x: eval(x)[0] if pd.notnull(x) and len(eval(x)) > 0 else 'Unknown'
)

In [42]:
games = games.merge(
    champ2[['id', 'main_role']],
    how='left',
    left_on='t1_champ1id',
    right_on='id'
).rename(columns={'main_role': 't1_champ1_role'}).drop(columns='id')

In [43]:
for i in range(1, 6):
    games = games.merge(
        champ2[['id', 'main_role']],
        how='left',
        left_on=f't1_champ{i}id',
        right_on='id'
    ).rename(columns={'main_role': f't1_champ{i}_role'}).drop(columns='id')

In [44]:
for i in range(1, 6):
    games = games.merge(
        champ2[['id', 'main_role']],
        how='left',
        left_on=f't2_champ{i}id',
        right_on='id'
    ).rename(columns={'main_role': f't2_champ{i}_role'}).drop(columns='id')

In [45]:
for role in games[[f't1_champ{i}_role' for i in range(1, 6)]].stack().unique():
    if pd.notnull(role):
        games[f'n_{role.lower()}_t1'] = games[[f't1_champ{i}_role' for i in range(1, 6)]].apply(lambda row: (row == role).sum(), axis=1)

In [46]:
for role in games[[f't2_champ{i}_role' for i in range(1, 6)]].stack().unique():
    if pd.notnull(role):
        games[f'n_{role.lower()}_t2'] = games[[f't2_champ{i}_role' for i in range(1, 6)]].apply(lambda row: (row == role).sum(), axis=1)

### Gegen Check (Ob alles geklappt hat)

In [47]:
games[['n_mage_t1', 'n_fighter_t1', 'n_assassin_t1', 'n_support_t1', 'n_marksman_t1', 'n_tank_t1']].sum(axis=1).value_counts()

11    51053
Name: count, dtype: int64

In [48]:
games[['n_mage_t2', 'n_fighter_t2', 'n_assassin_t2', 'n_support_t2', 'n_marksman_t2', 'n_tank_t2']].sum(axis=1).value_counts()

10    51053
Name: count, dtype: int64

In [49]:
[col for col in games.columns if col.startswith('n_') and ('_t1' in col or '_t2' in col)]

['n_mage_t1',
 'n_support_t1',
 'n_marksman_t1',
 'n_assassin_t1',
 'n_fighter_t1',
 'n_tank_t1',
 'n_marksman_t2',
 'n_fighter_t2',
 'n_assassin_t2',
 'n_support_t2',
 'n_tank_t2',
 'n_mage_t2']

In [50]:
games[games[['n_mage_t1', 'n_fighter_t1', 'n_assassin_t1', 'n_support_t1', 'n_marksman_t1', 'n_tank_t1']].sum(axis=1) > 5].head(3)


# 1 Champion kann mehrere rollen haben ist somit also kein fehler.

Unnamed: 0,gameId,creationTime,gameDuration,seasonId,winner,firstBlood,firstTower,firstInhibitor,firstBaron,firstDragon,...,n_marksman_t1,n_assassin_t1,n_fighter_t1,n_tank_t1,n_marksman_t2,n_fighter_t2,n_assassin_t2,n_support_t2,n_tank_t2,n_mage_t2
0,3326086514,1504279457970,1949,9,1,2,1,1,1,1,...,2,2,0,0,4,2,2,2,0,0
1,3229566029,1497848803862,1851,9,1,1,1,1,0,1,...,3,4,4,0,0,6,0,0,2,2
2,3327363504,1504360103310,1493,9,1,2,1,1,1,2,...,3,2,4,0,2,4,0,2,0,2


### One-Hot-Encoding & Skalierung

In [51]:
games_encoded = games.copy()

In [52]:
games_encoded = pd.get_dummies(games_encoded, columns=[
    'seasonId', 'hour',
    't1_champ1id', 't1_champ2id', 't1_champ3id', 't1_champ4id', 't1_champ5id',
    't2_champ1id', 't2_champ2id', 't2_champ3id', 't2_champ4id', 't2_champ5id'
])

In [53]:
numerical_cols = [
    'duration_min',
    't1_towerKills', 't1_inhibitorKills', 't1_baronKills', 't1_dragonKills', 't1_riftHeraldKills',
    't2_towerKills', 't2_inhibitorKills', 't2_baronKills', 't2_dragonKills', 't2_riftHeraldKills',
    'n_flash', 'anzahl_flash', 'anzahl_ignite', 'anzahl_exhaust',
    'n_mage_t1', 'n_support_t1', 'n_marksman_t1', 'n_assassin_t1', 'n_fighter_t1', 'n_tank_t1',
    'n_marksman_t2', 'n_fighter_t2', 'n_assassin_t2', 'n_support_t2', 'n_tank_t2', 'n_mage_t2'
]

In [54]:
scaler = StandardScaler()
games_encoded[numerical_cols] = scaler.fit_transform(games_encoded[numerical_cols])

### Gegen Check (Ob alles geklappt hat)

In [55]:
[o for o in games_encoded.columns if '_' in o and any(prefix in o for prefix in ['weekday', 'uhrzeitKategorie'])]

[]

In [56]:
games_encoded.head(3)

Unnamed: 0,gameId,creationTime,gameDuration,winner,firstBlood,firstTower,firstInhibitor,firstBaron,firstDragon,firstRiftHerald,...,t2_champ5id_268,t2_champ5id_412,t2_champ5id_420,t2_champ5id_421,t2_champ5id_427,t2_champ5id_429,t2_champ5id_432,t2_champ5id_497,t2_champ5id_498,t2_champ5id_516
0,3326086514,1504279457970,1949,1,2,1,1,1,1,2,...,False,True,False,False,False,False,False,False,False,False
1,3229566029,1497848803862,1851,1,1,1,1,0,1,1,...,False,False,False,False,False,False,False,False,False,False
2,3327363504,1504360103310,1493,1,2,1,1,1,2,0,...,False,False,False,False,False,False,False,False,False,False


In [57]:
numerical_cols = [
    't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
    't1_dragonKills', 't1_riftHeraldKills',
    't2_towerKills', 't2_inhibitorKills', 't2_baronKills',
    't2_dragonKills', 't2_riftHeraldKills'
]
games_encoded[numerical_cols].describe().T[['mean', 'std']]

Unnamed: 0,mean,std
t1_towerKills,6.429999000000001e-17,1.00001
t1_inhibitorKills,7.710432000000001e-17,1.00001
t1_baronKills,6.73619e-17,1.00001
t1_dragonKills,5.845454e-17,1.00001
t1_riftHeraldKills,-2.894891e-17,1.00001
t2_towerKills,2.7278780000000004e-17,1.00001
t2_inhibitorKills,-3.0619040000000002e-18,1.00001
t2_baronKills,2.3938520000000002e-17,1.00001
t2_dragonKills,-1.002078e-17,1.00001
t2_riftHeraldKills,-3.451601e-17,1.00001


In [58]:
print("Gesamtzahl Spalten nach Encoding & Skalierung:", games_encoded.shape[1])

Gesamtzahl Spalten nach Encoding & Skalierung: 1481


### Train/Test-Split

In [59]:
y = games_encoded['winner']
X = games_encoded.drop(columns=['winner'])

In [60]:
X = X.select_dtypes(exclude=["datetime64[ns]", "object"])

In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [62]:
y = games_encoded['kurzesSpiel']

### Gegen Check (Ob alles geklappt hat)

In [63]:
print("Trainingsmenge X_train:", X_train.shape)
print("Trainingsmenge y_train:", y_train.shape)
print("Testmenge     X_test :", X_test.shape)
print("Testmenge     y_test :", y_test.shape)

Trainingsmenge X_train: (40842, 1457)
Trainingsmenge y_train: (40842,)
Testmenge     X_test : (10211, 1457)
Testmenge     y_test : (10211,)


In [64]:
print("Verteilung y_train:")
print(y_train.value_counts(normalize=True))
print("\nVerteilung y_test:")
print(y_test.value_counts(normalize=True))

Verteilung y_train:
winner
1    0.506464
2    0.493536
Name: proportion, dtype: float64

Verteilung y_test:
winner
1    0.506513
2    0.493487
Name: proportion, dtype: float64


In [65]:
X_train.head(3)

Unnamed: 0,gameId,creationTime,gameDuration,firstBlood,firstTower,firstInhibitor,firstBaron,firstDragon,firstRiftHerald,t1_champ1_sum1,...,t2_champ5id_268,t2_champ5id_412,t2_champ5id_420,t2_champ5id_421,t2_champ5id_427,t2_champ5id_429,t2_champ5id_432,t2_champ5id_497,t2_champ5id_498,t2_champ5id_516
5224,3326433891,1504298387524,1854,1,1,1,1,1,0,4,...,False,False,False,False,False,False,False,False,False,False
44065,3296857185,1502357350265,2261,2,2,2,0,2,0,4,...,False,False,False,False,False,False,False,False,False,False
14448,3320958175,1503900549781,1297,1,1,1,0,1,1,4,...,False,False,False,False,False,False,False,False,False,False


### Leckage-Policy

In [66]:
verbotene_spalten = ['winner', 'kurzesSpiel']

for spalte in verbotene_spalten:
    if spalte in X_train.columns:
        print(f"Achtung: Leckage! '{spalte}' ist in den Trainingsdaten enthalten.")
    else:
        print(f"✓ '{spalte}' ist NICHT im Trainingsset – alles gut.")

✓ 'winner' ist NICHT im Trainingsset – alles gut.
Achtung: Leckage! 'kurzesSpiel' ist in den Trainingsdaten enthalten.


In [67]:
X_train = X_train.drop(columns=[col for col in verbotene_spalten if col in X_train.columns])
X_test = X_test.drop(columns=[col for col in verbotene_spalten if col in X_test.columns])

### Gegen Check (Ob alles geklappt hat)

In [68]:
for spalte in verbotene_spalten:
    print(f"{spalte} in X_train? →", spalte in X_train.columns)
    print(f"{spalte} in X_test?  →", spalte in X_test.columns)

winner in X_train? → False
winner in X_test?  → False
kurzesSpiel in X_train? → False
kurzesSpiel in X_test?  → False


### Modellphase
- **Modellauswahl treffen**
- **Modelltraining & Prediction**
- **Cross-Validation vorbereiten**
- **Baseline einbauen**
- **Feature Importance vorbereiten**
- **Simulationen vorbereiten**
- **Clustering vorbereiten**

### Modellauswahl

In [69]:
modelle = {
    "Lineare Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Entscheidungsbaum": DecisionTreeRegressor(random_state=42)
}

### Gegen Check (Ob alles geklappt hat)

In [70]:
for name, modell in modelle.items():
    print(f"{name:22s}  →  Typ: {type(modell).__name__}")

Lineare Regression      →  Typ: LinearRegression
Random Forest           →  Typ: RandomForestRegressor
Entscheidungsbaum       →  Typ: DecisionTreeRegressor


### Modelltraining & Prediction

In [71]:
# Zielvariable für Regression
y = games["duration_min"]

# Eingabematrix (X) – dieselben Features wie im Modelltraining
X = games[[
    "t1_towerKills", "t2_towerKills",
    "t1_dragonKills", "t2_dragonKills",
    "t1_baronKills", "t2_baronKills"
]]

# Train/Test-Split speichern, damit X_test verfügbar bleibt
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [72]:
cv_scores = {name: {"R2": [], "MSE": [], "RMSE": [], "MAE": []} for name in modelle}
RANDOM_STATE = 42

In [73]:
for i in range(5):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE + i)
    
    for name, modell in modelle.items():
        modell.fit(X_tr, y_tr)
        y_hat = modell.predict(X_te)

        mse = mean_squared_error(y_te, y_hat)
        cv_scores[name]["R2"].append(r2_score(y_te, y_hat))
        cv_scores[name]["MSE"].append(mse)
        cv_scores[name]["RMSE"].append(np.sqrt(mse))
        cv_scores[name]["MAE"].append(mean_absolute_error(y_te, y_hat))

In [74]:
print("Durchschnittliche Metriken über 5 Durchläufe:")
for name, sc in cv_scores.items():
    print(
        f"{name:22s} R²={np.mean(sc['R2']):.4f} | "
        f"MSE={np.mean(sc['MSE']):.4f} | "
        f"RMSE={np.mean(sc['RMSE']):.4f} | "
        f"MAE={np.mean(sc['MAE']):.4f}"
    )

Durchschnittliche Metriken über 5 Durchläufe:
Lineare Regression     R²=0.7913 | MSE=15.1942 | RMSE=3.8978 | MAE=3.0791
Random Forest          R²=0.8039 | MSE=14.2732 | RMSE=3.7777 | MAE=2.8922
Entscheidungsbaum      R²=0.7819 | MSE=15.8752 | RMSE=3.9840 | MAE=3.0096


### Gegen Check (Ob alles geklappt hat)

In [75]:
for name, modell in modelle.items():
    print(f"{name:22s}  →  Typ: {type(modell).__name__}")
print("\nAlle Modelle wurden erfolgreich trainiert und bewertet.")

Lineare Regression      →  Typ: LinearRegression
Random Forest           →  Typ: RandomForestRegressor
Entscheidungsbaum       →  Typ: DecisionTreeRegressor

Alle Modelle wurden erfolgreich trainiert und bewertet.


### Baseline

In [76]:
y_pred_base = [y_train.mean()] * len(y_test)
r2   = r2_score(y_test, y_pred_base)
mse  = mean_squared_error(y_test, y_pred_base)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_test, y_pred_base)

print("Baseline-Vorhersage (immer Mittelwert von y_train):\n")
print(f"R²:   {r2:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

Baseline-Vorhersage (immer Mittelwert von y_train):

R²:   -0.0003
MSE:  73.3659
RMSE: 8.5654
MAE:  6.5155


### Gegen Check (Ob alles geklappt hat)

In [77]:
print(f"\nErste 5 Baseline-Vorhersagen: {y_pred_base[:5]}")
print(f"Einheitlicher Wert? {all(v == y_pred_base[0] for v in y_pred_base)}")
print(f"Mittelwert, den alle nutzen: {y_pred_base[0]:.4f}")


Erste 5 Baseline-Vorhersagen: [np.float64(30.50271901473973), np.float64(30.50271901473973), np.float64(30.50271901473973), np.float64(30.50271901473973), np.float64(30.50271901473973)]
Einheitlicher Wert? True
Mittelwert, den alle nutzen: 30.5027


### Feature Importance

In [78]:
feature_names = X_train.columns

In [79]:
importances = modelle["Random Forest"].feature_importances_
feature_ranking = list(zip(feature_names, importances))
feature_ranking.sort(key=lambda x: x[1], reverse=True)

In [80]:
print("Top 10 wichtigste Features (Random Forest):\n")
for name, score in feature_ranking[:10]:
    print(f"{name:30s} → Wichtigkeit: {score:.4f}")

Top 10 wichtigste Features (Random Forest):

t2_towerKills                  → Wichtigkeit: 0.4161
t1_towerKills                  → Wichtigkeit: 0.3651
t1_dragonKills                 → Wichtigkeit: 0.0804
t2_dragonKills                 → Wichtigkeit: 0.0739
t2_baronKills                  → Wichtigkeit: 0.0340
t1_baronKills                  → Wichtigkeit: 0.0305


### Gegen Check (Ob alles geklappt hat)

In [81]:
hasattr(modelle["Random Forest"], "feature_importances_")

True

### Simulationen

In [82]:
beispiel = pd.DataFrame([{
    "t1_towerKills": 7,
    "t2_towerKills": 4,
    "t1_dragonKills": 2,
    "t2_dragonKills": 1,
    "t1_baronKills": 1,
    "t2_baronKills": 0
}])

In [83]:
print("Vorhersage der Spieldauer (in Minuten):\n")

for name, modell in modelle.items():
    vorhersage = modell.predict(beispiel)[0]
    print(f"{name:22s} → Vorhersage: {vorhersage:.2f} Minuten")


Vorhersage der Spieldauer (in Minuten):

Lineare Regression     → Vorhersage: 31.25 Minuten
Random Forest          → Vorhersage: 30.52 Minuten
Entscheidungsbaum      → Vorhersage: 30.48 Minuten


In [84]:
print("\n(Hinweis: Dies ist eine simulierte Vorhersage – kein echter Testwert.)")


(Hinweis: Dies ist eine simulierte Vorhersage – kein echter Testwert.)


### Gegen Check (Ob alles geklappt hat)

In [85]:
print("Beispiel hat", beispiel.shape[1], "Merkmale")

Beispiel hat 6 Merkmale


### Clustering

In [86]:
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train)

0,1,2
,n_clusters,3
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [87]:
cluster_labels = kmeans.predict(X_test)

print("Cluster-Zuordnung für erste 5 Spiele:", cluster_labels[:5])

Cluster-Zuordnung für erste 5 Spiele: [0 2 0 2 0]


### Gegen Check (Ob alles geklappt hat)

In [88]:
unique, counts = np.unique(cluster_labels, return_counts=True)

for c, n in zip(unique, counts):
    print(f"Cluster {c}: {n} Spiele")

Cluster 0: 1887 Spiele
Cluster 1: 4250 Spiele
Cluster 2: 4074 Spiele


### Struktur & Dokumentation
- **Strukturierte Pipeline bauen**
- **Random-State setzen**
- **Artefakte sichern**

### Strukturierte Pipeline

In [89]:
pipeline_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("regression", LinearRegression())
])

pipeline_lr.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('regression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [90]:
y_vor_lr = pipeline_lr.predict(X_test)

r2   = r2_score(y_test, y_vor_lr)
mse  = mean_squared_error(y_test, y_vor_lr)
rmse = np.sqrt(mse)
mae  = mean_absolute_error(y_test, y_vor_lr)

print("Ergebnisse der strukturierten Pipeline:")
print(f"R²:   {r2:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

Ergebnisse der strukturierten Pipeline:
R²:   0.7934
MSE:  15.1485
RMSE: 3.8921
MAE:  3.0772


### Gegen Check (Ob alles geklappt hat)

In [91]:
print("Pipeline besteht aus:", pipeline_lr.named_steps.keys())

Pipeline besteht aus: dict_keys(['scaler', 'regression'])


### Random-State setzen

In [92]:
y = games["duration_min"]
X = games[[
    "t1_towerKills", "t2_towerKills",
    "t1_dragonKills", "t2_dragonKills",
    "t1_baronKills", "t2_baronKills"
]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [93]:
modell_lr = LinearRegression()
modell_dt = DecisionTreeRegressor(random_state=42)
modell_rf = RandomForestRegressor(random_state=42)

### Gegen Check (Ob alles geklappt hat)

In [94]:
if "regression" in pipeline_lr.named_steps:
    modell = pipeline_lr.named_steps["regression"]
    if hasattr(modell, "random_state"):
        print("RandomState im Pipeline-Modell:", modell.random_state)
    else:
        print(f"Modell {type(modell).__name__} hat keinen random_state.")
else:
    print("Kein 'regression'-Schritt in pipeline_lr gefunden.")


Modell LinearRegression hat keinen random_state.


In [95]:
# Durchschnittliche Metriken anzeigen
print("Durchschnittliche Metriken über 5 Durchläufe:\n")
for name, sc in cv_scores.items():
    print(
        f"{name:22s}  →  "
        f"R² = {np.mean(sc['R2']):.4f} | "
        f"MSE = {np.mean(sc['MSE']):.2f} | "
        f"RMSE = {np.mean(sc['RMSE']):.2f} | "
        f"MAE = {np.mean(sc['MAE']):.2f}"
    )


Durchschnittliche Metriken über 5 Durchläufe:

Lineare Regression      →  R² = 0.7913 | MSE = 15.19 | RMSE = 3.90 | MAE = 3.08
Random Forest           →  R² = 0.8039 | MSE = 14.27 | RMSE = 3.78 | MAE = 2.89
Entscheidungsbaum       →  R² = 0.7819 | MSE = 15.88 | RMSE = 3.98 | MAE = 3.01
