# Feature Engineering for League of Legends Match Outcome Prediction

In this notebook we:
- Identify pick and ban columns.
- Build champion pick one-hot encodings.
- Add ban features.
- Create role composition features using champion metadata.
- Assemble final feature set (X) and target (y).


In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Load dataset
df = pd.read_csv("../data/games.csv")

print("Shape:", df.shape)
print(df.columns[:20])  # preview first 20 column names

Shape: (51490, 61)
Index(['gameId', 'creationTime', 'gameDuration', 'seasonId', 'winner',
       'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
       'firstDragon', 'firstRiftHerald', 't1_champ1id', 't1_champ1_sum1',
       't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
       't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2'],
      dtype='object')


In [2]:
# detect pick and ban columns automatically
t1_pick_cols = [c for c in df.columns if c.lower().startswith('t1_champ')]
t2_pick_cols = [c for c in df.columns if c.lower().startswith('t2_champ')]
t1_ban_cols  = [c for c in df.columns if c.lower().startswith('t1_ban')]
t2_ban_cols  = [c for c in df.columns if c.lower().startswith('t2_ban')]

print("Team1 picks:", t1_pick_cols)
print("Team2 picks:", t2_pick_cols)
print("Team1 bans:", t1_ban_cols)
print("Team2 bans:", t2_ban_cols)

Team1 picks: ['t1_champ1id', 't1_champ1_sum1', 't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2', 't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2', 't1_champ4id', 't1_champ4_sum1', 't1_champ4_sum2', 't1_champ5id', 't1_champ5_sum1', 't1_champ5_sum2']
Team2 picks: ['t2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2', 't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2', 't2_champ3id', 't2_champ3_sum1', 't2_champ3_sum2', 't2_champ4id', 't2_champ4_sum1', 't2_champ4_sum2', 't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2']
Team1 bans: ['t1_ban1', 't1_ban2', 't1_ban3', 't1_ban4', 't1_ban5']
Team2 bans: ['t2_ban1', 't2_ban2', 't2_ban3', 't2_ban4', 't2_ban5']


In [3]:
# Collect picks as lists
df['t1_picks'] = df[t1_pick_cols].values.tolist()
df['t2_picks'] = df[t2_pick_cols].values.tolist()

# One-hot encode champion IDs
mlb = MultiLabelBinarizer()
t1_onehot = mlb.fit_transform(df['t1_picks'])
t2_onehot = mlb.transform(df['t2_picks'])

t1_onehot_df = pd.DataFrame(t1_onehot, columns=[f"t1_champ_{c}" for c in mlb.classes_], index=df.index)
t2_onehot_df = pd.DataFrame(t2_onehot, columns=[f"t2_champ_{c}" for c in mlb.classes_], index=df.index)

df = pd.concat([df, t1_onehot_df, t2_onehot_df], axis=1)
print("One-hot features shape:", df.shape)

One-hot features shape: (51490, 339)


In [4]:
# Process bans similarly
df['t1_bans_list'] = df[t1_ban_cols].values.tolist()
df['t2_bans_list'] = df[t2_ban_cols].values.tolist()

mlb_ban = MultiLabelBinarizer()
t1_ban_onehot = mlb_ban.fit_transform(df['t1_bans_list'])
t2_ban_onehot = mlb_ban.transform(df['t2_bans_list'])

t1_ban_df = pd.DataFrame(t1_ban_onehot, columns=[f"t1_ban_{c}" for c in mlb_ban.classes_], index=df.index)
t2_ban_df = pd.DataFrame(t2_ban_onehot, columns=[f"t2_ban_{c}" for c in mlb_ban.classes_], index=df.index)

df = pd.concat([df, t1_ban_df, t2_ban_df], axis=1)
print("With bans added:", df.shape)

With bans added: (51490, 619)


In [5]:
# Load champion metadata (champion_info_2.json is usually best)
with open("../data/champion_info_2.json", "r", encoding="utf-8") as f:
    champ_data = json.load(f)["data"]

# Map champion ID -> tags (roles like Mage, Fighter, etc.)
role_map = {}
for _, v in champ_data.items():
    cid = int(v["id"])
    tags = v.get("tags", [])
    role_map[cid] = tags

def role_counts(picks):
    counts = {}
    for cid in picks:
        tags = role_map.get(int(cid), [])
        for t in tags:
            counts[t] = counts.get(t, 0) + 1
    return counts

df['t1_roles'] = df['t1_picks'].apply(role_counts)
df['t2_roles'] = df['t2_picks'].apply(role_counts)

roles = ['Fighter','Tank','Mage','Assassin','Marksman','Support']
for r in roles:
    df[f"t1_role_{r}"] = df['t1_roles'].apply(lambda d: d.get(r, 0))
    df[f"t2_role_{r}"] = df['t2_roles'].apply(lambda d: d.get(r, 0))

print(df[[c for c in df.columns if "role" in c]].head())

                                            t1_roles  \
0  {'Mage': 10, 'Tank': 4, 'Support': 2, 'Marksma...   
1  {'Marksman': 1, 'Assassin': 5, 'Mage': 7, 'Fig...   
2  {'Marksman': 1, 'Assassin': 5, 'Mage': 10, 'Fi...   
3  {'Tank': 4, 'Mage': 10, 'Support': 2, 'Fighter...   
4  {'Fighter': 2, 'Tank': 3, 'Mage': 10, 'Support...   

                                            t2_roles  t1_role_Fighter  \
0  {'Marksman': 3, 'Assassin': 3, 'Fighter': 6, '...                4   
1  {'Tank': 5, 'Fighter': 7, 'Mage': 7, 'Support'...                5   
2  {'Mage': 8, 'Assassin': 4, 'Support': 3, 'Figh...                3   
3  {'Mage': 8, 'Assassin': 4, 'Tank': 4, 'Fighter...                3   
4  {'Support': 3, 'Mage': 9, 'Tank': 4, 'Fighter'...                2   

   t2_role_Fighter  t1_role_Tank  t2_role_Tank  t1_role_Mage  t2_role_Mage  \
0                6             4             3            10             7   
1                7             3             5             7        

In [6]:
# Target: 1 if Team1 wins, else 0
df['target'] = (df['winner'] == 1).astype(int)

print(df['target'].value_counts(normalize=True))

target
1    0.506448
0    0.493552
Name: proportion, dtype: float64


## Save engineered dataset
We'll keep only the columns needed for modeling and export a clean CSV for the next step.

In [7]:
# Feature columns: champ one-hot + bans + roles
feature_cols = [c for c in df.columns if c.startswith('t1_champ_') or c.startswith('t2_champ_')]
feature_cols += [c for c in df.columns if 'ban_' in c]
feature_cols += [c for c in df.columns if 'role_' in c]

X = df[feature_cols]
y = df['target']

print("Final feature matrix shape:", X.shape)

# Save for modeling
X.to_csv("../data/features.csv", index=False)
y.to_csv("../data/labels.csv", index=False)

Final feature matrix shape: (51490, 566)
