# Preprocessing Pipeline for T20I Match Predictions Program

CELL 1 — Imports & Load Data

In [1]:
import pandas as pd
import numpy as np
import json
from collections import Counter


In [2]:
# Read the raw T20I match table from the project's raw-data folder.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; consider index_col=0 — confirm first.
df = pd.read_csv(filepath_or_buffer="../data/raw/t20i_Matches_Data.csv")
print(df.shape)
# Preview the first five rows (kept as the last expression so it renders).
df.head(5)


(2592, 33)


Unnamed: 0,T20I Match No,Match ID,Match Name,Series ID,Series Name,Match Date,Match Format,Team1 ID,Team1 Name,Team1 Captain,...,Umpire 2,Match Referee,Toss Winner,Toss Winner Choice,Match Winner,Match Result Text,MOM Player,Team1 Playing 11,Team2 Playing 11,Debut Players
0,52,291356,Australia Vs India Only T20I,291355,India tour of Australia - 2007 (2007/08),2008-02-01,T20,6,India,7593,...,SJA Taufel,JJ Crowe,India,bat,Australia,Australia won by 9 wickets (with 52 balls rema...,8876.0,"['7773', '7781', '8813', '8742', '48405', '759...","['4176', '8876', '6253', '6256', '4382', '1198...","['11984', '49327', '48319']"
1,54,300436,New Zealand Vs England 2Nd T20I,300418,England tour of New Zealand - 2008 (2007/08),2008-02-07,T20,1,England,2314,...,GAV Baxter,AG Hurst,England,bat,England,England won by 50 runs,2314.0,"['11556', '44660', '8107', '7822', '2314', '63...","['44946', '10384', '44930', '10381', '9570', '...","['47488', '10325']"
2,65,361531,Netherlands Vs Scotland 2Nd Semi Final,353665,"ICC World Twenty20 Qualifier Bermuda, Canada, ...",2008-08-04,T20,30,Scotland,45548,...,PK Baldwin,BC Broad,Netherlands,bowl,Netherlands,Netherlands won by 5 wickets (with 12 balls re...,45358.0,"['45548', '46048', '46142', '8221', '4334', '4...","['10323', '48655', '6362', '49443', '45358', '...",[]
3,66,354459,Kenya Vs Scotland 3Rd Place Playoff,353665,"ICC World Twenty20 Qualifier Bermuda, Canada, ...",2008-08-04,T20,26,Kenya,2265,...,PK Baldwin,BC Broad,Kenya,bat,Scotland,Scotland won by 9 wickets (with 11 balls remai...,45548.0,"['10364', '2264', '49383', '2265', '2268', '50...","['45548', '46048', '46142', '8221', '4334', '4...","['50293', '50293']"
4,69,361653,Sri Lanka Vs Zimbabwe 1St Match,361644,T20 Canada in Canada - 2008 (2008/09),2008-10-10,T20,9,Zimbabwe,45326,...,MR Benson,JJ Crowe,Sri Lanka,bowl,Sri Lanka,Sri Lanka won by 5 wickets (with 6 balls remai...,50377.0,"['10639', '10423', '47619', '10421', '21364', ...","['48468', '7419', '15273', '8195', '6315', '48...","['50377', '47210', '12209', '15273', '48468', ..."


CELL 2 — Drop Unwanted Columns (Noise Removal)

In [3]:
# Columns removed before any encoding or feature engineering takes place.
# NOTE(review): 'Unnamed: 0' (visible in the loaded frame) is not listed here,
# so it stays in the frame after this cell.
DROP_COLS = [
    'MOM Player',
    'Match Result Text',
    'T20I Match No',
    'Match ID',
    'Series ID',
    'Series Name',
    'Match Format',
    'Team1 ID',
    'Team2 ID',
    'Team1 Captain',
    'Team2 Captain',
    'Umpire 1',
    'Umpire 2',
    'Match Referee',
    'Team1 Playing 11',
    'Team2 Playing 11',
    'Debut Players',
    'Match Name',
    'Team2 Extras Rec',
    'Team1 Extras Rec',
    'Match Date',
    'Team1 Wickets Fell',
    'Team2 Wickets Fell',
]

# Keep every column whose label is not in DROP_COLS; labels from the list that
# are absent from the frame are simply never matched, so nothing is raised.
is_unwanted = df.columns.isin(DROP_COLS)
df = df.loc[:, ~is_unwanted]
print(df.shape)


(2592, 10)


CELL 3 — Keep Only Top Teams

In [4]:
# Tally how many times each side shows up in either team column, then keep
# the 20 most frequent names.
team_counts = Counter(df["Team1 Name"]) + Counter(df["Team2 Name"])
top_teams = [name for name, _appearances in team_counts.most_common(20)]

# A row survives only if both sides and the recorded winner are top teams.
team_columns = ["Team1 Name", "Team2 Name", "Match Winner"]
all_top_teams = df[team_columns].isin(top_teams).all(axis=1)
df = df.loc[all_top_teams]

print("Remaining teams:", len(top_teams))
print("Remaining rows:", df.shape[0])


Remaining teams: 20
Remaining rows: 1218


CELL 4 — Remove Missing / Invalid Rows

In [5]:
# Turn the literal "no result" into a missing value, then discard every row
# that has a missing value in any of the remaining columns.
df = (
    df
    .replace(to_replace="no result", value=np.nan)
    .dropna(axis=0, how="any")
)
print(df.shape)


(1218, 10)


CELL 5 — TEAM STRENGTH (Target Encoding)

In [6]:
# Team strength = wins / matches played, where "matches played" counts every
# appearance of the team as either Team1 or Team2.
# NOTE(review): the ratio is computed from the same rows (including each
# match's own winner) that later produce the target column — consider deriving
# it from a training split only to avoid leaking the label.
team_win_counts = df["Match Winner"].value_counts()

# .add(fill_value=0) instead of "+": with "+", a team present in only one of
# the two columns gets NaN matches, which then collapses its strength to the
# 0.3 floor regardless of its real record.
team_match_counts = df["Team1 Name"].value_counts().add(
    df["Team2 Name"].value_counts(),
    fill_value=0,
)

# Teams that played but never won are absent from team_win_counts, so the
# aligned division yields NaN for them; a win rate of 0 is the correct value.
team_strength = (team_win_counts / team_match_counts).fillna(0)

team_strength = team_strength.clip(0.3, 0.8)  # prevent extremes

# Persist the lookup (team name -> clipped win rate) as JSON.
with open("team_strength.json", "w") as f:
    json.dump(team_strength.to_dict(), f, indent=4)


# Attach the per-team strength to each row for both sides and the toss winner.
df["Team1 Strength"] = df["Team1 Name"].map(team_strength)
df["Team2 Strength"] = df["Team2 Name"].map(team_strength)
df["Toss Winner Strength"] = df["Toss Winner"].map(team_strength)


CELL 6 — Venue Behaviour Encoding

In [7]:
# Per-venue encodings:
#   bat_bias      - share of matches at the venue won by the side batting first
#   match_density - matches at the venue relative to the busiest venue
venue_matches = df.groupby("Match Venue (Stadium)").size()

# The side batting first is the toss winner when it chose "bat", otherwise the
# other side. So the batting-first side won exactly when "toss winner won" and
# "toss winner chose bat" are both true or both false. Comparing only
# Match Winner == Toss Winner would measure toss advantage, not batting-first
# advantage.
toss_winner_won = df["Match Winner"] == df["Toss Winner"]
toss_winner_batted = df["Toss Winner Choice"] == "bat"
batting_first_won = toss_winner_won == toss_winner_batted

# Reindex onto every venue so a venue with no batting-first wins counts as 0
# wins (ratio 0.0) instead of dropping out and being treated as unknown.
venue_first_bat_wins = (
    df[batting_first_won]
    .groupby("Match Venue (Stadium)")
    .size()
    .reindex(venue_matches.index, fill_value=0)
)

# Every venue in venue_matches has at least one match, so the ratio is defined.
venue_bat_bias = venue_first_bat_wins / venue_matches
venue_match_count = venue_matches / venue_matches.max()

venue_encoding = {
    "bat_bias": venue_bat_bias.to_dict(),
    "match_density": venue_match_count.to_dict()
}

# Persist both lookups (venue name -> value) as JSON.
with open("venue_encoding.json", "w") as f:
    json.dump(venue_encoding, f, indent=4)

df["Venue Bat Bias"] = df["Match Venue (Stadium)"].map(venue_bat_bias)
df["Venue Match Density"] = df["Match Venue (Stadium)"].map(venue_match_count)


CELL 7 — Toss Choice Encoding (Properly)

In [8]:
# 1 when the toss winner elected to bat, 0 for any other choice.
df["Toss Bat"] = df["Toss Winner Choice"].eq("bat").astype(int)


CELL 8 — Feature Engineering (Strength Diff + Home Match Flag)

In [9]:
# Signed strength gap: positive when Team1 is rated above Team2.
df["Strength Diff"] = df["Team1 Strength"].sub(df["Team2 Strength"])
# 1 when Team1's name equals the venue country string, else 0.
df["Is Home Match"] = df["Team1 Name"].eq(df["Match Venue (Country)"]).astype(int)


CELL 9 — Encode Target (Match Winner Binary)

In [10]:
# Binary target: 1 when the recorded winner is Team1, otherwise 0.
df["Team1 Won"] = df["Match Winner"].eq(df["Team1 Name"]).astype(int)


CELL 10 — Final Feature Set

In [11]:
# Name of the label column.
TARGET = "Team1 Won"

# Model inputs, in the order they are written to the output file.
# NOTE(review): "Is Home Match" is engineered earlier in the notebook but is
# not selected here — confirm whether that is intentional.
FINAL_FEATURES = [
    "Team1 Strength",
    "Team2 Strength",
    "Strength Diff",
    "Venue Bat Bias",
    "Venue Match Density",
    "Toss Bat",
    "Toss Winner Strength",
]

# Features first, target as the last column.
final_df = df.loc[:, [*FINAL_FEATURES, TARGET]]


CELL 11 — Save Final Dataset

In [12]:
# Write the modelling table to the working directory without the row index.
final_df.to_csv(path_or_buf="t20i_Matches_Data_final.csv", index=False)
print("Final dataset saved:", final_df.shape)


Final dataset saved: (1218, 8)
