In [1]:
# ============================
# T20I: 1st innings score + 2nd innings win probability
# Works with ball format: 0.1, 1.3, 19.6 etc.
# Columns expected (your dataset):
# match_id date city venue team1 team2 inning_number team ball batsman bowler non_striker
# runs_batsman runs_extras runs_total winner toss_winner toss_decision overs
# ============================

import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, roc_auc_score, log_loss
)
import joblib

from xgboost import XGBRegressor


In [2]:
# ----------------------------
# CONFIG
# ----------------------------
DATA_PATH = "your_cricket_data.csv"  # <-- change this
BALLS_PER_INNINGS = 120  # T20

In [3]:
# ----------------------------
# Helpers
# ----------------------------
def parse_ball_to_over_ball(x):
    """
    Converts cricket notation like 0.1, 1.3, 19.6 into (over, ball_in_over),
    where ball_in_over is 1..6.
    Handles strings too.
    """
    s = str(x).strip()
    if "." in s:
        o, b = s.split(".", 1)
        over = int(re.sub(r"[^0-9]", "", o) or 0)
        ball_in_over = int(re.sub(r"[^0-9]", "", b) or 0)
    else:
        # if someone stored it as "5" or 5.0
        over = int(float(s))
        ball_in_over = 0

    # clip ball_in_over to 1..6 if it exists; if 0, treat as 0 (rare)
    if ball_in_over != 0:
        ball_in_over = max(1, min(6, ball_in_over))
    return over, ball_in_over


def add_wicket_inference(df):
    """
    Infers wicket_flag and player_out using pair-change heuristic.
    For each match_id + innings, compare previous (batsman, non_striker) set to current.
    If exactly one player added AND one removed, count wicket and set player_out = removed player.
    """
    prev_bats = df.groupby(["match_id", "innings"])["batsman"].shift(1)
    prev_non  = df.groupby(["match_id", "innings"])["non_striker"].shift(1)

    prev_set = pd.Series(list(zip(prev_bats, prev_non)), index=df.index).apply(
        lambda t: set([x for x in t if pd.notna(x)])
    )
    cur_set = pd.Series(list(zip(df["batsman"], df["non_striker"])), index=df.index).apply(
        lambda t: set([x for x in t if pd.notna(x)])
    )

    wicket_flag = np.zeros(len(df), dtype=int)
    player_out  = [None] * len(df)

    for i, (ps, cs) in enumerate(zip(prev_set, cur_set)):
        if not ps or not cs:
            continue
        added = cs - ps
        removed = ps - cs
        if len(added) == 1 and len(removed) == 1:
            wicket_flag[i] = 1
            player_out[i] = list(removed)[0]

    df["wicket_flag"] = wicket_flag
    df["player_out"] = player_out
    return df


def rolling_sum(group, col, window):
    return group[col].rolling(window=window, min_periods=1).sum()



In [4]:
# ----------------------------
# Load & normalize schema
# ----------------------------
df_raw = pd.read_csv(DATA_PATH)

df = df_raw.copy()

# Map columns to a consistent schema
df = df.rename(columns={
    "team": "batting_team",
    "runs_batsman": "runs_off_bat",
    "runs_extras": "extras",
    "runs_total": "total_runs",
    "inning_number": "innings"
})

# Ensure match_id is string
df["match_id"] = df["match_id"].astype(str)

In [5]:
# Ensure innings numeric 1/2
innings_map = {"1st innings": 1, "2nd innings": 2, "1": 1, "2": 2}
if df["innings"].dtype == "object":
    df["innings"] = df["innings"].map(innings_map).fillna(df["innings"]).astype(int)
else:
    df["innings"] = df["innings"].astype(int)

# Derive bowling team using team1/team2
df["bowling_team"] = np.where(df["batting_team"] == df["team1"], df["team2"], df["team1"])

In [6]:
# Parse ball -> over, ball_in_over -> ball_number
over_ball = df["ball"].apply(parse_ball_to_over_ball)
df["over"] = over_ball.apply(lambda t: t[0])
df["ball_in_over"] = over_ball.apply(lambda t: t[1])

# Convert to sequential ball number within innings
# If ball_in_over==0 (rare), treat ball_number = over*6
df["ball_number"] = df["over"] * 6 + df["ball_in_over"]
df["ball_number"] = df["ball_number"].astype(int)

# Sort correctly for cumulative/rolling features
df = df.sort_values(["match_id", "innings", "ball_number"]).reset_index(drop=True)

# Infer wickets
df = add_wicket_inference(df)

In [8]:
# ----------------------------
# Core match-state features (per match_id + innings)
# ----------------------------
grp = df.groupby(["match_id", "innings"], sort=False)

df["current_score"] = grp["total_runs"].cumsum()
df["balls_bowled"] = grp.cumcount() + 1
df["wickets_fallen"] = grp["wicket_flag"].cumsum()

In [9]:
# last 5 overs = 30 balls
df["runs_last_5"] = grp.apply(lambda g: rolling_sum(g, "total_runs", 30)).reset_index(level=[0,1], drop=True)
df["wickets_last_5"] = grp.apply(lambda g: rolling_sum(g, "wicket_flag", 30)).reset_index(level=[0,1], drop=True)

# Current run rate
df["crr"] = (df["current_score"] / df["balls_bowled"]) * 6

  df["runs_last_5"] = grp.apply(lambda g: rolling_sum(g, "total_runs", 30)).reset_index(level=[0,1], drop=True)
  df["wickets_last_5"] = grp.apply(lambda g: rolling_sum(g, "wicket_flag", 30)).reset_index(level=[0,1], drop=True)


In [10]:
# ==========================================================
# 1) FIRST INNINGS SCORE PREDICTION (XGBoost Regressor Pipeline)
# ==========================================================
df_1 = df[df["innings"] == 1].copy()

# Final total for 1st innings per match
final_1 = df_1.groupby("match_id")["total_runs"].sum().rename("final_total_runs")
df_1 = df_1.merge(final_1, on="match_id", how="left")

# Optional: drop early phase (first 5 overs) to reduce noise (matches your earlier intent)
df_1 = df_1[df_1["balls_bowled"] > 30].copy()


In [11]:
# Feature set (IMPORTANT: no player columns, no team1/team2)
X1_cols = [
    "batting_team", "bowling_team", "venue",
    "current_score", "balls_bowled", "wickets_fallen",
    "runs_last_5", "wickets_last_5", "crr"
]
y1_col = "final_total_runs"

X1 = df_1[X1_cols]
y1 = df_1[y1_col]

In [12]:
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

cat_cols_1 = ["batting_team", "bowling_team", "venue"]
num_cols_1 = [c for c in X1_cols if c not in cat_cols_1]

preprocess_xgb = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_1),
        ("num", "passthrough", num_cols_1),
    ],
    remainder="drop"
)


In [13]:
score_model = Pipeline(steps=[
    ("preprocess", preprocess_xgb),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.85,
        colsample_bytree=0.85,
        random_state=42,
        n_jobs=-1
    ))
])

score_model.fit(X1_train, y1_train)

pred1 = score_model.predict(X1_test)
mae1 = mean_absolute_error(y1_test, pred1)
rmse1 = np.sqrt(mean_squared_error(y1_test, pred1))
r21 = r2_score(y1_test, pred1)

print("\n=== First Innings Score Model (XGBRegressor Pipeline) ===")
print(f"MAE : {mae1:.3f}")
print(f"RMSE: {rmse1:.3f}")
print(f"R²  : {r21:.3f}")


=== First Innings Score Model (XGBRegressor Pipeline) ===
MAE : 11.769
RMSE: 16.104
R²  : 0.862


In [14]:
joblib.dump(score_model, "t20i_first_innings_score_model.joblib")
print("Saved: t20i_first_innings_score_model.joblib")

Saved: t20i_first_innings_score_model.joblib


In [15]:
# ==========================================================
# 2) SECOND INNINGS WIN PROBABILITY (Logistic Regression Pipeline)
# ==========================================================
# Build match-level target from first innings final
first_innings_total = df[df["innings"] == 1].groupby("match_id")["total_runs"].sum().rename("innings1_total")
df_2 = df[df["innings"] == 2].merge(first_innings_total, on="match_id", how="left").copy()

# Target for chase
df_2["target"] = df_2["innings1_total"] + 1

df_2["balls_left"] = BALLS_PER_INNINGS - df_2["balls_bowled"]
df_2["runs_left"] = df_2["target"] - df_2["current_score"]
df_2["wickets_left"] = 10 - df_2["wickets_fallen"]

In [16]:
# Required run rate (handle division by zero)
df_2["rrr"] = np.where(
    df_2["balls_left"] > 0,
    (df_2["runs_left"] / df_2["balls_left"]) * 6,
    999.0
)

# Label: chasing team wins?
# winner column should be team name; y=1 if winner == batting_team (in innings 2)
df_2["win"] = (df_2["winner"] == df_2["batting_team"]).astype(int)

# Optional: remove impossible/finished rows (after innings end / bad data)
df_2 = df_2[(df_2["balls_bowled"] >= 1) & (df_2["balls_bowled"] <= BALLS_PER_INNINGS)].copy()


In [17]:
X2_cols = [
    "batting_team", "bowling_team", "venue",
    "target", "current_score", "runs_left", "balls_left",
    "wickets_left", "crr", "rrr", "runs_last_5", "wickets_last_5"
]
y2_col = "win"

X2 = df_2[X2_cols]
y2 = df_2[y2_col]

In [19]:
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42, stratify=y2
)

cat_cols_2 = ["batting_team", "bowling_team", "venue"]
num_cols_2 = [c for c in X2_cols if c not in cat_cols_2]

preprocess_lr = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_2),
        ("num", StandardScaler(), num_cols_2)
    ],
    remainder="drop"
)

In [20]:
win_model = Pipeline(steps=[
    ("preprocess", preprocess_lr),
    ("model", LogisticRegression(max_iter=3000))
])

win_model.fit(X2_train, y2_train)

proba2 = win_model.predict_proba(X2_test)[:, 1]
pred2 = (proba2 >= 0.5).astype(int)

acc2 = accuracy_score(y2_test, pred2)
auc2 = roc_auc_score(y2_test, proba2)
ll2  = log_loss(y2_test, proba2)

print("\n=== Second Innings Win Model (LogisticRegression Pipeline) ===")
print(f"Accuracy: {acc2:.3f}")
print(f"ROC-AUC : {auc2:.3f}")
print(f"LogLoss : {ll2:.3f}")


=== Second Innings Win Model (LogisticRegression Pipeline) ===
Accuracy: 0.873
ROC-AUC : 0.948
LogLoss : 0.291


In [21]:
joblib.dump(win_model, "t20i_second_innings_winprob_model.joblib")
print("Saved: t20i_second_innings_winprob_model.joblib")


Saved: t20i_second_innings_winprob_model.joblib


In [22]:
# ----------------------------
# Quick sanity test: single-row inference (optional)
# ----------------------------
print("\nSanity check: model objects loaded ok.")
_ = joblib.load("t20i_first_innings_score_model.joblib")
_ = joblib.load("t20i_second_innings_winprob_model.joblib")
print("Both pipelines reload successfully.")


Sanity check: model objects loaded ok.
Both pipelines reload successfully.
