In [5]:
%pip install scikit-learn xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
balls = pd.read_csv("/workspaces/SuperApp/datasets/each_ball_records.csv")
matches = pd.read_csv("/workspaces/SuperApp/datasets/each_match_records.csv")

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Example: create match-ball level win probability features
match_summary = matches[["match_number", "venue", "team1", "team2", "toss_won", "toss_decision", "winner"]]
ball_features = balls.groupby("match_no").apply(lambda x: {
    "runs_so_far": x["score"].cumsum().tolist(),
    "wickets_down": x.get("is_wicket", pd.Series([0]*len(x))).cumsum().tolist() if "is_wicket" in x else [0]*len(x)
}).reset_index()

# Merge into training set (simplified example)
df = pd.merge(matches, balls, left_on="match_number", right_on="match_no", how="left")


  ball_features = balls.groupby("match_no").apply(lambda x: {


In [10]:
# Encode categorical
le = LabelEncoder()
df["venue"] = le.fit_transform(df["venue"].astype(str))
df["batter"] = le.fit_transform(df["batter"].astype(str))

# Train-test split
X = df[["venue", "batter", "score", "over", "ballnumber"]]
# Get batter's team for each row
batter_team = df["team1"].where(df["ballnumber"] <= 60, df["team2"])
y = (df["winner"] == batter_team).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Pipeline
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("model", xgb.XGBClassifier(eval_metric="logloss"))
])

clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)[:,1]

print("AUC:", roc_auc_score(y_test, preds))


AUC: 0.8863032506817968


In [4]:
# Simplified MoM feature table
mom_df = matches[[
    "match_number", "venue", "team1", "team2", 
    "toss_won", "toss_decision", "winner", "player_of_the_match"
]].dropna(subset=["player_of_the_match"])

# Example aggregated player stats (from ball records)
player_stats = (
    balls.groupby(["match_no", "batter"])
    .agg({
        "batsman_runs": "sum",
        "ballnumber": "count"
    })
    .reset_index()
    .rename(columns={"ballnumber": "balls_faced", 
                     "match_no": "match_number", 
                     "batter": "batsman"})
)

# Merge with matches
mom_train = pd.merge(player_stats, mom_df, on="match_number", how="inner")

# Encode target (MoM = 1, otherwise 0)
mom_train["mom_label"] = (
    mom_train["batsman"].str.strip().str.lower() ==
    mom_train["player_of_the_match"].str.strip().str.lower()
).astype(int)

# Features and target
X = mom_train[["batsman_runs", "balls_faced"]]
y = mom_train["mom_label"]

# Handle extreme imbalance (very few 1s compared to 0s)
if y.sum() == 0:
    raise ValueError("No player_of_the_match found in batsman stats. Check joins/data.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train model
model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",   # handles imbalance
    random_state=42
)
model.fit(X_train, y_train)

# Predictions
preds = model.predict(X_test)

print(classification_report(y_test, preds, zero_division=0))


NameError: name 'matches' is not defined