In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
sns.set(color_codes=True)

# Load the dataset - Colab Specific
try:
    from google.colab import files
    import io
    file_name = 't20i_Matches_Data_final.csv'
    print(f"Please upload the file: {file_name} (Click 'Choose Files' below)")
    # This line will open the upload dialog in Colab
    uploaded = files.upload()

    if file_name in uploaded:
        df = pd.read_csv(io.BytesIO(uploaded[file_name]))
        print(f"Data '{file_name}' loaded successfully.")
    else:
        df = pd.read_csv(file_name)
        print("Data loaded successfully from local Colab environment.")
except ImportError:
    # Standard non-Colab environment fallback
    df = pd.read_csv('t20i_Matches_Data_final.csv')
    print("Data loaded using standard file path (assuming local environment).")
except FileNotFoundError:
    print("Error: 't20i_Matches_Data_final.csv' not found. Please ensure the file is uploaded/available.")
    df = pd.DataFrame()
    raise SystemExit("Cannot proceed without data.")

Please upload the file: t20i_Matches_Data_final.csv (Click 'Choose Files' below)


Saving t20i_Matches_Data_final.csv to t20i_Matches_Data_final (5).csv
Data loaded successfully from local Colab environment.


In [53]:
import pandas as pd
import numpy as np

# Assuming df is loaded in an earlier cell (e.g., in Cell 1: df = pd.read_csv('...'))
# If you are loading the data in Cell 2, ensure the line below is present:
# df = pd.read_csv('../data/raw/t20i_Matches_Data.csv')
# OR
# df = pd.read_csv('t20i_Matches_Data_final.csv')

# --- FIX 1: Reconstruct 'Toss Winner Choice' from one-hot encoded columns ---
def reconstruct_toss_choice(row):
    """Reconstructs the single categorical 'Toss Winner Choice' column."""
    if 'Toss Winner Choice_bat' in row and row['Toss Winner Choice_bat'] == 1:
        return 'BAT'
    elif 'Toss Winner Choice_bowl' in row and row['Toss Winner Choice_bowl'] == 1:
        return 'BOWL'
    else:
        return 'NO CHOICE'

# Apply the reconstruction to create the column the rest of the script expects
if 'Toss Winner Choice' not in df.columns:
    df['Toss Winner Choice'] = df.apply(reconstruct_toss_choice, axis=1)
# --- END FIX 1 ---


# Filter to relevant columns for feature engineering and modeling
df_proc = df[['Team1 Name', 'Team1 Runs Scored', 'Team2 Name', 'Team2 Runs Scored',
              'Match Venue (Stadium)', 'Toss Winner', 'Toss Winner Choice', 'Match Winner']].copy()

# --- FIX 2: Reverse Min-Max Scaling for Runs Scored to the range [60, 220] ---
RUNS_MIN = 60
RUNS_MAX = 220
RUNS_RANGE = RUNS_MAX - RUNS_MIN

for col in ['Team1 Runs Scored', 'Team2 Runs Scored']:
    # Descaling Formula: Actual Value = Scaled Value * Range + Min
    df_proc[col] = (df_proc[col] * RUNS_RANGE) + RUNS_MIN
# --- END FIX 2 ---


# Handle missing data (Impute/Drop based on previous steps)
# Impute numerical missing values with the mean (these are now unscaled run values)
for col in ['Team1 Runs Scored', 'Team2 Runs Scored']:
    df_proc[col] = df_proc[col].fillna(df_proc[col].mean())

# Impute categorical missing values
df_proc['Match Winner'] = df_proc['Match Winner'].fillna('NO RESULT')

# Remove rows where a meaningful outcome is not possible (for prediction models)
# Also convert team/venue columns to string to prevent OHE issues later
df_clean = df_proc[df_proc['Match Winner'] != 'NO RESULT'].copy()
df_clean = df_clean[df_clean['Toss Winner Choice'] != 'NO CHOICE'].copy()

# Ensure all categorical columns are treated as strings (important for numerical codes)
df_clean['Team1 Name'] = df_clean['Team1 Name'].astype(str)
df_clean['Team2 Name'] = df_clean['Team2 Name'].astype(str)
df_clean['Match Venue (Stadium)'] = df_clean['Match Venue (Stadium)'].astype(str)
df_clean['Toss Winner'] = df_clean['Toss Winner'].astype(str)
df_clean['Match Winner'] = df_clean['Match Winner'].astype(str)


print(f"Remaining rows after cleaning: {len(df_clean)}")
print(f"Team1 Runs Scored is now descaled: {df_clean['Team1 Runs Scored'].min():.2f} - {df_clean['Team1 Runs Scored'].max():.2f}")

Remaining rows after cleaning: 2581
Team1 Runs Scored is now descaled: 60.00 - 220.00


In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Declare global variables needed for Cell 6
global X_winner_encoded, X_toss_encoded, best_dt_winner, dt_toss

# Assumed X_winner, y_winner, X_toss, y_toss are available from Cell 3

# --- 1. MODEL 1: Match Winner Prediction ---

# 1.1 Data Preparation & Robust OHE
X_winner_cat = X_winner[['Team1 Name', 'Team2 Name', 'Match Venue (Stadium)']]
X_winner_num = X_winner.drop(columns=['Team1 Name', 'Team2 Name', 'Match Venue (Stadium)'])

# One-Hot Encode categorical features
X_winner_encoded = pd.get_dummies(X_winner_cat, drop_first=False)
X_winner_final = pd.concat([X_winner_num, X_winner_encoded], axis=1)

# 1.2 Train/Test Split
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_winner_final, y_winner, test_size=0.2, random_state=42)

# Store the final feature column names for use in the prediction function (Cell 6)
X_winner_encoded = X_train_w.copy() # Use the trained set columns

# 1.3 Hyperparameter Tuning (Grid Search for best Decision Tree)
param_grid_w = {
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}
dt_winner = DecisionTreeClassifier(random_state=42)
grid_search_w = GridSearchCV(dt_winner, param_grid_w, cv=5, scoring='accuracy')
grid_search_w.fit(X_train_w, y_train_w)

best_dt_winner = grid_search_w.best_estimator_ # This is the model used in Cell 6

# 1.4 Evaluation
y_pred_w = best_dt_winner.predict(X_test_w)
accuracy_w = accuracy_score(y_test_w, y_pred_w)

print("---  Match Winner Prediction ---")
print(f"Best Match Winner Model (Decision Tree) Accuracy: {accuracy_w:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_w, y_pred_w))
print(f"Best Hyperparameters: {grid_search_w.best_params_}")


# --- 2. MODEL 2: Toss Decision Prediction ---

# 2.1 Data Preparation & Robust OHE
X_toss_cat = X_toss[['Match Venue (Stadium)']]
X_toss_num = X_toss.drop(columns=['Match Venue (Stadium)'])

# One-Hot Encode categorical features
X_toss_encoded = pd.get_dummies(X_toss_cat, drop_first=False)
X_toss_final = pd.concat([X_toss_num, X_toss_encoded], axis=1)

# 2.2 Train/Test Split
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_toss_final, y_toss, test_size=0.2, random_state=42)

# Store the final feature column names for use in the prediction function (Cell 6)
X_toss_encoded = X_train_t.copy()

# 2.3 Model Training (Simple Decision Tree)
dt_toss = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_toss.fit(X_train_t, y_train_t)

# 2.4 Evaluation
y_pred_t = dt_toss.predict(X_test_t)
accuracy_t = accuracy_score(y_test_t, y_pred_t)

print("\n---  Toss Decision Prediction ---")
print(f"Toss Decision Model (Decision Tree) Accuracy: {accuracy_t:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_t, y_pred_t))

---  Match Winner Prediction ---
Best Match Winner Model (Decision Tree) Accuracy: 0.7311

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.91      0.78       268
           1       0.85      0.53      0.66       249

    accuracy                           0.73       517
   macro avg       0.77      0.72      0.72       517
weighted avg       0.76      0.73      0.72       517

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 2}

---  Toss Decision Prediction ---
Toss Decision Model (Decision Tree) Accuracy: 0.5126

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.02      0.05       248
           1       0.52      0.96      0.67       269

    accuracy                           0.51       517
   macro avg       0.45      0.49      0.36       517
weighted avg       0.45      0.51      0.37       517



In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Declare global variables needed for the prediction cell
global X_safe_encoded, dt_safe_score

# --- MODEL 3: Safe Score Prediction ---

# 3.1 Data Preparation for Safe Score
# X_safe and y_safe are assumed to be available from Cell 3
X_safe_cat = X_safe[['Match Venue (Stadium)', 'Batting_First_Team']]
X_safe_num = X_safe.drop(columns=['Match Venue (Stadium)', 'Batting_First_Team'])

# 3.2 One-Hot Encoding
# Use pandas get_dummies for OHE on the entire dataset to capture all categories
X_safe_encoded = pd.get_dummies(X_safe_cat, drop_first=False)
X_safe_final = pd.concat([X_safe_num, X_safe_encoded], axis=1)

# Ensure all columns names are clean for the model
X_safe_final.columns = X_safe_final.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

# 3.3 Train/Test Split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_safe_final, y_safe, test_size=0.2, random_state=42)

# Store the final feature column names for use in prediction function (Cell 6)
X_safe_encoded = X_train_s.copy()

# 3.4 Model Training (Using Random Forest Regressor for better stability)
# Using RandomForestRegressor is generally better than a single Decision Tree for score prediction
dt_safe_score = RandomForestRegressor(n_estimators=100, random_state=42)
dt_safe_score.fit(X_train_s, y_train_s)

# 3.5 Evaluation
y_pred_s = dt_safe_score.predict(X_test_s)
safe_score_mse = mean_squared_error(y_test_s, y_pred_s)
safe_score_rmse = safe_score_mse ** 0.5

print("\n---  Safe Score Prediction (Regression) ---")
print(f"Mean Safe Score: {y_safe.mean():.2f}")
print(f"Safe Score RMSE: {safe_score_rmse:.2f} runs (lower is better)")


---  Safe Score Prediction (Regression) ---
Mean Safe Score: 134.73
Safe Score RMSE: 20.69 runs (lower is better)


In [67]:
import pandas as pd
import numpy as np

# Assuming helper functions (calculate_overall_win_rate, calculate_head_to_head_rate,
# and calculate_venue_bat_win_bias) are correctly defined in Cell 3.

# Declare global variables needed for the prediction cell
global df_clean, best_dt_winner, dt_toss, dt_safe_score, X_winner_encoded, X_toss_encoded, X_safe_encoded

# --- 1. Core Prediction Helper Functions (Redefined for robustness) ---

def predict_match_outcome(team_a, team_b, venue, winner_model, X_train_cols):
    """Predicts the match winner and returns the team code."""
    team_a = str(team_a)
    team_b = str(team_b)
    venue = str(venue)

    # Feature calculation (assumes helper functions from Cell 3 are available globally)
    team_a_rate = calculate_overall_win_rate(df_clean, team_a)
    team_b_rate = calculate_overall_win_rate(df_clean, team_b)
    h2h_rate = calculate_head_to_head_rate(df_clean, team_a, team_b)

    input_data = pd.DataFrame([{'Team1 Name': team_a, 'Team2 Name': team_b, 'Match Venue (Stadium)': venue,
                                'team1_overall_win_rate': team_a_rate, 'team2_overall_win_rate': team_b_rate,
                                'head_to_head_rate': h2h_rate}])

    # OHE and Column Alignment
    input_data_encoded = pd.get_dummies(input_data, columns=['Team1 Name', 'Team2 Name', 'Match Venue (Stadium)'], drop_first=False)

    # Ensure columns match training set, filling missing OHE columns with 0
    input_data_aligned = pd.DataFrame(0, index=[0], columns=X_train_cols)
    for col in input_data_encoded.columns:
        if col in X_train_cols:
            input_data_aligned[col] = input_data_encoded[col].iloc[0]

    # Prediction: 1 means Team 1 won, 0 means Team 2 won
    prediction = winner_model.predict(input_data_aligned)[0]

    # Correctly return the team code based on the prediction
    predicted_winner = team_a if prediction == 1 else team_b
    return predicted_winner

def predict_toss_decision(team_a, team_b, venue, toss_model, X_train_cols):
    """Predicts the toss winner's best decision (BAT or BOWL)."""
    team_a = str(team_a)
    team_b = str(team_b)
    venue = str(venue)

    team_a_rate = calculate_overall_win_rate(df_clean, team_a)
    team_b_rate = calculate_overall_win_rate(df_clean, team_b)
    h2h_rate = calculate_head_to_head_rate(df_clean, team_a, team_b)
    venue_bias = calculate_venue_bat_win_bias(venue)

    input_data = pd.DataFrame([{'Match Venue (Stadium)': venue, 'team1_overall_win_rate': team_a_rate,
                                'team2_overall_win_rate': team_b_rate, 'head_to_head_rate': h2h_rate,
                                'venue_bat_win_bias': venue_bias}])

    # OHE and Column Alignment
    input_data_encoded = pd.get_dummies(input_data, columns=['Match Venue (Stadium)'], drop_first=False)

    input_data_aligned = pd.DataFrame(0, index=[0], columns=X_train_cols)
    for col in input_data_encoded.columns:
        if col in X_train_cols:
            input_data_aligned[col] = input_data_encoded[col].iloc[0]

    # Prediction: 1 = BOWL, 0 = BAT
    prediction = toss_model.predict(input_data_aligned)[0]
    return 'BOWL' if prediction == 1 else 'BAT'

def predict_safe_score(batting_team, venue, safe_score_model, X_train_cols):
    """Predicts the safe score for the team batting first."""
    batting_team = str(batting_team)
    venue = str(venue)

    input_data = pd.DataFrame([{'Match Venue (Stadium)': venue, 'Batting_First_Team': batting_team}])

    # OHE and Column Alignment
    input_data_encoded = pd.get_dummies(input_data, columns=['Match Venue (Stadium)', 'Batting_First_Team'], drop_first=False)

    # Ensure columns match training set, filling missing OHE columns with 0
    input_data_aligned = pd.DataFrame(0, index=[0], columns=X_train_cols)

    for col in input_data_encoded.columns:
        if col in X_train_cols:
            input_data_aligned[col] = input_data_encoded[col].iloc[0]

    # Prediction
    predicted_score = int(round(safe_score_model.predict(input_data_aligned)[0]))
    return predicted_score

# --- 2. Main Prediction Function (The one you run) ---

def get_predictions_from_user_corrected():
    # Attempt to access global variables (which should be defined if Cells 3 and 5 ran)
    try:
        winner_cols = X_winner_encoded.columns
        toss_cols = X_toss_encoded.columns
        safe_cols = X_safe_encoded.columns

        # Get representative examples
        example_team1 = str(df_clean['Team1 Name'].iloc[0])
        example_team2 = str(df_clean['Team2 Name'].iloc[1])
        example_venue = str(df_clean['Match Venue (Stadium)'].iloc[2])

    except NameError:
        print("Error: Models and feature columns are not globally available. Ensure Cells 3, 4, and 5 have run successfully.")
        return

    print("\n--- Enter Match Details for Prediction (Use Numerical Codes) ---")

    team1 = input(f"Enter Team 1 Code (e.g., {example_team1}): ")
    team2 = input(f"Enter Team 2 Code (e.g., {example_team2}): ")
    venue = input(f"Enter Venue Code (e.g., {example_venue}): ")

    team1 = str(team1)
    team2 = str(team2)
    venue = str(venue)

    # --- INPUT VALIDATION CHECK ---
    if team1 == team2:
        print("\n Error: Team 1 Code and Team 2 Code cannot be the same. Please enter two different teams.")
        return
    # ------------------------------

    # Check if inputs are in historical data
    known_teams = set(df_clean['Team1 Name'].astype(str).unique()) | set(df_clean['Team2 Name'].astype(str).unique())
    known_venues = set(df_clean['Match Venue (Stadium)'].astype(str).unique())

    if team1 not in known_teams or team2 not in known_teams or venue not in known_venues:
        print("\n‚ö†Ô∏è Warning: One or more inputs are not present in the historical data. Prediction reliability may be low.")

    try:
        # Get Predictions
        winner = predict_match_outcome(team1, team2, venue, best_dt_winner, winner_cols)
        toss_decision = predict_toss_decision(team1, team2, venue, dt_toss, toss_cols)
        safe_score = predict_safe_score(team1, venue, dt_safe_score, safe_cols)

        print("\n======================================================")
        print("üèè AI Decision Tree Model Predictions")
        print("======================================================")
        print(f"1) Predicted Match Winner Code: {winner}")
        print(f"2) Predicted Best Toss Decision (for Toss Winner): {toss_decision}")
        print(f"3) Predicted Safe Score for First Batting Team (Code {team1}): {safe_score} runs")
        print("======================================================")

    except Exception as e:
        print(f"An error occurred during prediction. Please check your inputs and trained models. Error: {e}")

# Run the final prediction function
try:
    # This check relies on the models being available from Cells 4 and 5
    if 'best_dt_winner' in globals() and 'dt_toss' in globals() and 'dt_safe_score' in globals():
        get_predictions_from_user_corrected()
    else:
        print("Error: Models are not yet trained. Please ensure Cells 1 through 5 have run successfully.")
except NameError:
    print("Error: Core variables (df_clean, X_encoded, etc.) are missing. Please ensure all cells have been run sequentially and successfully.")


--- Enter Match Details for Prediction (Use Numerical Codes) ---
Enter Team 1 Code (e.g., 41): 44
Enter Team 2 Code (e.g., 66): 56
Enter Venue Code (e.g., 32): 78

üèè AI Decision Tree Model Predictions
1) Predicted Match Winner Code: 56
2) Predicted Best Toss Decision (for Toss Winner): BOWL
3) Predicted Safe Score for First Batting Team (Code 44): 136 runs
