In [1]:
# Default Kaggle Python 3 environment (kaggle/python Docker image:
# https://github.com/kaggle/docker-python), which ships with the usual
# analytics libraries pre-installed.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data lives under the read-only "/kaggle/input" directory.
# Walk it recursively and print the full path of every file found.

import os
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Up to 20GB written to /kaggle/working/ is preserved as output when a version
# is created with "Save & Run All".
# Files written to /kaggle/temp/ are temporary and are not kept after the session.

/kaggle/input/cricket-dataset/y_train1.csv
/kaggle/input/cricket-dataset/X_train1.csv
/kaggle/input/cricket-dataset/X_test1.csv
/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv
/kaggle/input/cricket-dataset/y_test1.csv


In [None]:
!pip install tabpfn


In [None]:
import pandas as pd
import joblib
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tabpfn import TabPFNClassifier

# Baseline: predict whether Team1 wins an ODI match with a single TabPFN model.

# Load dataset
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# Clean: -99 in Avg_Temp_C is treated as missing, so turn it into NaN first and
# then drop every row lacking a winner, a team name or a temperature.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
})

# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(
    lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0,
    axis=1
)
# won_toss = 1 when Team1 won the toss (a missing toss winner compares unequal -> 0).
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])
df['rank_diff'] = df['team2_rank'] - df['team1_rank']
# target = 1 when Team1 is the recorded winner, otherwise 0.
df['target'] = (df['Winner'] == df['Team1']).astype(int)

# Final features
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

# Split BEFORE scaling: fitting the scaler on all rows would let test-set
# statistics (mean/std) leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train TabPFN once (the original cell fitted an identical model a second time
# before saving, which only repeated the work).
tabpfn_clf = TabPFNClassifier(device="cuda" if torch.cuda.is_available() else "cpu")
tabpfn_clf.fit(X_train, y_train)

# Predict & Evaluate
y_pred = tabpfn_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model to disk, together with the scaler it was trained with: the model
# expects scaled inputs, so it cannot be reused on raw features without it.
joblib.dump(tabpfn_clf, "tabpfn_model.pkl")
joblib.dump(scaler, "tabpfn_scaler.pkl")
print("✅ Model saved as tabpfn_model.pkl")


In [None]:
# cricket_ensemble_pipeline.py
# Weighted-average ensemble (XGBoost + Random Forest + TabPFN) predicting
# whether Team1 wins an ODI match.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tabpfn import TabPFNClassifier
import torch
import joblib

# ------------------ Step 1: Load & Clean ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# -99 in Avg_Temp_C is treated as missing; rows without a winner, a team name
# or a temperature are dropped.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
})

# ------------------ Step 2: Feature Engineering ------------------
# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])
df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Preprocessing ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

# Split first, then fit the scaler on the training rows only, so that no
# test-set statistics leak into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ------------------ Step 4: Model Training ------------------
# XGBoost with hyperparameter tuning
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_probs = xgb_grid.predict_proba(X_test)[:, 1]  # P(Team1 wins)

# Random Forest with hyperparameter tuning
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, None],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_probs = rf_grid.predict_proba(X_test)[:, 1]

# TabPFN (no tuning required)
tabpfn = TabPFNClassifier(device="cuda" if torch.cuda.is_available() else "cpu")
tabpfn.fit(X_train, y_train)
tabpfn_probs = tabpfn.predict_proba(X_test)[:, 1]

# ------------------ Step 5: Ensemble Prediction ------------------
# Weighted average of the three positive-class probabilities (weights sum to 1),
# thresholded at 0.5.
ensemble_probs = (0.4 * xgb_probs) + (0.3 * rf_probs) + (0.3 * tabpfn_probs)
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# ------------------ Step 6: Evaluation ------------------
print("\n\n🎯 Ensemble Accuracy:", accuracy_score(y_test, ensemble_preds))
print("\n📊 Classification Report:")
print(classification_report(y_test, ensemble_preds))

# ------------------ Step 7: Save Models ------------------
# The scaler is saved alongside the models because they expect scaled inputs.
joblib.dump(xgb_grid.best_estimator_, "xgb_model.pkl")
joblib.dump(rf_grid.best_estimator_, "rf_model.pkl")
joblib.dump(tabpfn, "tabpfn_model.pkl")
joblib.dump(scaler, "scaler.pkl")


In [None]:
# cricket_ensemble_pipeline_optimized.py
# Stacked ensemble: XGBoost, Random Forest and TabPFN feed a logistic-regression
# meta learner that predicts whether Team1 wins an ODI match.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tabpfn import TabPFNClassifier
import torch
import joblib


def _oof_positive_proba(make_model, X, y, cv):
    """Return out-of-fold P(class 1) for every row of ``X``.

    For each split produced by ``cv`` a fresh model from ``make_model()`` is
    fitted on the fitting part and used to score the held-out part, so each
    row's probability comes from a model that never saw that row.

    Parameters: ``make_model`` is a zero-argument factory returning an unfitted
    classifier with ``fit``/``predict_proba``; ``X`` and ``y`` are NumPy arrays
    indexed positionally; ``cv`` is a splitter exposing ``split(X, y)``.
    """
    oof = np.zeros(len(y), dtype=float)
    for fit_idx, hold_idx in cv.split(X, y):
        fold_model = make_model()
        fold_model.fit(X[fit_idx], y[fit_idx])
        oof[hold_idx] = fold_model.predict_proba(X[hold_idx])[:, 1]
    return oof


# ------------------ Step 1: Load & Clean ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# -99 in Avg_Temp_C is treated as missing; rows without a winner, a team name
# or a temperature are dropped.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
})

# ------------------ Step 2: Feature Engineering ------------------
# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])
df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['rank_ratio'] = df['team1_rank'] / df['team2_rank']  # ranks start at 1, so never /0
df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Preprocessing ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff', 'rank_ratio']
X = df[features]
y = df['target']

# Split first, then fit the scaler on the training rows only, so that no
# test-set statistics leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ------------------ Step 4: Model Training ------------------
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
device = "cuda" if torch.cuda.is_available() else "cpu"

# XGBoost
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_grid = GridSearchCV(xgb, xgb_params, cv=kf, scoring='accuracy', verbose=1, n_jobs=-1)
xgb_grid.fit(X_train, y_train)
xgb_probs = xgb_grid.predict_proba(X_test)[:, 1]

# Random Forest
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=kf, scoring='accuracy', verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_probs = rf_grid.predict_proba(X_test)[:, 1]

# TabPFN
tabpfn = TabPFNClassifier(device=device)
tabpfn.fit(X_train, y_train)
tabpfn_probs = tabpfn.predict_proba(X_test)[:, 1]

# ------------------ Step 5: Ensemble via Meta Learner ------------------
# The meta learner must never be fitted on the test labels: the original cell
# trained it on (test predictions, y_test) and then scored it on the very same
# rows, so the reported accuracy was an in-sample number. Instead it is trained
# on out-of-fold base-model probabilities from the TRAINING set, and the test
# set is used only for the final evaluation.
y_train_arr = y_train.to_numpy()
meta_train_X = np.column_stack([
    _oof_positive_proba(
        lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', **xgb_grid.best_params_),
        X_train, y_train_arr, kf,
    ),
    _oof_positive_proba(
        lambda: RandomForestClassifier(random_state=42, **rf_grid.best_params_),
        X_train, y_train_arr, kf,
    ),
    _oof_positive_proba(
        lambda: TabPFNClassifier(device=device),
        X_train, y_train_arr, kf,
    ),
])
meta_clf = LogisticRegression()
meta_clf.fit(meta_train_X, y_train_arr)

# Test-set meta features: same column order as meta_train_X (xgb, rf, tabpfn).
meta_X = np.vstack((xgb_probs, rf_probs, tabpfn_probs)).T
meta_probs = meta_clf.predict_proba(meta_X)[:, 1]
ensemble_preds = (meta_probs > 0.5).astype(int)

# ------------------ Step 6: Evaluation ------------------
print("\n\n🎯 Ensemble Accuracy:", accuracy_score(y_test, ensemble_preds))
print("\n📊 Classification Report:")
print(classification_report(y_test, ensemble_preds))

# ------------------ Step 7: Save Models ------------------
# The scaler is saved alongside the models because they expect scaled inputs.
joblib.dump(xgb_grid.best_estimator_, "xgb_model.pkl")
joblib.dump(rf_grid.best_estimator_, "rf_model.pkl")
joblib.dump(tabpfn, "tabpfn_model.pkl")
joblib.dump(meta_clf, "meta_ensemble_model.pkl")
joblib.dump(scaler, "scaler.pkl")


In [None]:
pip install autogluon


In [None]:
pip install xgboost==1.7.6 scikit-learn==1.3.2 --upgrade


In [None]:
!apt-get install -y graphviz graphviz-dev
!pip install pygraphviz


In [None]:
# cricket_autogluon_pipeline.py
# Train an AutoGluon tabular predictor for "does Team1 win?" on an 80/20 split.
import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

# ------------------ Step 1: Load & Clean ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")
# -99 in Avg_Temp_C is treated as missing. np.nan is used (not pd.NA) because
# replacing values in a float column with pd.NA can upcast the column to object
# dtype, and the temperature would then no longer be a numeric feature.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
})

# ------------------ Step 2: Feature Engineering ------------------
# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)
# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Prepare Dataset for AutoGluon ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
target = 'target'
data = df[features + [target]]

# Split train/test: random 80% sample for training, the remaining rows for test.
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# ------------------ Step 4: AutoGluon Training ------------------
predictor = TabularPredictor(label=target, eval_metric='accuracy', path='autogluon_models')
predictor.fit(train_data, time_limit=600, presets='best_quality')  # 10 min max

# ------------------ Step 5: Evaluation ------------------
print("\n📊 AutoGluon Evaluation:")
performance = predictor.evaluate(test_data)
print(performance)  # the original computed this but never displayed it

# ------------------ Step 6: Best Model & Ensemble Info ------------------
# Leaderboard without test data; the first row is reported as the best model.
lb = predictor.leaderboard(silent=True)
best_model = lb.loc[0, 'model']
print("🏆 Best Model Used:", best_model)

# Plotting the ensemble needs pygraphviz; guard it so a missing optional
# dependency cannot abort the cell before the model is saved.
try:
    predictor.plot_ensemble_model()
except ImportError:
    print("\n📉 To visualize ensemble structure, install pygraphviz:\n"
          "!apt-get install -y graphviz graphviz-dev && pip install pygraphviz")

# ------------------ Step 7: Save Model ------------------
# NOTE(review): the pickle is kept for backward compatibility, but the trained
# models live in the 'autogluon_models' directory; confirm the pickle alone is
# loadable elsewhere before relying on it (TabularPredictor.load on the
# directory is the supported route).
joblib.dump(predictor, 'autogluon_predictor.pkl')
print("\n✅ Predictor also saved as autogluon_predictor.pkl")

predictor.save()
print("\n✅ Model saved at: autogluon_models/")


In [None]:
# cricket_autogluon_pipeline.py
# Train an AutoGluon tabular predictor for "does Team1 win?" and report the
# per-model leaderboard on the held-out 20%.
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

# ------------------ Step 1: Load & Clean ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# -99 in Avg_Temp_C is treated as missing. np.nan is used (not pd.NA) because
# replacing values in a float column with pd.NA can upcast the column to object
# dtype, and the temperature would then no longer be a numeric feature.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
})

# ------------------ Step 2: Feature Engineering ------------------
# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Prepare Dataset for AutoGluon ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
target = 'target'
data = df[features + [target]]

# Split train/test: random 80% sample for training, the remaining rows for test.
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)

# ------------------ Step 4: AutoGluon Training ------------------
predictor = TabularPredictor(label=target, eval_metric='accuracy', path='autogluon_models')
predictor.fit(train_data, time_limit=600, presets='best_quality')  # Best models, 10 mins

# ------------------ Step 5: Evaluation ------------------
performance = predictor.evaluate(test_data)
print("🎯 AutoGluon Performance:")
print(performance)

# ------------------ Step 6: Leaderboard & Best Model ------------------
print("\n📊 Leaderboard:")
leaderboard_df = predictor.leaderboard(test_data, silent=True)
print(leaderboard_df)

# Get best model.
# NOTE(review): this picks the row with the highest score on the TEST set,
# which is not necessarily the model the predictor uses by default; treat it
# as a report only, not as a model-selection step.
best_model = leaderboard_df.loc[leaderboard_df['score_test'].idxmax(), 'model']
print(f"\n🏆 Best Model Used: {best_model}")

# ------------------ Step 7: Save Predictor ------------------
predictor.save()
print("\n✅ Predictor saved to 'autogluon_models' folder")

# ------------------ Optional: Plot Ensemble Network ------------------
# Plotting is optional: a missing pygraphviz only produces an install hint.
try:
    predictor.plot_ensemble_model()
except ImportError:
    print("\n📉 To visualize ensemble structure, install pygraphviz:\n"
          "!apt-get install -y graphviz graphviz-dev && pip install pygraphviz")


In [None]:
# cricket_autogluon_full_pipeline.py
# 5-fold cross-validation of an AutoGluon predictor for "does Team1 win?",
# followed by a final fit on the complete dataset.
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import KFold
import numpy as np
import os

# ------------------ Step 1: Load Dataset ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# -99 in Avg_Temp_C is treated as missing. np.nan is used (not pd.NA) because
# replacing values in a float column with pd.NA can upcast the column to object
# dtype, and the temperature would then no longer be a numeric feature.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry',
    'Toss Winner Choice': 'TossDecision',
})

# ------------------ Step 2: Feature Engineering ------------------
# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)
# toss_bat = 1 when Team1 won the toss and the recorded choice is "bat"
# (case-insensitive). The comparison is vectorized via astype(str) so a missing
# toss decision yields 0 instead of raising AttributeError on float NaN.
chose_bat = df['TossDecision'].astype(str).str.lower().eq('bat')
df['toss_bat'] = (chose_bat & df['TossWinner'].eq(df['Team1'])).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']
# Unparseable dates become NaT (errors='coerce') and those rows are dropped.
df['month'] = pd.to_datetime(df['Match Date'], errors='coerce').dt.month
df = df.dropna(subset=['month'])

df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Select Features ------------------
features = [
    'Avg_Temp_C', 'home_advantage', 'won_toss', 'toss_bat',
    'team1_rank', 'team2_rank', 'rank_diff', 'month'
]
target = 'target'
data = df[features + [target]]

# ------------------ Step 4: K-Fold Cross-Validation ------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

# One independent predictor per fold, each stored in its own directory.
for fold, (train_idx, test_idx) in enumerate(kf.split(data), start=1):
    print(f"\n📂 Training Fold {fold}...")
    train_data = data.iloc[train_idx]
    test_data = data.iloc[test_idx]

    fold_path = f"autogluon_cv_models/fold_{fold}/"
    os.makedirs(fold_path, exist_ok=True)

    predictor = TabularPredictor(label=target, path=fold_path, eval_metric='accuracy').fit(
        train_data=train_data,
        time_limit=600,
        presets='best_quality',
        num_stack_levels=2,
        num_bag_folds=5,
        verbosity=2
    )

    performance = predictor.evaluate(test_data)
    print("🎯 Fold Performance:", performance)
    cv_scores.append(performance['accuracy'])

# ------------------ Step 5: CV Results ------------------
print("\n📊 Cross-Validation Accuracy Scores:", cv_scores)
print(f"✅ Average CV Accuracy: {np.mean(cv_scores):.4f}")

# ------------------ Step 6: Train Final Model on All Data ------------------
# The final model sees every row, so the CV scores above are the only
# held-out estimate of its quality.
print("\n🚀 Training Final Model on Full Dataset...")
final_path = "autogluon_final_model/"
os.makedirs(final_path, exist_ok=True)

predictor_final = TabularPredictor(label=target, path=final_path, eval_metric='accuracy').fit(
    train_data=data,
    time_limit=900,
    presets='best_quality',
    num_stack_levels=2,
    num_bag_folds=5,
    verbosity=2
)
predictor_final.save()
print("\n✅ Final model saved to:", final_path)


In [None]:
# evaluate_cricket_model.py
# Reload the final AutoGluon predictor and score it on the full dataset.
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

# ------------------ Step 1: Load Final Model ------------------
predictor = TabularPredictor.load("autogluon_final_model/")
print("✅ Final model loaded.")

# ------------------ Step 2: Load Dataset ------------------
df = pd.read_csv("/kaggle/input/cricket-dataset/final_odi_matches_with_full_weather2.csv")

# Apply same preprocessing as the training cell.
# -99 in Avg_Temp_C is treated as missing. np.nan is used (not pd.NA) because
# replacing values in a float column with pd.NA can upcast the column to object
# dtype, and the temperature would then no longer be a numeric feature.
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])

df = df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry',
    'Toss Winner Choice': 'TossDecision',
})

# home_advantage = 1 when the venue country name occurs inside Team1's name
# (case-insensitive substring match); rows without a venue get 0.
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = (df['TossWinner'] == df['Team1']).astype(int)
# toss_bat = 1 when Team1 won the toss and the recorded choice is "bat"
# (case-insensitive). The comparison is vectorized via astype(str) so a missing
# toss decision yields 0 instead of raising AttributeError on float NaN.
chose_bat = df['TossDecision'].astype(str).str.lower().eq('bat')
df['toss_bat'] = (chose_bat & df['TossWinner'].eq(df['Team1'])).astype(int)

# Fixed rank table; matches involving a team that is not listed are dropped below.
team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']
# Unparseable dates become NaT (errors='coerce') and those rows are dropped.
df['month'] = pd.to_datetime(df['Match Date'], errors='coerce').dt.month
df = df.dropna(subset=['month'])

df['target'] = (df['Winner'] == df['Team1']).astype(int)

# ------------------ Step 3: Define Features & Evaluate ------------------
features = [
    'Avg_Temp_C', 'home_advantage', 'won_toss', 'toss_bat',
    'team1_rank', 'team2_rank', 'rank_diff', 'month'
]
data = df[features + ['target']]

# NOTE(review): this is the same CSV the final model is fitted on elsewhere in
# this notebook, so these numbers are in-sample and will overstate how well the
# model generalizes; use the cross-validation scores for that.
performance = predictor.evaluate(data)
print("\n🎯 Full Dataset Evaluation:")
for metric, value in performance.items():
    print(f"{metric}: {value:.4f}")


In [None]:
# predict_new_match.py
# Score one hand-written match scenario with the saved final predictor.
import pandas as pd
from autogluon.tabular import TabularPredictor

# ------------------ Step 1: Load Final Model ------------------
predictor = TabularPredictor.load("autogluon_final_model/")
print("✅ Model loaded for prediction.")

# ------------------ Step 2: Define New Match Scenario ------------------
# Example values only; substitute the real match details before use.
match_features = {
    'Avg_Temp_C': 28,
    'home_advantage': 1,  # Team1 plays in its home country
    'won_toss': 1,        # Team1 won the toss
    'toss_bat': 1,        # Team1 won the toss and elected to bat
    'team1_rank': 1,      # rank of Team1
    'team2_rank': 3,      # rank of Team2
    'rank_diff': 3 - 1,   # team2_rank minus team1_rank
    'month': 10           # October
}
new_match = pd.DataFrame([match_features])

# ------------------ Step 3: Predict ------------------
prediction = predictor.predict(new_match)
probabilities = predictor.predict_proba(new_match)

# ------------------ Step 4: Display Results ------------------
# Label 1 means Team1 is predicted to win; any other label is reported as Team2.
if prediction[0] == 1:
    outcome = "Team1 Wins"
else:
    outcome = "Team2 Wins"

print("\n🔮 Prediction Result:")
print(outcome)
print("\n📊 Probabilities:")
print(probabilities)

# Optional: persist the scenario together with its predicted label.
# new_match['predicted_winner'] = prediction
# new_match.to_csv("new_match_prediction.csv", index=False)


In [8]:
import shutil

# Pack the uploaded final-model directory from the input area into a ZIP in the
# working directory. make_archive appends ".zip" to the base name itself.
final_model_dir = "/kaggle/input/dd/other/default/1/autogluon_final_model"
final_zip_base = "/kaggle/working/autogluon_final_model"
shutil.make_archive(final_zip_base, 'zip', final_model_dir)

print("✅ Model zipped to /kaggle/working/autogluon_final_model.zip")


✅ Model zipped to /kaggle/working/autogluon_final_model.zip


In [7]:
import shutil

# Zip the uploaded 'autogluon_models' directory from input into working.
# make_archive appends ".zip" to base_name and returns the archive's path, so
# the message below always names the file that was actually written (the
# original message reported autogluon_final_model.zip, a different archive).
archive_path = shutil.make_archive(
    base_name="/kaggle/working/autogluon_model",      # ZIP path without the extension
    format='zip',                                     # zip format
    root_dir="/kaggle/input/dd/other/default/1/autogluon_models"  # uploaded model path
)

print(f"✅ Model zipped to {archive_path}")


✅ Model zipped to /kaggle/working/autogluon_final_model.zip


In [9]:
import shutil

# Zip the uploaded cross-validation model directory from input into working.
# make_archive appends ".zip" to base_name and returns the archive's path, so
# the message below always names the file that was actually written (the
# original message reported autogluon_final_model.zip, a different archive).
archive_path = shutil.make_archive(
    base_name="/kaggle/working/autogluon_model-CV",   # ZIP path without the extension
    format='zip',                                     # zip format
    root_dir="/kaggle/input/dd/other/default/1/autogluon_cv_models"  # uploaded model path
)

print(f"✅ Model zipped to {archive_path}")


✅ Model zipped to /kaggle/working/autogluon_final_model.zip
