In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter

# ------------------ Step 1: Load & Clean ------------------
df = pd.read_csv("final_odi_matches_with_full_weather2.csv")

# Drop rows with critical missing values
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name'])

# Replace temperature placeholder -99 with NaN and drop those rows
df['Avg_Temp_C'] = pd.to_numeric(df['Avg_Temp_C'], errors='coerce')
df = df[df['Avg_Temp_C'] != -99]
df = df.dropna(subset=['Avg_Temp_C'])

# Rename columns for simplicity
df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
}, inplace=True)

# ------------------ Step 2: Feature Engineering ------------------

# Home advantage: check if venue country matches Team1
df['home_advantage'] = df.apply(
    lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1
)

# Toss won by Team1
df['won_toss'] = df.apply(lambda row: 1 if row['TossWinner'] == row['Team1'] else 0, axis=1)

# Fake ICC rankings (adjust as needed)
icc_ranks = {
    'India': 1,
    'New Zealand': 2,
    'Australia': 3,
    'Sri Lanka': 4,
    'Pakistan': 5,
    'South Africa': 6,
    'Afghanistan': 7,
    'England': 8,
    'West Indies': 9,
    'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(icc_ranks)
df['team2_rank'] = df['Team2'].map(icc_ranks)

# Remove matches with unknown team ranks
df = df.dropna(subset=['team1_rank', 'team2_rank'])

# Rank difference feature
df['rank_diff'] = df['team2_rank'] - df['team1_rank']

# Binary target: Did Team1 win?
df['target'] = df.apply(lambda row: 1 if row['Winner'] == row['Team1'] else 0, axis=1)

# ------------------ Step 3: Feature Selection ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

# ------------------ Step 4: Preprocessing for SVM/XGBoost ------------------

# Normalize features (important for SVM, not strictly needed for XGBoost but keeps pipeline consistent)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Optional: balance the dataset if class imbalance is present
class_counts = Counter(y)
print("Class distribution:", class_counts)

# Export preprocessed data
X_scaled_df = pd.DataFrame(X_scaled, columns=features)
X_scaled_df['target'] = y.values
X_scaled_df.to_csv("preprocessed_cricket_data.csv", index=False)

print("✅ Preprocessing complete. Saved to 'preprocessed_cricket_data.csv'")


Class distribution: Counter({1: 2125, 0: 2114})
✅ Preprocessing complete. Saved to 'preprocessed_cricket_data.csv'


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------ Step 1: Load Preprocessed Data ------------------
data = pd.read_csv("preprocessed_cricket_data.csv")
X = data.drop(columns=['target'])
y = data['target']

# ------------------ Step 2: Train/Test Split ------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------ Step 3: SVM Training ------------------
print("\n--- SVM Training ---")
svm = SVC(kernel='rbf', probability=True)
svm_params = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy', verbose=1)
svm_grid.fit(X_train, y_train)

svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Report:\n", classification_report(y_test, y_pred_svm))

# ------------------ Step 4: XGBoost Training ------------------
print("\n--- XGBoost Training ---")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', tree_method='hist')
xgb_params = {
    'n_estimators': [100],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='accuracy', verbose=1)
xgb_grid.fit(X_train, y_train)

xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Report:\n", classification_report(y_test, y_pred_xgb))

# ------------------ Step 5: Optional Save ------------------
import joblib
joblib.dump(svm_best, "svm_model.pkl")
joblib.dump(xgb_best, "xgboost_model.pkl")



--- SVM Training ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits
SVM Accuracy: 0.6049528301886793
SVM Report:
               precision    recall  f1-score   support

           0       0.60      0.61      0.61       423
           1       0.61      0.60      0.60       425

    accuracy                           0.60       848
   macro avg       0.60      0.60      0.60       848
weighted avg       0.60      0.60      0.60       848


--- XGBoost Training ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBoost Accuracy: 0.6367924528301887
XGBoost Report:
               precision    recall  f1-score   support

           0       0.64      0.62      0.63       423
           1       0.63      0.65      0.64       425

    accuracy                           0.64       848
   macro avg       0.64      0.64      0.64       848
weighted avg       0.64      0.64      0.64       848



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['xgboost_model.pkl']

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ------------------ Step 1: Load & Clean Dataset ------------------
df = pd.read_csv("final_odi_matches_with_full_weather2.csv")

# Drop missing values in critical fields
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Avg_Temp_C'])

# Rename for consistency
df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
}, inplace=True)

# ------------------ Step 2: Feature Engineering ------------------
df['home_advantage'] = df.apply(
    lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0,
    axis=1
)
df['won_toss'] = df.apply(lambda row: 1 if row['TossWinner'] == row['Team1'] else 0, axis=1)

# Updated ICC ranking from image
team_rank = {
    'India': 1,
    'New Zealand': 2,
    'Australia': 3,
    'Sri Lanka': 4,
    'Pakistan': 5,
    'South Africa': 6,
    'Afghanistan': 7,
    'England': 8,
    'West Indies': 9,
    'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']

# Target variable: Did Team1 win?
df['target'] = df.apply(lambda row: 1 if row['Winner'] == row['Team1'] else 0, axis=1)

# ------------------ Step 3: Preprocessing ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------ Step 4: SVM Training ------------------
print("\n--- SVM Training ---")
svm = SVC(kernel='rbf', probability=True)
svm_params = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy', verbose=1)
svm_grid.fit(X_train, y_train)
svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Report:\n", classification_report(y_test, y_pred_svm))

# ------------------ Step 5: XGBoost Training ------------------
print("\n--- XGBoost Training ---")
xgb = XGBClassifier(eval_metric='logloss', tree_method='hist', device='cuda')
xgb_params = {
    'n_estimators': [100],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='accuracy', verbose=1)
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Report:\n", classification_report(y_test, y_pred_xgb))

# ------------------ Step 6: Save Models ------------------
joblib.dump(svm_best, "svm_model.pkl")
joblib.dump(xgb_best, "xgboost_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("\nModels and scaler saved successfully.")



--- SVM Training ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits
SVM Accuracy: 0.6049528301886793
SVM Report:
               precision    recall  f1-score   support

           0       0.60      0.61      0.61       423
           1       0.61      0.60      0.60       425

    accuracy                           0.60       848
   macro avg       0.60      0.60      0.60       848
weighted avg       0.60      0.60      0.60       848


--- XGBoost Training ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost Accuracy: 0.6438679245283019
XGBoost Report:
               precision    recall  f1-score   support

           0       0.65      0.61      0.63       423
           1       0.64      0.67      0.65       425

    accuracy                           0.64       848
   macro avg       0.64      0.64      0.64       848
weighted avg       0.64      0.64      0.64       848


Models and scaler saved successfully.


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ------------------ Step 1: Load & Clean Dataset ------------------
df = pd.read_csv("final_odi_matches_with_full_weather2.csv")

# Drop missing values in critical fields
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Avg_Temp_C'])

# Rename for consistency
df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
}, inplace=True)

# ------------------ Step 2: Feature Engineering ------------------
df['home_advantage'] = df.apply(
    lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0,
    axis=1
)
df['won_toss'] = df.apply(lambda row: 1 if row['TossWinner'] == row['Team1'] else 0, axis=1)

# Updated ICC ranking from image
team_rank = {
    'India': 1,
    'New Zealand': 2,
    'Australia': 3,
    'Sri Lanka': 4,
    'Pakistan': 5,
    'South Africa': 6,
    'Afghanistan': 7,
    'England': 8,
    'West Indies': 9,
    'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']

# Target variable: Did Team1 win?
df['target'] = df.apply(lambda row: 1 if row['Winner'] == row['Team1'] else 0, axis=1)

# ------------------ Step 3: Preprocessing ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------ Step 4: Random Forest Training ------------------
print("\n--- Random Forest Training ---")
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

# ------------------ Step 5: Save Model ------------------
joblib.dump(rf_best, "random_forest_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("\nRandom Forest model and scaler saved.")



--- Random Forest Training ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Random Forest Accuracy: 0.6615566037735849
Random Forest Report:
               precision    recall  f1-score   support

           0       0.68      0.62      0.65       423
           1       0.65      0.70      0.68       425

    accuracy                           0.66       848
   macro avg       0.66      0.66      0.66       848
weighted avg       0.66      0.66      0.66       848


Random Forest model and scaler saved.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import joblib

# ------------------ Step 1: Load & Clean Dataset ------------------
df = pd.read_csv("final_odi_matches_with_full_weather2.csv")

df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name', 'Avg_Temp_C'])
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Avg_Temp_C'])

df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
}, inplace=True)

# ------------------ Step 2: Feature Engineering ------------------
df['home_advantage'] = df.apply(
    lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0,
    axis=1
)
df['won_toss'] = df.apply(lambda row: 1 if row['TossWinner'] == row['Team1'] else 0, axis=1)

team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])
df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['target'] = df.apply(lambda row: 1 if row['Winner'] == row['Team1'] else 0, axis=1)

# ------------------ Step 3: Preprocessing ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------ Step 4: Define Base Models ------------------
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
xgb = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss', random_state=42)
svm = SVC(C=1.0, probability=True, kernel='rbf', random_state=42)

# ------------------ Step 5: Ensemble Model ------------------
ensemble = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('svm', svm)],
    voting='soft'  # Use predicted probabilities
)

ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print("\n--- Ensemble Model Evaluation ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ------------------ Step 6: Save Ensemble ------------------
joblib.dump(ensemble, "ensemble_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("\nEnsemble model and scaler saved.")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Ensemble Model Evaluation ---
Accuracy: 0.6509433962264151
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.63      0.64       423
           1       0.64      0.68      0.66       425

    accuracy                           0.65       848
   macro avg       0.65      0.65      0.65       848
weighted avg       0.65      0.65      0.65       848


Ensemble model and scaler saved.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
import joblib

# ------------------ Step 1: Load Dataset ------------------
df = pd.read_csv("final_odi_matches_with_full_weather2.csv")

# ------------------ Step 2: Preprocessing ------------------
df = df.dropna(subset=['Match Winner', 'Team1 Name', 'Team2 Name'])
df['Avg_Temp_C'] = df['Avg_Temp_C'].replace(-99, np.nan)
df = df.dropna(subset=['Avg_Temp_C'])

df.rename(columns={
    'Team1 Name': 'Team1',
    'Team2 Name': 'Team2',
    'Match Winner': 'Winner',
    'Toss Winner': 'TossWinner',
    'Match Venue (Country)': 'VenueCountry'
}, inplace=True)

# Feature engineering
df['home_advantage'] = df.apply(lambda row: 1 if pd.notna(row['VenueCountry']) and row['VenueCountry'].lower() in row['Team1'].lower() else 0, axis=1)
df['won_toss'] = df.apply(lambda row: 1 if row['TossWinner'] == row['Team1'] else 0, axis=1)

team_rank = {
    'India': 1, 'New Zealand': 2, 'Australia': 3, 'Sri Lanka': 4,
    'Pakistan': 5, 'South Africa': 6, 'Afghanistan': 7,
    'England': 8, 'West Indies': 9, 'Bangladesh': 10
}
df['team1_rank'] = df['Team1'].map(team_rank)
df['team2_rank'] = df['Team2'].map(team_rank)
df = df.dropna(subset=['team1_rank', 'team2_rank'])

df['rank_diff'] = df['team2_rank'] - df['team1_rank']
df['target'] = df.apply(lambda row: 1 if row['Winner'] == row['Team1'] else 0, axis=1)

# ------------------ Step 3: Split Data ------------------
features = ['Avg_Temp_C', 'home_advantage', 'won_toss', 'team1_rank', 'team2_rank', 'rank_diff']
X = df[features]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ------------------ Step 4: Hyperparameter Tuning ------------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 10, -1],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100]
}

lgbm = LGBMClassifier(boosting_type='gbdt', objective='binary', random_state=42)
grid = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_

# ------------------ Step 5: Evaluate ------------------
y_pred = best_model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ------------------ Step 6: Save Model ------------------
joblib.dump(best_model, "lightgbmlarge_tuned_model.pkl")
print("📦 Model saved as lightgbmlarge_tuned_model.pkl")


ModuleNotFoundError: No module named 'lightgbm'