In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load dataset
df = pd.read_csv("t20i_Matches_Data_with_winner_runs_cleaned.csv")

# Step 2: Define features (X) and target (y)
X = df[['Team1 Runs Scored', 'Team2 Runs Scored', 'Match Venue (Country)','Team1 Name','Team2 Name','Match Winner']]
y = df['Winner Runs Scored']

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Preprocess (OneHotEncode categorical features)
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), X.columns)]
)

# Step 5: Create pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Step 6: Train model
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")


Linear Regression Performance:
Mean Absolute Error (MAE): 0.05
Mean Squared Error (MSE): 0.00
R-squared (R²): 0.71


# After using GridsearchCV

In [1]:
# Step 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 2: Load dataset
df = pd.read_csv("t20i_Matches_Data_with_winner_runs_cleaned.csv")

# Step 3: Features (X) and target (y)
# Keep numeric runs as numeric; encode only the categorical columns.
cat_cols = ['Match Venue (Country)', 'Team1 Name', 'Team2 Name', 'Match Winner']
num_cols = ['Team1 Runs Scored', 'Team2 Runs Scored']

X = df[num_cols + cat_cols].copy()
y = df['Winner Runs Scored'].copy()

# Basic cleaning
for c in cat_cols:
    X[c] = X[c].fillna("Unknown").astype(str)
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors='coerce').fillna(0)

# Step 4: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Step 5: Preprocess
# - Scale numeric features (Ridge is scale-sensitive)
# - One-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Step 6: Pipeline with Ridge (linear model) — we'll tune ONLY alpha
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge(random_state=42))
])

# Step 7: ONE hyperparameter grid (alpha only)
param_grid = {
    "regressor__alpha": [0.0, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]
}

# Use KFold for regression (no stratification needed)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",  # minimize MSE
    cv=cv,
    n_jobs=-1,
    refit=True,
    verbose=1
)

# Step 8: Fit grid search
grid.fit(X_train, y_train)

print("\n=== Grid Search (Single Hyperparameter: alpha) ===")
print(f"Best alpha: {grid.best_params_['regressor__alpha']}")
best_cv_mse = -grid.best_score_
print(f"Best CV MSE : {best_cv_mse:.4f}")
print(f"Best CV RMSE: {np.sqrt(best_cv_mse):.4f}")

# Step 9: Evaluate on test set with the best model
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n--- Final Test Performance (Best alpha) ---")
print(f"MAE : {mae:.2f}")
print(f"MSE : {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²  : {r2:.2f}")


Fitting 5 folds for each of 9 candidates, totalling 45 fits

=== Grid Search (Single Hyperparameter: alpha) ===
Best alpha: 20.0
Best CV MSE : 0.0018
Best CV RMSE: 0.0430

--- Final Test Performance (Best alpha) ---
MAE : 0.03
MSE : 0.00
RMSE: 0.04
R²  : 0.89
