**Rules for the code:**

- Include all the code you used for your report in this file. The code for any section in the report should go under the same section in this file.
- Any missing code will result in -20% from its corresponding section in the report.
- Any irrelevant code will result in -20% from its corresponding section in the report.
- Make sure that you run your code before rendering so that all the necessary visual/numeric outputs are visible.
- Any code that is not properly run or throws errors will be considered missing/irrelevant.

## 4) Data

In [13]:
import pandas as pd
df = pd.read_csv("high_diamond_ranked_10min.csv") 

# print("Dataset Preview:")
# print(df.head())

print(f"Number of Observations: {df.shape[0]}")
print(f"Number of Variables: {df.shape[1]}")

Number of Observations: 9879
Number of Variables: 40


In [14]:
categorical_features = df.select_dtypes(include=['object', 'category']).columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

print("Categorical Features:", len(categorical_features))
print("Numerical Features:", len(numerical_features))

Categorical Features: 0
Numerical Features: 40


In [15]:
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("missing values: \n", missing_values)

missing values: 
 Series([], dtype: int64)


## 5) Prediction

In [16]:
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_curve
from sklearn.linear_model import Ridge, Lasso, LogisticRegression

In [17]:
# Baseline Model with only the strongest predictor (blueGoldDiff)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=2)
X_train = train_df[["blueGoldDiff"]]  # Only blue team variables
y_train = train_df["blueWins"]  # Response variable

X_test = test_df[["blueGoldDiff"]]
y_test = test_df["blueWins"]

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression(penalty=None)
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
y_pred_proba = logreg.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7298
Precision: 0.7286
Recall: 0.7331
ROC-AUC: 0.8080
Confusion Matrix:
[[717 270]
 [264 725]]


In [18]:
all_predictors = [col for col in df.columns if col != "blueWins"]
stepwise_results = []
selected_features = []
for predictor in all_predictors:
    if predictor not in selected_features:
        selected_features.append(predictor)

        # Slice predictors and response
        X_train = train_df[selected_features]
        y_train = train_df["blueWins"]
        X_test = test_df[selected_features]
        y_test = test_df["blueWins"]

        # Standardize the predictors
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train logistic regression model with no regularization
        logreg = LogisticRegression(penalty=None)
        logreg.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = logreg.predict(X_test_scaled)
        y_pred_proba = logreg.predict_proba(X_test_scaled)[:, 1]

        # Compute evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # Store results
        stepwise_results.append({
            "Added Predictor": predictor,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "ROC-AUC": roc_auc
        })

# Convert results into DataFrame and display
stepwise_results_df = pd.DataFrame(stepwise_results)
stepwise_results_df

Unnamed: 0,Added Predictor,Accuracy,Precision,Recall,ROC-AUC
0,gameId,0.489879,0.489572,0.450961,0.491311
1,blueWardsPlaced,0.50253,0.506818,0.22548,0.493609
2,blueWardsDestroyed,0.485324,0.483452,0.413549,0.487393
3,blueFirstBlood,0.602227,0.603255,0.599596,0.597592
4,blueKills,0.648785,0.6551,0.629929,0.710353
5,blueDeaths,0.696356,0.697062,0.695652,0.781864
6,blueAssists,0.698381,0.697487,0.701719,0.781862
7,blueEliteMonsters,0.706478,0.708037,0.703741,0.789241
8,blueDragons,0.708502,0.708797,0.708797,0.789714
9,blueHeralds,0.70749,0.707786,0.707786,0.789701


In [19]:
# Polynomial features
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import RidgeCV, LassoCV, LogisticRegressionCV


X = df.drop(['gameId', 'blueWins'], axis=1)
y = df['blueWins']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
feature_names = poly.get_feature_names_out(X.columns)


scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

Cs = [0.001, 0.01, 0.1, 1, 10, 100]  # 6 hyperparameter values
logregcv = LogisticRegressionCV(penalty="l2", Cs=Cs, cv=10, solver='saga')
logregcv.fit(X_train_poly_scaled, y_train)

best_C = logregcv.C_[0]
print(f"Best C value: {best_C}")

best_cv_score = logregcv.scores_[1].mean(axis=0).max()
print(f"Best CV Score: {best_cv_score:.4f}")

y_pred = logregcv.predict(X_test_poly_scaled)
y_pred_proba = logregcv.predict_proba(X_test_poly_scaled)[:, 1]


Best C value: 0.001
Best CV Score: 0.7311


In [20]:
# compute test performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nTest Performance of the Trained and Tuned Model:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Test Performance of the Trained and Tuned Model:
Accuracy: 0.7379
Precision: 0.7420
Recall: 0.7300
ROC-AUC: 0.8194
Confusion Matrix:
[[736 251]
 [267 722]]


In [21]:
# feature coefficients from the best model
coefficients = logregcv.coef_[0]

# use DataFrame for feature importance
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# sort by absolute coefficient value
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
sorted_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# top 5 most important features
print("\n5 Most Important Features:")
print(sorted_importance.head(5))

# 5 least important features
print("\n5 Least Important Features:")
print(sorted_importance.tail(5))

# exactly zero coefficients
zero_coefs = sorted_importance[sorted_importance['Coefficient'] == 0]
print(f"\nFeatures with Zero Coefficients ({len(zero_coefs)}):")
if len(zero_coefs) > 0:
    print(zero_coefs)
else:
    print("No features have exactly zero coefficients.")



5 Most Important Features:
                                       Feature  Coefficient  Abs_Coefficient
363     blueTowersDestroyed redTowersDestroyed     0.029967         0.029967
595            redWardsPlaced redEliteMonsters    -0.026278         0.026278
138                  blueFirstBlood redHeralds     0.026087         0.026087
480  blueTotalJungleMinionsKilled blueGoldDiff     0.023833         0.023833
499   blueTotalJungleMinionsKilled redGoldDiff    -0.023833         0.023833

5 Least Important Features:
                                 Feature  Coefficient  Abs_Coefficient
698   redEliteMonsters redExperienceDiff    -0.000007         0.000007
535  blueExperienceDiff redEliteMonsters     0.000007         0.000007
333               blueHeralds redHeralds     0.000000         0.000000
302               blueDragons redDragons     0.000000         0.000000
132         blueFirstBlood redFirstBlood     0.000000         0.000000

Features with Zero Coefficients (3):
                 

In [22]:
# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import RidgeCV, LassoCV, LogisticRegressionCV

import warnings
warnings.filterwarnings("ignore")


blue_team_columns = [
    "blueWardsPlaced", "blueWardsDestroyed", "blueFirstBlood",
    "blueKills", "blueDeaths", "blueAssists", "blueEliteMonsters", "blueDragons",
    "blueHeralds", "blueTowersDestroyed", "blueTotalGold", "blueAvgLevel",
    "blueTotalExperience", "blueTotalMinionsKilled", "blueTotalJungleMinionsKilled",
    "blueGoldDiff", "blueExperienceDiff", "blueCSPerMin", "blueGoldPerMin"
]
X = df[blue_team_columns] 
y = df['blueWins']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
feature_names = poly.get_feature_names_out(X.columns)


scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

Cs = [0.001, 0.01, 0.1, 1, 10, 100]  # 6 hyperparameter values
logregcv = LogisticRegressionCV(penalty="l1", Cs=Cs, cv=10, solver='saga')
logregcv.fit(X_train_poly_scaled, y_train)

best_C = logregcv.C_[0]
print(f"Best C value: {best_C}")

best_cv_score = logregcv.scores_[1].mean(axis=0).max()
print(f"Best CV Score: {best_cv_score:.4f}")

y_pred = logregcv.predict(X_test_poly_scaled)
y_pred_proba = logregcv.predict_proba(X_test_poly_scaled)[:, 1]

# feature coefficients from the best model
coefficients = logregcv.coef_[0]

# use DataFrame for feature importance
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# sort by absolute coefficient value
feature_importance['Abs_Coefficient'] = feature_importance['Coefficient'].abs()
sorted_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# top 5 most important features
print("\n5 Most Important Features:")
print(sorted_importance.head(5))

# 5 least important features
print("\n5 Least Important Features:")
print(sorted_importance.tail(5))

# exactly zero coefficients
zero_coefs = sorted_importance[sorted_importance['Coefficient'] == 0]
print(f"\nFeatures with Zero Coefficients ({len(zero_coefs)}):")
if len(zero_coefs) > 0:
    print(zero_coefs)
else:
    print("No features have exactly zero coefficients.")

# compute test performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nTest Performance of the Trained and Tuned Model:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Best C value: 0.01
Best CV Score: 0.7295

5 Most Important Features:
                                       Feature  Coefficient  Abs_Coefficient
15                                blueGoldDiff     0.306995         0.306995
195  blueTotalJungleMinionsKilled blueGoldDiff     0.188068         0.188068
177                  blueAvgLevel blueGoldDiff     0.147541         0.147541
201                  blueGoldDiff blueCSPerMin     0.139341         0.139341
190        blueTotalMinionsKilled blueGoldDiff     0.139341         0.139341

5 Least Important Features:
                         Feature  Coefficient  Abs_Coefficient
74          blueKills blueDeaths          0.0              0.0
75         blueKills blueAssists          0.0              0.0
76   blueKills blueEliteMonsters          0.0              0.0
77         blueKills blueDragons          0.0              0.0
208             blueGoldPerMin^2          0.0              0.0

Features with Zero Coefficients (192):
                      

## 6) Inference

In [23]:
import warnings
warnings.filterwarnings("ignore")
final_predictors = [
    "blueGoldDiff",
    "blueTotalJungleMinionsKilled",
    "blueAvgLevel",
    "blueCSPerMin",
    "blueTotalMinionsKilled"
]

X = df[final_predictors]  # Use only selected predictors
y = df["blueWins"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
feature_names = poly.get_feature_names_out(X.columns)

# Standardization
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

# Train Logistic Regression with Ridge Regularization (L2)
Cs = [0.001, 0.01, 0.1, 1, 10, 100]  # 6 hyperparameter values
logregcv = LogisticRegressionCV(penalty="l2", Cs=Cs, cv=10, solver='saga', max_iter=1000)
logregcv.fit(X_train_poly_scaled, y_train)

# Best hyperparameter value
best_C = logregcv.C_[0]

# Best cross-validation score
best_cv_score = logregcv.scores_[1].mean(axis=0).max()

# Make test predictions
y_pred = logregcv.predict(X_test_poly_scaled)
y_pred_proba = logregcv.predict_proba(X_test_poly_scaled)[:, 1]

# Compute test performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nTest Performance of the Trained and Tuned Model:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)



Test Performance of the Trained and Tuned Model:
Accuracy: 0.7318
Precision: 0.7330
Recall: 0.7300
ROC-AUC: 0.8106
Confusion Matrix:
[[724 263]
 [267 722]]


In [24]:
X_train_poly_df = pd.DataFrame(X_train_poly, columns=feature_names, index=train_df.index)
X_train_poly_df["blueWins"] = train_df["blueWins"]
logit_model = smf.logit(formula = "blueWins ~ blueGoldDiff + blueTotalJungleMinionsKilled + blueAvgLevel + blueCSPerMin + blueTotalMinionsKilled + blueGoldDiff*blueTotalJungleMinionsKilled + blueGoldDiff*blueAvgLevel + blueGoldDiff*blueCSPerMin + blueGoldDiff*blueTotalMinionsKilled", data=X_train_poly_df)
result = logit_model.fit()
print(result.summary())

         Current function value: 0.539110
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               blueWins   No. Observations:                 7903
Model:                          Logit   Df Residuals:                     7893
Method:                           MLE   Df Model:                            9
Date:                Mon, 17 Mar 2025   Pseudo R-squ.:                  0.2222
Time:                        16:17:01   Log-Likelihood:                -4260.6
converged:                      False   LL-Null:                       -5477.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
Intercept                                    -2.1020      0.820     -2.564      0.

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute VIF for each predictor
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train_poly_df.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_poly_df.values, i) for i in range(X_train_poly_df.shape[1])]

print(vif_data.sort_values(by="VIF", ascending=False))

                                              Feature           VIF
8                           blueGoldDiff blueCSPerMin           inf
17                                     blueCSPerMin^2           inf
15                          blueAvgLevel blueCSPerMin           inf
13  blueTotalJungleMinionsKilled blueTotalMinionsK...           inf
12          blueTotalJungleMinionsKilled blueCSPerMin           inf
9                 blueGoldDiff blueTotalMinionsKilled           inf
16                blueAvgLevel blueTotalMinionsKilled           inf
18                blueCSPerMin blueTotalMinionsKilled           inf
19                           blueTotalMinionsKilled^2           inf
4                              blueTotalMinionsKilled           inf
3                                        blueCSPerMin           inf
14                                     blueAvgLevel^2  1.626993e+05
2                                        blueAvgLevel  1.431420e+05
11          blueTotalJungleMinionsKilled blueAvg