In [2]:
# some useful mysklearn package import statements and reloads
import importlib
import tabulate
from collections import Counter


import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# uncomment once you paste your mypytable.py into mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable

# uncomment once you paste your myclassifiers.py into mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

## Data Loading and Preprocessing
We start by loading the dataset and preparing it for classification. This includes:
- Removing unnecessary columns (`Season`, `HomeTeam`, `AwayTeam`).
- Checking for missing values.
- Splitting features and labels.


In [3]:
filename = "input_data/tournament_games2016-2021.csv"
table = MyPyTable()
table.load_from_file(filename)

table.drop_rows([table.column_names.index("Season"),
                 table.column_names.index("HomeTeam"),
                 table.column_names.index("AwayTeam")])

table.remove_rows_with_missing_values()

X = [row[:-1] for row in table.data]
y = [row[-1] for row in table.data]

# Display dataset statistics
print("Dataset Shape:", table.get_shape())
print("Target Distribution:", Counter(y))

Dataset Shape: (331, 18)
Target Distribution: Counter({'H': 174, 'A': 157})


## Step 1: Using Only `TournamentSeed`
This step uses the `TournamentSeed` feature to build classifiers. The models tested are:
1. Dummy Classifier
2. k-Nearest Neighbors (kNN)
3. Naive Bayes
4. Decision Tree

We evaluate the models using stratified 10-fold cross-validation.


In [4]:
# Feature extraction for Step 1
X_seed_only = [[row[-2]] for row in X]  # Extract only the "TournamentSeed" feature

# Encode categorical features
X_encoded = myutils.encode_categorical_features(X_seed_only)

# Stratified 10-fold cross-validation
n_splits = 10
folds = myevaluation.stratified_kfold_split(X_encoded, y, n_splits=n_splits, random_state=42)

# Define classifiers
dummy = MyDummyClassifier()
knn = MyKNeighborsClassifier(n_neighbors=3)
nb = MyNaiveBayesClassifier()
dt = MyDecisionTreeClassifier()

# Evaluate classifiers
classifiers = {
    "Dummy Classifier": MyDummyClassifier(),
    "KNN Classifier": MyKNeighborsClassifier(n_neighbors=3),
    "Naive Bayes Classifier": MyNaiveBayesClassifier(),
    "Decision Tree Classifier": MyDecisionTreeClassifier(),
}

results = {}

for name, clf in classifiers.items():
    metrics = myevaluation.cross_validate(clf, X_encoded, y, folds)
    results[name] = metrics

# Print results
for model, metrics in results.items():
    print(f"\nModel: {model}")
    metrics_table = [[key, value] for key, value in metrics.items()]
    print(tabulate.tabulate(metrics_table, headers=["Metric", "Value"], floatfmt=".2f"))


Model: Dummy Classifier
Metric       Value
---------  -------
accuracy      0.53
precision     0.53
recall        1.00
f1            0.69

Model: KNN Classifier
Metric       Value
---------  -------
accuracy      0.47
precision     0.00
recall        0.00
f1            0.00

Model: Naive Bayes Classifier
Metric       Value
---------  -------
accuracy      0.69
precision     0.69
recall        0.76
f1            0.72

Model: Decision Tree Classifier
Metric       Value
---------  -------
accuracy      0.69
precision     0.69
recall        0.76
f1            0.72


# Step 2: Using a Feature Subset of Your Choosing


In [5]:
# Choose a subset of features for Step 2
selected_features = [
    "RegularEndingWStreak",
    "RegularSeasonFGPercentMean",
    "LastOrdinalRank"
]

# Filter the data to include only the selected features
feature_indices = [table.column_names.index(feature) for feature in selected_features]
X_subset = [[row[i] for i in feature_indices] for row in table.data]

# Encode the selected features
X_encoded_subset = myutils.encode_categorical_features(X_subset)

# Stratified k-fold cross-validation
folds_subset = myevaluation.stratified_kfold_split(X_encoded_subset, y, n_splits=n_splits, random_state=42)

# Evaluate classifiers on the selected features
results_subset = {}
for name, clf in classifiers.items():
    print(f"Evaluating {name} on selected features...")
    metrics = myevaluation.cross_validate(clf, X_encoded_subset, y, folds_subset)
    results_subset[name] = metrics

# Display results
for model, metrics in results_subset.items():
    print(f"\nModel: {model}")
    metrics_table = [[key, float(value)] for key, value in metrics.items()]
    print(tabulate.tabulate(metrics_table, headers=["Metric", "Value"], floatfmt=".2f"))


Evaluating Dummy Classifier on selected features...
Evaluating KNN Classifier on selected features...
Evaluating Naive Bayes Classifier on selected features...
Evaluating Decision Tree Classifier on selected features...

Model: Dummy Classifier
Metric       Value
---------  -------
accuracy      0.53
precision     0.53
recall        1.00
f1            0.69

Model: KNN Classifier
Metric       Value
---------  -------
accuracy      0.67
precision     0.76
recall        0.55
f1            0.63

Model: Naive Bayes Classifier
Metric       Value
---------  -------
accuracy      0.69
precision     0.71
recall        0.70
f1            0.70

Model: Decision Tree Classifier
Metric       Value
---------  -------
accuracy      0.70
precision     0.70
recall        0.77
f1            0.73


# Decision Tree Rules and Pruning


In [6]:

dt = MyDecisionTreeClassifier()
dt.fit(X_encoded_subset, y)

decoded_rules = {
    392099197417919076: "H",
    4590619080915168380: "A"
}

print("\nDecision Tree Rules:")
dt.print_decision_rules(
    attribute_names=["TournamentSeed", "RegularEndingWStreak", "LastOrdinalRank"],
    class_name="Winner",
    decode_map=decoded_rules
)

pruned_rules = """
Pruned Rule Set:
1. If LastOrdinalRank = H AND TournamentSeed = H THEN Winner = H.
2. If LastOrdinalRank = A AND TournamentSeed = A THEN Winner = A.
3. If LastOrdinalRank = A AND RegularEndingWStreak = H THEN Winner = A.
"""
print(pruned_rules)



Decision Tree Rules:
IF LastOrdinalRank == -5327884911291504252 AND TournamentSeed == -5327884911291504252 AND RegularEndingWStreak == -5327884911291504252 THEN Winner = A
IF LastOrdinalRank == -5327884911291504252 AND TournamentSeed == -5327884911291504252 AND RegularEndingWStreak == -2890932978632004649 THEN Winner = A
IF LastOrdinalRank == -5327884911291504252 AND TournamentSeed == -2890932978632004649 AND RegularEndingWStreak == -5327884911291504252 THEN Winner = H
IF LastOrdinalRank == -5327884911291504252 AND TournamentSeed == -2890932978632004649 AND RegularEndingWStreak == -2890932978632004649 THEN Winner = A
IF LastOrdinalRank == -2890932978632004649 AND RegularEndingWStreak == -5327884911291504252 AND TournamentSeed == -5327884911291504252 THEN Winner = H
IF LastOrdinalRank == -2890932978632004649 AND RegularEndingWStreak == -5327884911291504252 AND TournamentSeed == -2890932978632004649 THEN Winner = H
IF LastOrdinalRank == -2890932978632004649 AND RegularEndingWStreak == -

In [7]:
# Specify filenames for the .dot and .pdf outputs
dot_filename = "tree_vis/tree"
pdf_filename = "tree_vis/tree"

# Visualize the tree
dt.visualize_tree(
    dot_fname=dot_filename,
    pdf_fname=pdf_filename,
    attribute_names=["TournamentSeed", "RegularEndingWStreak", "LastOrdinalRank"]
)


Tree visual saved to tree_vis/tree.dot and tree_vis/tree.pdf
