In [42]:
import matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
import seaborn as sns
import numpy as np

In [44]:
# compute the average stats for each team
def compute_team_average_stats(game_data_path, team_details_path, output_path):
    df = pd.read_csv(game_data_path)
    teams_df = pd.read_csv(team_details_path)

    features_home = ['fg3_pct_home', 'ft_pct_home', 'oreb_home', 'dreb_home', 'reb_home', 'ast_home', 'stl_home',
                     'blk_home', 'tov_home', 'wl_home']
    features_away = ['fg3_pct_away', 'ft_pct_away', 'oreb_away', 'dreb_away', 'reb_away', 'ast_away', 'stl_away',
                     'blk_away', 'tov_away', 'wl_away']

    home_stats = df[['team_id_home'] + features_home]
    away_stats = df[['team_id_away'] + features_away]

    home_stats.columns = ['team_id'] + \
        [col.replace('_home', '') for col in features_home]
    away_stats.columns = ['team_id'] + \
        [col.replace('_away', '') for col in features_away]

    combined_stats = pd.concat([home_stats, away_stats], ignore_index=True)
    average_stats = combined_stats.groupby(
        'team_id').agg(lambda x: x.mean()).reset_index()
    average_stats.columns = [
        col if col != 'team_id' else col for col in average_stats.columns]

    average_stats = pd.merge(
        average_stats, teams_df[['team_id', 'abbreviation']], on='team_id')

    average_stats.to_csv(output_path, index=False)
    print(f"Team average stats saved to {output_path}")

# merge the game data with the team average stats to avoid overfitting
def augment_game_data_with_team_stats(game_data_path, team_stats_path):
    df_games = pd.read_csv(game_data_path)
    df_team_avg = pd.read_csv(team_stats_path)
    home_columns = {
        col: col + "_home_avg" for col in df_team_avg.columns if col not in ["team_id", "abbreviation"]}
    away_columns = {
        col: col + "_away_avg" for col in df_team_avg.columns if col not in ["team_id", "abbreviation"]}

    df_team_avg_home = df_team_avg.rename(columns=home_columns)
    df_team_avg_away = df_team_avg.rename(columns=away_columns)

    df_merged = pd.merge(df_games, df_team_avg_home, how="left",
                         left_on="team_id_home", right_on="team_id")
    df_merged = pd.merge(df_merged, df_team_avg_away,
                         how="left", left_on="team_id_away", right_on="team_id")
    df = df_merged.dropna(axis=0)

    df.to_csv('data/merged_data_complete.csv', index=False)
    # print(df)
    return df

# the logistic regression model
# def logistic_regression_model(X_train, y_train, X_dev, y_dev description, printing, plot=False):
#     model = LogisticRegression(max_iter=1000)
#     model.fit(x_train, y_train)

#     y_pred = model.predict(x_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f'{description} Logistic Regression Model Accuracy: {accuracy:.2f}')
#     # cm = confusion_matrix(y_test, y_pred)
#     # print("Confusion Matrix:")
#     # print(cm)
#     coefficients = model.coef_[0]
#     feature_names = x.columns.tolist()

#     sorted_indices = sorted(range(len(coefficients)),
#                             key=lambda i: abs(coefficients[i]), reverse=True)
#     sorted_coefficients = [coefficients[i] for i in sorted_indices]
#     sorted_feature_names = [feature_names[i] for i in sorted_indices]
#     if printing == 1:
#         print("WEIGHTS:")
#         for coefficient, feature_name in zip(sorted_coefficients, sorted_feature_names):
#             print(f"{feature_name}: {coefficient}")
#     if plot:
#         matplotlib.use('TkAgg')

#         coef_df = pd.DataFrame(
#             {'Feature': sorted_feature_names, 'Coefficient': sorted_coefficients})
#         plt.figure(figsize=(10, 8))
#         sns.barplot(x="Coefficient", y="Feature", data=coef_df)
#         plt.title('Feature Importances in Logistic Regression')
#         plt.xlabel('Coefficient Value')
#         plt.ylabel('Features')
#         plt.show()

#     return model

# # the support vector machine model
# def svm_model(X, y, description, printing):
#     x_train, x_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=42)

#     # Linear kernel for simplicity, can try other kernels
#     model = SVC(kernel='linear', random_state=42)
#     model.fit(x_train, y_train)

#     accuracy = model.score(x_test, y_test)
#     print(f'{description} SVM Model Accuracy: {accuracy:.2f}')

#     if printing == 1:
#         print("SVM Parameters:")
#         print(model.get_params())

#     return model

# # the neural network model
# def neural_network_model(x, y, description):
#     x_train, x_test, y_train, y_test = train_test_split(
#         x, y, test_size=0.3, random_state=42)

#     model = MLPClassifier(hidden_layer_sizes=(
#         100, 50), max_iter=500, activation='relu', solver='adam', random_state=42)
#     model.fit(x_train, y_train)

#     accuracy = model.score(x_test, y_test)
#     print(f'{description} Neural Network Model Accuracy: {accuracy:.2f}')

#     return model

# # the decision tree model
# def train_and_visualize_decision_tree(data_path, features, target, test_size=0.3, random_state=42, max_depth=5):
#     matplotlib.use('TkAgg')
#     df = pd.read_csv(data_path)

#     X = df[features]
#     y = df[target]

#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=test_size, random_state=random_state)

#     tree = DecisionTreeClassifier(
#         max_depth=max_depth, random_state=random_state)
#     tree.fit(X_train, y_train)

#     y_pred = tree.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f'Accuracy: {accuracy:.2f}')

#     plt.figure(figsize=(20, 10))
#     plot_tree(tree, filled=True, feature_names=features,
#               class_names=['Loss', 'Win'], rounded=True, fontsize=12)
#     plt.show()

In [45]:
# Data Preprocessing
game_data_path = 'data/game_data.csv' # path to the game data
team_details_path = 'data/team_details.csv' # path to the team details
combined_stats_path = 'data/team_average_stats_combined.csv' # path to the combined average stats

# convert the game data to the merged data
compute_team_average_stats(game_data_path, team_details_path, combined_stats_path)
df_augmented = augment_game_data_with_team_stats(game_data_path, combined_stats_path)

features_prefined = ['fg3_pct_home_avg', 'ft_pct_home_avg', 'oreb_home_avg', 'dreb_home_avg', 'ast_home_avg',
                         'reb_home_avg', 'stl_home_avg', 'blk_home_avg', 'tov_home_avg', 'fg3_pct_away_avg', 'ft_pct_away_avg', 'oreb_away_avg', 'dreb_away_avg', 'ast_away_avg',
                         'reb_away_avg', 'stl_away_avg', 'blk_away_avg', 'tov_away_avg']
features_baseline = ['fg3_pct_home_avg', 'fg3_pct_away_avg']

# split the data into features and target, 
X_prefined = df_augmented[features_prefined]
X_baseline = df_augmented[features_baseline]
y = df_augmented['wl_home']

# split the data into training, development, and testing
X_train_prefined, X_test_prefined, y_train, y_test = train_test_split(X_prefined, y, test_size=0.3, random_state=42)
X_train_baseline, X_test_baseline = train_test_split(X_baseline, test_size=0.3, random_state=42)
X_dev_prefined, X_test_prefined, y_dev, y_test = train_test_split(X_test_prefined, y_test, test_size=0.5, random_state=42)
X_dev_baseline, X_test_baseline = train_test_split(X_test_baseline, test_size=0.5, random_state=42)


Team average stats saved to data/team_average_stats_combined.csv


In [46]:
print(X_train_baseline.shape)
print(X_dev_baseline.shape)
print(X_test_baseline.shape)
print(X_dev_baseline.head())
print(X_test_prefined.head())


(2488, 2)
(533, 2)
(534, 2)
      fg3_pct_home_avg  fg3_pct_away_avg
1328          0.365899          0.368306
3555          0.354074          0.371405
2153          0.354074          0.348697
819           0.346322          0.362968
3410          0.368306          0.380565
      fg3_pct_home_avg  ft_pct_home_avg  oreb_home_avg  dreb_home_avg  \
233           0.356782         0.807259      10.357558      33.901163   
3184          0.339892         0.749864      10.487952      33.376506   
475           0.365899         0.782590      10.111702      33.688830   
2963          0.358616         0.791506       9.286164      33.949686   
2162          0.364152         0.783493       9.724928      34.621777   

      ast_home_avg  reb_home_avg  stl_home_avg  blk_home_avg  tov_home_avg  \
233      24.130814     44.258721      7.206395      4.651163     13.485465   
3184     22.750000     43.864458      7.798193      4.795181     15.590361   
475      27.095745     43.800532      7.545213      4

In [47]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [48]:
# train the logistic regression model
# hyperparameters to tune: max_iter, solver, penalty, l1_ratio
penalties = 'elasticnet'
l1_ratios = [0.1, 0.5, 0.9]
max_iters = [100, 500, 1000, 2000, 5000]
solver = 'saga'

best_baseline_accuracy = np.zeros(1)
best_prefined_accuracy = np.zeros(1)

best_baseline_model = None
best_prefined_model = None

best_baseline_settings = {
    "penalty": 'elasticnet',
    "l1_ratio": None,
    "max_iter": None,
    "solver": 'saga'
}
best_prefined_settings = {
    "penalty": 'elasticnet',
    "l1_ratio": None,
    "max_iter": None,
    "solver": 'saga'
}
# Loop through parameters
for l1 in l1_ratios:
    for m in max_iters:
        # ===== Implement a network with iterated settings ===== #
        # Note: set validation_fraction to 0.1 or leave as default
        log_reg_baseline = LogisticRegression(penalty=penalties, l1_ratio=l1, max_iter=m, solver=solver)
        log_reg_prefined = LogisticRegression(penalty=penalties, l1_ratio=l1, max_iter=m, solver=solver)
        # ===== End of Implement a network with iterated settings ===== #

        # ===== Train network ===== #
        log_reg_baseline.fit(X_train_baseline, y_train)
        log_reg_prefined.fit(X_train_prefined, y_train)
        # ===== End of Train network ===== #

        # ===== Test network ===== #
        # train accuracy
        train_accuracy_baseline = log_reg_baseline.score(X_train_baseline, y_train)
        train_accuracy_prefined = log_reg_prefined.score(X_train_prefined, y_train)
        
        # inference
        baseline_pred = log_reg_baseline.predict(X_dev_baseline)
        prefined_pred = log_reg_prefined.predict(X_dev_prefined)
        
        # Compute accuracy
        baseline_accuracy = accuracy_score(y_dev, baseline_pred)
        print(f"Baseline Accuracy: {baseline_accuracy} when l1 = {l1}, max_iter = {m}, solver = {solver}")
        prefined_accuracy = accuracy_score(y_dev, prefined_pred)
        print(f"Prefined Accuracy: {prefined_accuracy} when l1 = {l1}, max_iter = {m}, solver = {solver}")
        
        # Compute precision
        baseline_precision = precision_score(y_dev, baseline_pred)
        prefined_precision = precision_score(y_dev, prefined_pred)
        
        # Compute recall
        baseline_recall = recall_score(y_dev, baseline_pred)
        prefined_recall = recall_score(y_dev, prefined_pred)
        
        # Compute f1 score
        baseline_f1 = f1_score(y_dev, baseline_pred)
        prefined_f1 = f1_score(y_dev, prefined_pred)
        # ===== End of Test network ===== #

        # ===== Is it the best setting ===== #
        if baseline_accuracy >= best_baseline_accuracy:
            best_baseline_accuracy = baseline_accuracy
            best_baseline_settings = {
                "penalty": penalties,
                "l1_ratio": l1,
                "max_iter": m,
                "solver": solver
            }
            best_baseline_model = log_reg_baseline
            
        if prefined_accuracy >= best_prefined_accuracy:
            best_prefined_accuracy = prefined_accuracy
            best_prefined_settings = {
                "penalty": penalties,
                "l1_ratio": l1,
                "max_iter": m,
                "solver": solver
            }
            best_prefined_model = log_reg_prefined
        # ===== End of Is it the best setting ===== #



Baseline Accuracy: 0.5928705440900562 when l1 = 0.1, max_iter = 100, solver = saga
Prefined Accuracy: 0.6097560975609756 when l1 = 0.1, max_iter = 100, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.1, max_iter = 500, solver = saga
Prefined Accuracy: 0.6210131332082551 when l1 = 0.1, max_iter = 500, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.1, max_iter = 1000, solver = saga
Prefined Accuracy: 0.6210131332082551 when l1 = 0.1, max_iter = 1000, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.1, max_iter = 2000, solver = saga
Prefined Accuracy: 0.6210131332082551 when l1 = 0.1, max_iter = 2000, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.1, max_iter = 5000, solver = saga
Prefined Accuracy: 0.6210131332082551 when l1 = 0.1, max_iter = 5000, solver = saga
Baseline Accuracy: 0.5928705440900562 when l1 = 0.5, max_iter = 100, solver = saga
Prefined Accuracy: 0.6097560975609756 when l1 = 0.5, max_iter = 100, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.5, max_iter = 500, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.5, max_iter = 500, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.5, max_iter = 1000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.5, max_iter = 1000, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.5, max_iter = 2000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.5, max_iter = 2000, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.5, max_iter = 5000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.5, max_iter = 5000, solver = saga
Baseline Accuracy: 0.5928705440900562 when l1 = 0.9, max_iter = 100, solver = saga
Prefined Accuracy: 0.6097560975609756 when l1 = 0.9, max_iter = 100, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.9, max_iter = 500, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.9, max_iter = 500, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.9, max_iter = 1000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.9, max_iter = 1000, solver = saga




Baseline Accuracy: 0.5928705440900562 when l1 = 0.9, max_iter = 2000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.9, max_iter = 2000, solver = saga
Baseline Accuracy: 0.5928705440900562 when l1 = 0.9, max_iter = 5000, solver = saga
Prefined Accuracy: 0.6191369606003753 when l1 = 0.9, max_iter = 5000, solver = saga




In [49]:
print ("Best baseline settings: ", best_baseline_settings)
print ("Best prefined settings: ", best_prefined_settings)
print ("Best baseline accuracy: ", best_baseline_accuracy)
print ("Best prefined accuracy: ", best_prefined_accuracy)

Best baseline settings:  {'penalty': 'elasticnet', 'l1_ratio': 0.9, 'max_iter': 5000, 'solver': 'saga'}
Best prefined settings:  {'penalty': 'elasticnet', 'l1_ratio': 0.1, 'max_iter': 5000, 'solver': 'saga'}
Best baseline accuracy:  0.5928705440900562
Best prefined accuracy:  0.6210131332082551


In [50]:
# run on test set
baseline_pred = best_baseline_model.predict(X_test_baseline)
prefined_pred = best_prefined_model.predict(X_test_prefined)

baseline_accuracy = accuracy_score(y_test, baseline_pred)
prefined_accuracy = accuracy_score(y_test, prefined_pred)

print(f"Baseline Accuracy on test set: {baseline_accuracy}")
print(f"Prefined Accuracy on test set: {prefined_accuracy}")

baseline_precision = precision_score(y_test, baseline_pred)
prefined_precision = precision_score(y_test, prefined_pred)

baseline_recall = recall_score(y_test, baseline_pred)
prefined_recall = recall_score(y_test, prefined_pred)

baseline_f1 = f1_score(y_test, baseline_pred)
prefined_f1 = f1_score(y_test, prefined_pred)

print(f"Baseline Precision on test set: {baseline_precision}")
print(f"Prefined Precision on test set: {prefined_precision}")
print(f"Baseline Recall on test set: {baseline_recall}")
print(f"Prefined Recall on test set: {prefined_recall}")
print(f"Baseline F1 on test set: {baseline_f1}")
print(f"Prefined F1 on test set: {prefined_f1}")


Baseline Accuracy on test set: 0.5674157303370787
Prefined Accuracy on test set: 0.5898876404494382
Baseline Precision on test set: 0.5674157303370787
Prefined Precision on test set: 0.6099476439790575
Baseline Recall on test set: 1.0
Prefined Recall on test set: 0.768976897689769
Baseline F1 on test set: 0.7240143369175627
Prefined F1 on test set: 0.6802919708029197


In [51]:
# Train svm model
Cs = [0.1, 1, 10]
kernels = ['linear', 'rbf', 'sigmoid']
shrinking = [True, False]

best_baseline_accuracy = np.zeros(1)
best_prefined_accuracy = np.zeros(1)

best_baseline_svm = None
best_prefined_svm = None

best_baseline_settings = {
    "C": None,
    "kernel": None,
    "shrinking": None
}
best_prefined_settings = {
    "C": None,
    "kernel": None,
    "shrinking": None
}

for c in Cs:
    for k in kernels:
        for s in shrinking:
            svm_baseline = SVC(C=c, kernel=k, shrinking=s)
            svm_prefined = SVC(C=c, kernel=k, shrinking=s)
            
            # Train network
            svm_baseline.fit(X_train_baseline, y_train)
            svm_prefined.fit(X_train_prefined, y_train)
            
            # Test network
            # train accuracy
            train_accuracy_baseline = svm_baseline.score(X_train_baseline, y_train)
            train_accuracy_prefined = svm_prefined.score(X_train_prefined, y_train)
            
            # inference
            baseline_pred = svm_baseline.predict(X_dev_baseline)
            prefined_pred = svm_prefined.predict(X_dev_prefined)
            
            # Compute accuracy
            baseline_accuracy = accuracy_score(y_dev, baseline_pred)
            print(f"Baseline Accuracy: {baseline_accuracy} when C = {c}, kernel = {k}, shrinking = {s}")
            prefined_accuracy = accuracy_score(y_dev, prefined_pred)
            print(f"Prefined Accuracy: {prefined_accuracy} when C = {c}, kernel = {k}, shrinking = {s}")
            
            # Compute precision
            baseline_precision = precision_score(y_dev, baseline_pred)
            prefined_precision = precision_score(y_dev, prefined_pred)
            
            # Compute recall
            baseline_recall = recall_score(y_dev, baseline_pred)
            prefined_recall = recall_score(y_dev, prefined_pred)
            
            # Compute f1 score
            baseline_f1 = f1_score(y_dev, baseline_pred)
            prefined_f1 = f1_score(y_dev, prefined_pred)
            
            # Is it the best setting
            if baseline_accuracy >= best_baseline_accuracy:
                best_baseline_accuracy = baseline_accuracy
                best_baseline_settings = {
                    "C": c,
                    "kernel": k,
                    "shrinking": s
                }  
                best_baseline_svm = svm_baseline
            if prefined_accuracy >= best_prefined_accuracy:
                best_prefined_accuracy = prefined_accuracy
                best_prefined_settings = {
                    "C": c,
                    "kernel": k,
                    "shrinking": s
                }
                best_prefined_svm = svm_prefined


Baseline Accuracy: 0.5928705440900562 when C = 0.1, kernel = linear, shrinking = True
Prefined Accuracy: 0.6097560975609756 when C = 0.1, kernel = linear, shrinking = True
Baseline Accuracy: 0.5928705440900562 when C = 0.1, kernel = linear, shrinking = False
Prefined Accuracy: 0.6097560975609756 when C = 0.1, kernel = linear, shrinking = False
Baseline Accuracy: 0.5853658536585366 when C = 0.1, kernel = rbf, shrinking = True
Prefined Accuracy: 0.5928705440900562 when C = 0.1, kernel = rbf, shrinking = True
Baseline Accuracy: 0.5853658536585366 when C = 0.1, kernel = rbf, shrinking = False
Prefined Accuracy: 0.5928705440900562 when C = 0.1, kernel = rbf, shrinking = False
Baseline Accuracy: 0.5928705440900562 when C = 0.1, kernel = sigmoid, shrinking = True
Prefined Accuracy: 0.5928705440900562 when C = 0.1, kernel = sigmoid, shrinking = True
Baseline Accuracy: 0.5928705440900562 when C = 0.1, kernel = sigmoid, shrinking = False
Prefined Accuracy: 0.5928705440900562 when C = 0.1, kernel

In [52]:
print ("Best baseline settings: ", best_baseline_settings)
print ("Best prefined settings: ", best_prefined_settings)
print ("Best baseline accuracy: ", best_baseline_accuracy)
print ("Best prefined accuracy: ", best_prefined_accuracy)

Best baseline settings:  {'C': 10, 'kernel': 'sigmoid', 'shrinking': False}
Best prefined settings:  {'C': 1, 'kernel': 'linear', 'shrinking': False}
Best baseline accuracy:  0.5928705440900562
Best prefined accuracy:  0.6097560975609756


In [53]:
# run on test set
baseline_pred = best_baseline_svm.predict(X_test_baseline)
prefined_pred = best_prefined_svm.predict(X_test_prefined)

baseline_accuracy = accuracy_score(y_test, baseline_pred)
prefined_accuracy = accuracy_score(y_test, prefined_pred)

print(f"Baseline Accuracy on test set: {baseline_accuracy}")
print(f"Prefined Accuracy on test set: {prefined_accuracy}")

baseline_precision = precision_score(y_test, baseline_pred)
prefined_precision = precision_score(y_test, prefined_pred)

baseline_recall = recall_score(y_test, baseline_pred)
prefined_recall = recall_score(y_test, prefined_pred)

baseline_f1 = f1_score(y_test, baseline_pred)
prefined_f1 = f1_score(y_test, prefined_pred)

print(f"Baseline Precision on test set: {baseline_precision}")
print(f"Prefined Precision on test set: {prefined_precision}")
print(f"Baseline Recall on test set: {baseline_recall}")
print(f"Prefined Recall on test set: {prefined_recall}")
print(f"Baseline F1 on test set: {baseline_f1}")
print(f"Prefined F1 on test set: {prefined_f1}")

Baseline Accuracy on test set: 0.5674157303370787
Prefined Accuracy on test set: 0.5692883895131086
Baseline Precision on test set: 0.5674157303370787
Prefined Precision on test set: 0.5910224438902744
Baseline Recall on test set: 1.0
Prefined Recall on test set: 0.7821782178217822
Baseline F1 on test set: 0.7240143369175627
Prefined F1 on test set: 0.6732954545454546


In [32]:
# train neural network model
hidden_layer_sizes = [(100, 50), (100, 100, 50), (100, 100, 100, 50)]
activations = ['relu', 'tanh', 'logistic']
solvers = ['adam', 'sgd']
learning_rates = ['constant', 'adaptive']
alphas = [0.0001, 0.001, 0.01]

best_baseline_accuracy = np.zeros(1)
best_prefined_accuracy = np.zeros(1)

best_baseline_nn = None
best_prefined_nn = None

best_baseline_settings = {
    "hidden_layer_sizes": None,
    "activation": None,
    "solver": None,
    "learning_rate": None,
    "alpha": None
}
best_prefined_settings = {
    "hidden_layer_sizes": None,
    "activation": None,
    "solver": None,
    "learning_rate": None,
    "alpha": None
}

for h in hidden_layer_sizes:
    for a in activations:
        for s in solvers:
            for l in learning_rates:
                for alpha in alphas:
                    nn_baseline = MLPClassifier(hidden_layer_sizes=h, activation=a, solver=s, learning_rate=l, alpha=alpha)
                    nn_prefined = MLPClassifier(hidden_layer_sizes=h, activation=a, solver=s, learning_rate=l, alpha=alpha)
                    
                    # Train network
                    nn_baseline.fit(X_train_baseline, y_train)
                    nn_prefined.fit(X_train_prefined, y_train)
                    
                    # Test network
                    # train accuracy
                    train_accuracy_baseline = nn_baseline.score(X_train_baseline, y_train)
                    train_accuracy_prefined = nn_prefined.score(X_train_prefined, y_train)
                    
                    # inference
                    baseline_pred = nn_baseline.predict(X_dev_baseline)
                    prefined_pred = nn_prefined.predict(X_dev_prefined)
                    
                    # Compute accuracy
                    baseline_accuracy = accuracy_score(y_dev, baseline_pred)
                    print(f"Baseline Accuracy: {baseline_accuracy} when hidden_layer_sizes = {h}, activation = {a}, solver = {s}, learning_rate = {l}, alpha = {alpha}")
                    prefined_accuracy = accuracy_score(y_dev, prefined_pred)
                    print(f"Prefined Accuracy: {prefined_accuracy} when hidden_layer_sizes = {h}, activation = {a}, solver = {s}, learning_rate = {l}, alpha = {alpha}")
                    
                    # Compute precision
                    baseline_precision = precision_score(y_dev, baseline_pred)
                    prefined_precision = precision_score(y_dev, prefined_pred)
                    
                    # Compute recall
                    baseline_recall = recall_score(y_dev, baseline_pred)
                    prefined_recall = recall_score(y_dev, prefined_pred)
                    
                    # Compute f1 score
                    baseline_f1 = f1_score(y_dev, baseline_pred)
                    prefined_f1 = f1_score(y_dev, prefined_pred)
                    
                    # Is it the best setting
                    if baseline_accuracy >= best_baseline_accuracy:
                        best_baseline_accuracy = baseline_accuracy
                        best_baseline_settings = {
                            "hidden_layer_sizes": h,
                            "activation": a,
                            "solver": s,
                            "learning_rate": l,
                            "alpha": alpha
                        }
                        best_baseline_nn = nn_baseline
                    if prefined_accuracy >= best_prefined_accuracy:
                        best_prefined_accuracy = prefined_accuracy
                        best_prefined_settings = {
                            "hidden_layer_sizes": h,
                            "activation": a,
                            "solver": s,
                            "learning_rate": l,
                            "alpha": alpha
                        }
                        best_prefined_nn = nn_prefined

Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.0001
Prefined Accuracy: 0.4652908067542214 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.0001
Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.001
Prefined Accuracy: 0.5834896810506567 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.001
Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.01
Prefined Accuracy: 0.4971857410881801 when hidden_layer_sizes = (100, 50), activation = relu, solver = adam, learning_rate = constant, alpha = 0.01
Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 50), activation = relu, solver = ada



Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 100, 100, 50), activation = tanh, solver = sgd, learning_rate = adaptive, alpha = 0.01
Prefined Accuracy: 0.6097560975609756 when hidden_layer_sizes = (100, 100, 100, 50), activation = tanh, solver = sgd, learning_rate = adaptive, alpha = 0.01
Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 100, 100, 50), activation = logistic, solver = adam, learning_rate = constant, alpha = 0.0001
Prefined Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 100, 100, 50), activation = logistic, solver = adam, learning_rate = constant, alpha = 0.0001
Baseline Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 100, 100, 50), activation = logistic, solver = adam, learning_rate = constant, alpha = 0.001
Prefined Accuracy: 0.5928705440900562 when hidden_layer_sizes = (100, 100, 100, 50), activation = logistic, solver = adam, learning_rate = constant, alpha = 0.001
Baseline Accuracy: 0.59287054409

In [33]:
print ("Best baseline settings: ", best_baseline_settings)
print ("Best prefined settings: ", best_prefined_settings)

print ("Best baseline accuracy: ", best_baseline_accuracy)
print ("Best prefined accuracy: ", best_prefined_accuracy)

Best baseline settings:  {'hidden_layer_sizes': (100, 100, 50), 'activation': 'tanh', 'solver': 'adam', 'learning_rate': 'constant', 'alpha': 0.01}
Best prefined settings:  {'hidden_layer_sizes': (100, 50), 'activation': 'logistic', 'solver': 'adam', 'learning_rate': 'adaptive', 'alpha': 0.01}
Best baseline accuracy:  0.5966228893058161
Best prefined accuracy:  0.626641651031895


In [54]:
# run on test set
baseline_pred = best_baseline_nn.predict(X_test_baseline)
prefined_pred = best_prefined_nn.predict(X_test_prefined)

baseline_accuracy = accuracy_score(y_test, baseline_pred)
prefined_accuracy = accuracy_score(y_test, prefined_pred)

print(f"Baseline Accuracy on test set: {baseline_accuracy}")
print(f"Prefined Accuracy on test set: {prefined_accuracy}")

baseline_precision = precision_score(y_test, baseline_pred)
prefined_precision = precision_score(y_test, prefined_pred)

baseline_recall = recall_score(y_test, baseline_pred)
prefined_recall = recall_score(y_test, prefined_pred)

baseline_f1 = f1_score(y_test, baseline_pred)
prefined_f1 = f1_score(y_test, prefined_pred)

print(f"Baseline Precision on test set: {baseline_precision}")
print(f"Prefined Precision on test set: {prefined_precision}")
print(f"Baseline Recall on test set: {baseline_recall}")
print(f"Prefined Recall on test set: {prefined_recall}")
print(f"Baseline F1 on test set: {baseline_f1}")
print(f"Prefined F1 on test set: {prefined_f1}")

Baseline Accuracy on test set: 0.6142322097378277
Prefined Accuracy on test set: 0.5936329588014981
Baseline Precision on test set: 0.6227848101265823
Prefined Precision on test set: 0.5899581589958159
Baseline Recall on test set: 0.8118811881188119
Prefined Recall on test set: 0.9306930693069307
Baseline F1 on test set: 0.7048710601719198
Prefined F1 on test set: 0.7221510883482715


In [35]:
# train decision tree model
criteria = ['gini', 'entropy']
splitters = ['best', 'random']

best_baseline_accuracy = np.zeros(1)
best_prefined_accuracy = np.zeros(1)

best_baseline_tree = None
best_prefined_tree = None

best_baseline_settings = {
    "criterion": None,
    "splitter": None
}
best_prefined_settings = {  
    "criterion": None,
    "splitter": None
}

for c in criteria:
    for s in splitters:
        tree_baseline = DecisionTreeClassifier(criterion=c, splitter=s)
        tree_prefined = DecisionTreeClassifier(criterion=c, splitter=s)
        
        # Train network
        tree_baseline.fit(X_train_baseline, y_train)
        tree_prefined.fit(X_train_prefined, y_train)
        
        # Test network
        # train accuracy
        train_accuracy_baseline = tree_baseline.score(X_train_baseline, y_train)
        train_accuracy_prefined = tree_prefined.score(X_train_prefined, y_train)
        
        # inference
        baseline_pred = tree_baseline.predict(X_dev_baseline)
        prefined_pred = tree_prefined.predict(X_dev_prefined)
        
        # Compute accuracy
        baseline_accuracy = accuracy_score(y_dev, baseline_pred)
        print(f"Baseline Accuracy: {baseline_accuracy} when criterion = {c}, splitter = {s}")
        prefined_accuracy = accuracy_score(y_dev, prefined_pred)
        print(f"Prefined Accuracy: {prefined_accuracy} when criterion = {c}, splitter = {s}")
        
        # Compute precision
        baseline_precision = precision_score(y_dev, baseline_pred)
        prefined_precision = precision_score(y_dev, prefined_pred)
        
        # Compute recall
        baseline_recall = recall_score(y_dev, baseline_pred)
        prefined_recall = recall_score(y_dev, prefined_pred)
        
        # Compute f1 score
        baseline_f1 = f1_score(y_dev, baseline_pred)
        prefined_f1 = f1_score(y_dev, prefined_pred)
        
        # Is it the best setting
        if baseline_accuracy >= best_baseline_accuracy:
            best_baseline_accuracy = baseline_accuracy
            best_baseline_settings = {
                "criterion": c,
                "splitter": s
            }
            best_baseline_tree = tree_baseline
        if prefined_accuracy >= best_prefined_accuracy:
            best_prefined_accuracy = prefined_accuracy
            best_prefined_settings = {
                "criterion": c,
                "splitter": s
            }
            best_prefined_tree = tree_prefined

Baseline Accuracy: 0.5478424015009381 when criterion = gini, splitter = best
Prefined Accuracy: 0.5440900562851783 when criterion = gini, splitter = best
Baseline Accuracy: 0.5478424015009381 when criterion = gini, splitter = random
Prefined Accuracy: 0.5478424015009381 when criterion = gini, splitter = random
Baseline Accuracy: 0.5478424015009381 when criterion = entropy, splitter = best
Prefined Accuracy: 0.5440900562851783 when criterion = entropy, splitter = best
Baseline Accuracy: 0.5478424015009381 when criterion = entropy, splitter = random
Prefined Accuracy: 0.5478424015009381 when criterion = entropy, splitter = random


In [36]:
print("Best baseline settings: ", best_baseline_settings)
print("Best prefined settings: ", best_prefined_settings)

print("Best baseline accuracy: ", best_baseline_accuracy)
print("Best prefined accuracy: ", best_prefined_accuracy)

Best baseline settings:  {'criterion': 'entropy', 'splitter': 'random'}
Best prefined settings:  {'criterion': 'entropy', 'splitter': 'random'}
Best baseline accuracy:  0.5478424015009381
Best prefined accuracy:  0.5478424015009381


In [55]:
# run on test set
baseline_pred = best_baseline_tree.predict(X_test_baseline)
prefined_pred = best_prefined_tree.predict(X_test_prefined)

baseline_accuracy = accuracy_score(y_test, baseline_pred)
prefined_accuracy = accuracy_score(y_test, prefined_pred)

print(f"Baseline Accuracy on test set: {baseline_accuracy}")
print(f"Prefined Accuracy on test set: {prefined_accuracy}")

baseline_precision = precision_score(y_test, baseline_pred)
prefined_precision = precision_score(y_test, prefined_pred)

baseline_recall = recall_score(y_test, baseline_pred)
prefined_recall = recall_score(y_test, prefined_pred)

baseline_f1 = f1_score(y_test, baseline_pred)
prefined_f1 = f1_score(y_test, prefined_pred)

print(f"Baseline Precision on test set: {baseline_precision}")
print(f"Prefined Precision on test set: {prefined_precision}")
print(f"Baseline Recall on test set: {baseline_recall}")
print(f"Prefined Recall on test set: {prefined_recall}")
print(f"Baseline F1 on test set: {baseline_f1}")
print(f"Prefined F1 on test set: {prefined_f1}")

Baseline Accuracy on test set: 0.5393258426966292
Prefined Accuracy on test set: 0.5393258426966292
Baseline Precision on test set: 0.6014234875444839
Prefined Precision on test set: 0.6014234875444839
Baseline Recall on test set: 0.5577557755775577
Prefined Recall on test set: 0.5577557755775577
Baseline F1 on test set: 0.5787671232876712
Prefined F1 on test set: 0.5787671232876712
