In [1]:
# Import our dependencies
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the cleaned team-level dataset from the project's Resources folder
csv_path = 'Resources/clean_team_data.csv'
team_df = pd.read_csv(csv_path)

# Leave the frame as the last expression so the notebook renders it
team_df

Unnamed: 0,team,kp_adj_eff,kp_adj_off,kp_adj_def,seed,school,year,tournament_wins
0,2022 Kansas,25.5,119.4,93.9,1,Kansas,2022,6
1,2022 Arizona,27.2,119.6,92.4,1,Arizona,2022,2
2,2022 Gonzaga,33.0,121.8,88.8,1,Gonzaga,2022,2
3,2022 Baylor,26.3,117.9,91.6,1,Baylor,2022,1
4,2022 Duke,23.7,119.4,95.7,2,Duke,2022,4
...,...,...,...,...,...,...,...,...
891,2008 UMBC,4.1,113.8,109.7,15,UMBC,2008,0
892,2008 Mississippi Valley St.,-15.0,92.1,107.1,16,Mississippi Valley St.,2008,0
893,2008 Mount St. Mary's,-1.0,99.0,100.1,16,Mount St. Mary's,2008,0
894,2008 Portland St.,5.8,108.4,102.6,16,Portland St.,2008,0


In [2]:
# Create a binary "outcome" target by dichotomizing tournament_wins
# (1 = three or more wins, 0 = fewer than three wins).
# The comparison is vectorized instead of being applied row by row, and the
# boolean result is cast to int64 so the column holds 0/1 integers.
# .assign() returns a new frame rather than mutating the loaded one in place.
team_df = team_df.assign(
    outcome=(team_df['tournament_wins'] >= 3).astype('int64')
)
team_df.head()

Unnamed: 0,team,kp_adj_eff,kp_adj_off,kp_adj_def,seed,school,year,tournament_wins,outcome
0,2022 Kansas,25.5,119.4,93.9,1,Kansas,2022,6,1
1,2022 Arizona,27.2,119.6,92.4,1,Arizona,2022,2,0
2,2022 Gonzaga,33.0,121.8,88.8,1,Gonzaga,2022,2,0
3,2022 Baylor,26.3,117.9,91.6,1,Baylor,2022,1,0
4,2022 Duke,23.7,119.4,95.7,2,Duke,2022,4,1


In [3]:
# Keep only the columns used from here on: three predictors and the target.
# Selecting by name (rather than dropping 'team', 'kp_adj_def', 'school',
# 'year' and 'tournament_wins') yields the same frame, but the cell can be
# re-run safely: a second drop would raise KeyError because those columns
# are already gone.
team_df = team_df[['kp_adj_eff', 'kp_adj_off', 'seed', 'outcome']]
team_df.head()

Unnamed: 0,kp_adj_eff,kp_adj_off,seed,outcome
0,25.5,119.4,1,1
1,27.2,119.6,1,0
2,33.0,121.8,1,0
3,26.3,117.9,1,0
4,23.7,119.4,2,1


In [4]:
# Separate the target vector from the feature matrix
y = team_df["outcome"]
X = team_df.drop("outcome", axis=1)

# Hold out a test partition. Stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [5]:
# Standardize the numerical features for the neural network

# Build the scaler and fit it on the training split only, so nothing from the
# test split is used when the scaling parameters are learned.
# fit() returns the estimator itself, so both names refer to the same object.
X_scaler = scaler = StandardScaler().fit(X_train)

# Apply the fitted transformation to each split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [6]:
# Fit and evaluate a logistic regression classifier on the train/test split.
# NOTE(review): the model is fit and scored on the raw X_train / X_test, not on
# the X_train_scaled / X_test_scaled arrays built in the preprocessing cell —
# confirm that is intended.
log_classifier = LogisticRegression(solver="lbfgs", max_iter=100)

# Train the model
log_classifier.fit(X_train, y_train)

# Evaluate the model on the held-out test split
y_pred = log_classifier.predict(X_test)
print("\n\n")
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

# Build the confusion matrix. The label order is pinned to [0, 1] so the rows
# and columns always line up with the hard-coded captions below and the matrix
# is always 2x2, even if one class were missing from y_test and y_pred.
# (confusion_matrix and classification_report are imported in the top cell.)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=["Actual Unsuccessful (0)", "Actual Successful (1)"],
    columns=["Predicted Unsuccessful (0)", "Predicted Successful (1)"],
)

# Display the confusion matrix as a rendered table
display(cm_df)
print("\n\n")

# Print per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print(report)




 Logistic regression model accuracy: 0.862


Unnamed: 0,Predicted Unsuccessful (0),Predicted Successful (1)
Actual Unsuccessful (0),187,9
Actual Successful (1),22,6





              precision    recall  f1-score   support

           0       0.89      0.95      0.92       196
           1       0.40      0.21      0.28        28

    accuracy                           0.86       224
   macro avg       0.65      0.58      0.60       224
weighted avg       0.83      0.86      0.84       224

