# Imports

In [259]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

awards_players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/awards_players_cleaned.csv')
coaches_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/coaches_cleaned.csv')
players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_cleaned.csv')
players_teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv')
series_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/series_post_cleaned.csv')
teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_cleaned.csv')
teams_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_post_cleaned.csv')

# Obtain data from only 9 years

In [260]:
awards_players_model = awards_players_cleaned[awards_players_cleaned['year'] != 10]
coaches_model = coaches_cleaned[coaches_cleaned['year'] != 10]
players_teams_model = players_teams_cleaned[players_teams_cleaned['year'] != 10]
series_post_model = series_post_cleaned[series_post_cleaned['year'] != 10]
teams_model = teams_cleaned[teams_cleaned['year'] != 10]
teams_post_model = teams_post_cleaned[teams_post_cleaned['year'] != 10]

os.makedirs('../data/basketballPlayoffs_model', exist_ok=True)

awards_players_model.to_csv('../data/basketballPlayoffs_model/awards_players_model.csv', index=False)
coaches_model.to_csv('../data/basketballPlayoffs_model/coaches_model.csv', index=False)
players_cleaned.to_csv('../data/basketballPlayoffs_model/players_model.csv', index=False)
players_teams_model.to_csv('../data/basketballPlayoffs_model/players_teams_model.csv', index=False)
series_post_model.to_csv('../data/basketballPlayoffs_model/series_post_model.csv', index=False)
teams_model.to_csv('../data/basketballPlayoffs_model/teams_model.csv', index=False)
teams_post_model.to_csv('../data/basketballPlayoffs_model/teams_post_model.csv', index=False)

# Overalls from only 9 years

In [261]:
players_overall_avg = players_teams_model.groupby('playerID')['OVERALL'].mean().reset_index()

players_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

players_overall_avg['OVERALL_ALL_TIME'] = players_overall_avg['OVERALL_ALL_TIME'].round(1)

players_overall_avg.to_csv('../data/basketballPlayoffs_model/players_overall_all_time_model.csv', index=False)

In [262]:
rookie_players = players_teams_model[players_teams_model['is_rookie'] == 1]

rookie_overall_avg = rookie_players['OVERALL'].mean().round(1)

rookie_overall_avg_df = pd.DataFrame({'rookie_overall_avg': [rookie_overall_avg]})

rookie_overall_avg_df.to_csv('../data/basketballPlayoffs_model/rookie_overall_avg_model.csv', index=False)

In [263]:
rookie_coaches = coaches_model[coaches_model['is_rookie'] == 1]

rookie_overall_coaches_avg = rookie_coaches['OVERALL'].mean().round(1)

rookie_overall_coaches_avg_df = pd.DataFrame({'rookie_overall_coaches_avg': [rookie_overall_coaches_avg]})

rookie_overall_coaches_avg_df.to_csv('../data/basketballPlayoffs_model/rookie_overall_coaches_avg_model.csv', index=False)

In [264]:
coaches_overall_avg = coaches_model.groupby('coachID')['OVERALL'].mean().reset_index()

coaches_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

coaches_overall_avg['OVERALL_ALL_TIME'] = coaches_overall_avg['OVERALL_ALL_TIME'].round(1)

coaches_overall_avg.to_csv('../data/basketballPlayoffs_model/coaches_overall_all_time_model.csv', index=False)

# Obtain ranks from the 10th year to compare

In [265]:
playoffs_10th_year = teams_cleaned[teams_cleaned['year'] == 10][['tmID', 'playoff']]

playoffs_10th_year.to_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv', index=False)

# Obtain teams line up and coach from the 10th year

In [266]:
coaches_model_10th_year = coaches_cleaned[coaches_cleaned['year'] == 10][['coachID', 'tmID',]]

coaches_model_10th_year.to_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv', index=False)

In [267]:
players_model_10th_year = players_teams_cleaned[players_teams_cleaned['year'] == 10][['playerID','tmID']]

players_model_10th_year = players_model_10th_year.sort_values(by='tmID')

players_model_10th_year.to_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv', index=False)

# Add overalls calculated from 9 years to the 10th year players and coach

In [268]:
players_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv')
players_overall_all_time_model = pd.read_csv('../data/basketballPlayoffs_model/players_overall_all_time_model.csv')

rookie_avg = 5.6 

players_model_10th_year = players_model_10th_year.merge(
    players_overall_all_time_model[['playerID', 'OVERALL_ALL_TIME']],
    on='playerID',
    how='left'
).fillna({'OVERALL_ALL_TIME': rookie_avg})

players_model_10th_year.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

players_model_10th_year.to_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv', index=False)

In [269]:
coaches_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv')
coaches_overall_all_time_model = pd.read_csv('../data/basketballPlayoffs_model/coaches_overall_all_time_model.csv')

rookie_avg_coach = 8.9 

coaches_model_10th_year = coaches_model_10th_year.merge(
    coaches_overall_all_time_model[['coachID', 'OVERALL_ALL_TIME']],
    on='coachID',
    how='left'
).fillna({'OVERALL_ALL_TIME': rookie_avg_coach})

coaches_model_10th_year.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

coaches_model_10th_year.to_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv', index=False)


# Given the teams players composition and their respective overalls calculate the team overall (mean of all players)

In [270]:
players_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv')

team_overall = players_model_10th_year.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall['overall_team'] = team_overall['OVERALL'].round(1)

team_overall = team_overall.drop(columns=['OVERALL'])

team_overall.to_csv('../data/basketballPlayoffs_model/team_overall.csv', index=False)

# Given the team coaches and their respective overalls calculate the coach overall (mean of all coaches(1 or more than 1))

In [271]:
coaches_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv')

team_overall_coaches = coaches_model_10th_year.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall_coaches['overall_team_coach'] = team_overall_coaches['OVERALL'].round(1)

team_overall_coaches = team_overall_coaches.drop(columns=['OVERALL'])

team_overall_coaches.to_csv('../data/basketballPlayoffs_model/team_overall_coaches.csv', index=False)

# Create team final overall ( 70% team + 30% coach)

In [272]:
team_overall = pd.read_csv('../data/basketballPlayoffs_model/team_overall.csv')
team_overall_coach = pd.read_csv('../data/basketballPlayoffs_model/team_overall_coaches.csv')
teams_model = pd.read_csv('../data/basketballPlayoffs_model/teams_model.csv')

merged_df = pd.merge(team_overall, team_overall_coach, on='tmID')

merged_df['overall_team_final'] = (merged_df['overall_team'] * 0.9) + (merged_df['overall_team_coach'] * 0.1)
merged_df['overall_team_final'] = merged_df['overall_team_final'].round(2)

merged_df = pd.merge(merged_df, teams_model[['tmID', 'confID']], on='tmID', how='left')

final_df = merged_df[['tmID', 'confID', 'overall_team_final']]

final_df = final_df.drop_duplicates(subset='tmID')

final_df = final_df.sort_values(by='overall_team_final', ascending=True)

final_df.to_csv('../data/basketballPlayoffs_model/team_final_overall.csv', index=False)

# Accuracy

In [273]:
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

top_teams_by_conf = team_final_overall.sort_values(by='overall_team_final', ascending=False).groupby('confID').head(4)

print("Teams that go to the playoffs acording to the previsions:\n")
print(top_teams_by_conf)

comparison = top_teams_by_conf.merge(playoffs_10th_year, on='tmID', how='left')

correct_predictions = comparison[comparison['playoff'] == 1]

print("\nTeams that are in the playoffs and in the previsions:\n")
print(correct_predictions)

accuracy = len(correct_predictions) / len(top_teams_by_conf) * 100
print(f"\nThe accuracy is {accuracy:.2f}%")


Teams that go to the playoffs acording to the previsions:

    tmID  confID  overall_team_final
12     8       1               11.17
11    16       1                9.13
10     5       0                8.95
9     13       1                8.68
8     17       1                8.54
5     11       0                8.28
4      7       0                8.21
3     19       0                8.18

Teams that are in the playoffs and in the previsions:

   tmID  confID  overall_team_final  playoff
0     8       1               11.17        1
1    16       1                9.13        1
2     5       0                8.95        1
3    13       1                8.68        1
4    17       1                8.54        1
6     7       0                8.21        1
7    19       0                8.18        1

The accuracy is 87.50%


## SVM MODEL

In [274]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the data
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Create separate models for each conference
def train_conference_svm(conf_data, conf_playoffs):
    X = conf_data[['overall_team_final']].values
    y = conf_playoffs['playoff'].values
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Try different SVM configurations
    svms = {
        'default': SVC(kernel='rbf', C=1.0, gamma='scale', class_weight='balanced'),
        'linear': SVC(kernel='linear', C=0.1, class_weight='balanced'),
        'custom_rbf': SVC(kernel='rbf', C=0.5, gamma=0.1, class_weight={0: 1, 1: 2})
    }
    
    best_accuracy = 0
    best_model = None
    best_scaler = None
    
    # Select the best performing model
    for name, svm in svms.items():
        svm.fit(X_scaled, y)
        accuracy = accuracy_score(y, svm.predict(X_scaled))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = svm
            best_scaler = scaler
            
    return best_model, best_scaler

# Split data by conference
conf_0_data = team_final_overall[team_final_overall['confID'] == 0].copy()
conf_1_data = team_final_overall[team_final_overall['confID'] == 1].copy()

# Sort by overall_team_final to ensure we're considering ranking
conf_0_data = conf_0_data.sort_values('overall_team_final', ascending=False)
conf_1_data = conf_1_data.sort_values('overall_team_final', ascending=False)

conf_0_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_0_data['tmID'])].copy()
conf_1_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_1_data['tmID'])].copy()

# Train separate models for each conference
svm_conf_0, scaler_conf_0 = train_conference_svm(conf_0_data, conf_0_playoffs)
svm_conf_1, scaler_conf_1 = train_conference_svm(conf_1_data, conf_1_playoffs)

# Make predictions with probability threshold
def predict_playoffs(team_data, svm_model, scaler):
    X = team_data[['overall_team_final']].values
    X_scaled = scaler.transform(X)
    
    # Get the top 4 teams based on overall_team_final
    n_teams = len(team_data)
    n_playoff_spots = 4
    
    # Get predicted probabilities
    if hasattr(svm_model, 'decision_function'):
        decision_scores = svm_model.decision_function(X_scaled)
    else:
        decision_scores = svm_model.predict(X_scaled)
    
    # Create predictions based on top 4 scores
    predictions = np.zeros(n_teams, dtype=int)
    top_indices = np.argsort(decision_scores)[-n_playoff_spots:]
    predictions[top_indices] = 1
    
    return predictions

# Get predictions for each conference
pred_conf_0 = predict_playoffs(conf_0_data, svm_conf_0, scaler_conf_0)
pred_conf_1 = predict_playoffs(conf_1_data, svm_conf_1, scaler_conf_1)

# Combine predictions
predictions = []
conf_0_idx = 0
conf_1_idx = 0

for idx, row in team_final_overall.iterrows():
    if row['confID'] == 0:
        predictions.append(pred_conf_0[conf_0_idx])
        conf_0_idx += 1
    else:
        predictions.append(pred_conf_1[conf_1_idx])
        conf_1_idx += 1

# Print results
results_df = pd.DataFrame({
    'Team_ID': team_final_overall['tmID'],
    'Conference': team_final_overall['confID'],
    'Overall': team_final_overall['overall_team_final'],
    'Predicted': predictions,
    'Actual': playoffs_10th_year['playoff']
})

print("\nPredictions vs Actual Results:")
print(results_df)

# Calculate and print all metrics
print("\nDetailed Classification Report:")
print(classification_report(playoffs_10th_year['playoff'], predictions))

accuracy = accuracy_score(playoffs_10th_year['playoff'], predictions) * 100
print(f"\nSVM Accuracy: {accuracy:.2f}%")


Predictions vs Actual Results:
    Team_ID  Conference  Overall  Predicted  Actual
0         2           0     7.90          1       1
1         0           0     7.97          0       0
2         4           0     8.06          0       0
3        19           0     8.18          0       1
4         7           0     8.21          1       1
5        11           0     8.28          1       1
6        15           1     8.38          1       0
7        10           1     8.45          0       0
8        17           1     8.54          0       1
9        13           1     8.68          1       0
10        5           0     8.95          1       1
11       16           1     9.13          1       1
12        8           1    11.17          1       1

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.75      0.75      0.75         8

    accuracy                           0.69

## KNN Model

In [275]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the data
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Create separate models for each conference
def train_conference_knn(conf_data, conf_playoffs):
    X = conf_data[['overall_team_final']].values
    y = conf_playoffs['playoff'].values
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Adjust k_values based on number of samples
    n_samples = len(X)
    k_values = [k for k in [3, 5] if k < n_samples]  # Only use k values less than n_samples
    if not k_values:  # If no valid k values, use k=1
        k_values = [1]
    
    best_accuracy = 0
    best_model = None
    best_scaler = None
    
    # Select the best performing model
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
        knn.fit(X_scaled, y)
        accuracy = accuracy_score(y, knn.predict(X_scaled))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = knn
            best_scaler = scaler
            
    return best_model, best_scaler

# Split data by conference
conf_0_data = team_final_overall[team_final_overall['confID'] == 0].copy()
conf_1_data = team_final_overall[team_final_overall['confID'] == 1].copy()

# Sort by overall_team_final to ensure we're considering ranking
conf_0_data = conf_0_data.sort_values('overall_team_final', ascending=False)
conf_1_data = conf_1_data.sort_values('overall_team_final', ascending=False)

conf_0_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_0_data['tmID'])].copy()
conf_1_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_1_data['tmID'])].copy()

# Train separate models for each conference
knn_conf_0, scaler_conf_0 = train_conference_knn(conf_0_data, conf_0_playoffs)
knn_conf_1, scaler_conf_1 = train_conference_knn(conf_1_data, conf_1_playoffs)

# Make predictions with probability threshold
def predict_playoffs(team_data, knn_model, scaler):
    X = team_data[['overall_team_final']].values
    X_scaled = scaler.transform(X)
    
    # Get the top 4 teams based on overall_team_final
    n_teams = len(team_data)
    n_playoff_spots = 4
    
    # Get predicted probabilities
    probabilities = knn_model.predict_proba(X_scaled)[:, 1]  # Probability of class 1
    
    # Create predictions based on top 4 scores
    predictions = np.zeros(n_teams, dtype=int)
    top_indices = np.argsort(probabilities)[-n_playoff_spots:]
    predictions[top_indices] = 1
    
    return predictions

# Get predictions for each conference
pred_conf_0 = predict_playoffs(conf_0_data, knn_conf_0, scaler_conf_0)
pred_conf_1 = predict_playoffs(conf_1_data, knn_conf_1, scaler_conf_1)

# Combine predictions
predictions = []
conf_0_idx = 0
conf_1_idx = 0

for idx, row in team_final_overall.iterrows():
    if row['confID'] == 0:
        predictions.append(pred_conf_0[conf_0_idx])
        conf_0_idx += 1
    else:
        predictions.append(pred_conf_1[conf_1_idx])
        conf_1_idx += 1

# Print results
results_df = pd.DataFrame({
    'Team_ID': team_final_overall['tmID'],
    'Conference': team_final_overall['confID'],
    'Overall': team_final_overall['overall_team_final'],
    'Predicted': predictions,
    'Actual': playoffs_10th_year['playoff']
})

print("\nPredictions vs Actual Results:")
print(results_df)

# Calculate and print all metrics
print("\nDetailed Classification Report:")
print(classification_report(playoffs_10th_year['playoff'], predictions))

accuracy = accuracy_score(playoffs_10th_year['playoff'], predictions) * 100
print(f"\nKNN Accuracy: {accuracy:.2f}%")


Predictions vs Actual Results:
    Team_ID  Conference  Overall  Predicted  Actual
0         2           0     7.90          1       1
1         0           0     7.97          0       0
2         4           0     8.06          0       0
3        19           0     8.18          1       1
4         7           0     8.21          1       1
5        11           0     8.28          0       1
6        15           1     8.38          1       0
7        10           1     8.45          0       0
8        17           1     8.54          1       1
9        13           1     8.68          0       0
10        5           0     8.95          1       1
11       16           1     9.13          1       1
12        8           1    11.17          1       1

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.88      0.88      0.88         8

    accuracy                           0.85

## RFC (Random Forest Classifier)

In [276]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the data
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Create separate models for each conference
def train_conference_rf(conf_data, conf_playoffs):
    X = conf_data[['overall_team_final']].values
    y = conf_playoffs['playoff'].values
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Try different Random Forest configurations
    rf_configs = {
        'default': RandomForestClassifier(n_estimators=100, random_state=42),
        'balanced': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
        'more_trees': RandomForestClassifier(n_estimators=200, min_samples_split=2, random_state=42)
    }
    
    best_accuracy = 0
    best_model = None
    best_scaler = None
    
    # Select the best performing model
    for name, rf in rf_configs.items():
        rf.fit(X_scaled, y)
        accuracy = accuracy_score(y, rf.predict(X_scaled))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = rf
            best_scaler = scaler
            
    return best_model, best_scaler

# Split data by conference
conf_0_data = team_final_overall[team_final_overall['confID'] == 0].copy()
conf_1_data = team_final_overall[team_final_overall['confID'] == 1].copy()

# Sort by overall_team_final to ensure we're considering ranking
conf_0_data = conf_0_data.sort_values('overall_team_final', ascending=False)
conf_1_data = conf_1_data.sort_values('overall_team_final', ascending=False)

conf_0_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_0_data['tmID'])].copy()
conf_1_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_1_data['tmID'])].copy()

# Train separate models for each conference
rf_conf_0, scaler_conf_0 = train_conference_rf(conf_0_data, conf_0_playoffs)
rf_conf_1, scaler_conf_1 = train_conference_rf(conf_1_data, conf_1_playoffs)

# Make predictions with probability threshold
def predict_playoffs(team_data, rf_model, scaler):
    X = team_data[['overall_team_final']].values
    X_scaled = scaler.transform(X)
    
    # Get the top 4 teams based on overall_team_final
    n_teams = len(team_data)
    n_playoff_spots = 4
    
    # Get predicted probabilities
    probabilities = rf_model.predict_proba(X_scaled)[:, 1]  # Probability of class 1
    
    # Create predictions based on top 4 scores
    predictions = np.zeros(n_teams, dtype=int)
    top_indices = np.argsort(probabilities)[-n_playoff_spots:]
    predictions[top_indices] = 1
    
    return predictions

# Get predictions for each conference
pred_conf_0 = predict_playoffs(conf_0_data, rf_conf_0, scaler_conf_0)
pred_conf_1 = predict_playoffs(conf_1_data, rf_conf_1, scaler_conf_1)

# Combine predictions
predictions = []
conf_0_idx = 0
conf_1_idx = 0

for idx, row in team_final_overall.iterrows():
    if row['confID'] == 0:
        predictions.append(pred_conf_0[conf_0_idx])
        conf_0_idx += 1
    else:
        predictions.append(pred_conf_1[conf_1_idx])
        conf_1_idx += 1

# Print results
results_df = pd.DataFrame({
    'Team_ID': team_final_overall['tmID'],
    'Conference': team_final_overall['confID'],
    'Overall': team_final_overall['overall_team_final'],
    'Predicted': predictions,
    'Actual': playoffs_10th_year['playoff']
})

print("\nPredictions vs Actual Results:")
print(results_df)

# Calculate and print all metrics
print("\nDetailed Classification Report:")
print(classification_report(playoffs_10th_year['playoff'], predictions))

accuracy = accuracy_score(playoffs_10th_year['playoff'], predictions) * 100
print(f"\nRandom Forest Accuracy: {accuracy:.2f}%")


Predictions vs Actual Results:
    Team_ID  Conference  Overall  Predicted  Actual
0         2           0     7.90          1       1
1         0           0     7.97          0       0
2         4           0     8.06          0       0
3        19           0     8.18          1       1
4         7           0     8.21          1       1
5        11           0     8.28          0       1
6        15           1     8.38          1       0
7        10           1     8.45          0       0
8        17           1     8.54          1       1
9        13           1     8.68          0       0
10        5           0     8.95          1       1
11       16           1     9.13          1       1
12        8           1    11.17          1       1

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.88      0.88      0.88         8

    accuracy                           0.85

## Logistic Regression (LR)

In [277]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the data
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Create separate models for each conference
def train_conference_lr(conf_data, conf_playoffs):
    X = conf_data[['overall_team_final']].values
    y = conf_playoffs['playoff'].values
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Try different Logistic Regression configurations
    lr_configs = {
        'default': LogisticRegression(random_state=42),
        'balanced': LogisticRegression(class_weight='balanced', random_state=42),
        'stronger_reg': LogisticRegression(C=0.1, random_state=42)
    }
    
    best_accuracy = 0
    best_model = None
    best_scaler = None
    
    # Select the best performing model
    for name, lr in lr_configs.items():
        lr.fit(X_scaled, y)
        accuracy = accuracy_score(y, lr.predict(X_scaled))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = lr
            best_scaler = scaler
            
    return best_model, best_scaler

# Split data by conference
conf_0_data = team_final_overall[team_final_overall['confID'] == 0].copy()
conf_1_data = team_final_overall[team_final_overall['confID'] == 1].copy()

# Sort by overall_team_final to ensure we're considering ranking
conf_0_data = conf_0_data.sort_values('overall_team_final', ascending=False)
conf_1_data = conf_1_data.sort_values('overall_team_final', ascending=False)

conf_0_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_0_data['tmID'])].copy()
conf_1_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_1_data['tmID'])].copy()

# Train separate models for each conference
lr_conf_0, scaler_conf_0 = train_conference_lr(conf_0_data, conf_0_playoffs)
lr_conf_1, scaler_conf_1 = train_conference_lr(conf_1_data, conf_1_playoffs)

# Make predictions with probability threshold
def predict_playoffs(team_data, lr_model, scaler):
    X = team_data[['overall_team_final']].values
    X_scaled = scaler.transform(X)
    
    # Get the top 4 teams based on overall_team_final
    n_teams = len(team_data)
    n_playoff_spots = 4
    
    # Get predicted probabilities
    probabilities = lr_model.predict_proba(X_scaled)[:, 1]  # Probability of class 1
    
    # Create predictions based on top 4 scores
    predictions = np.zeros(n_teams, dtype=int)
    top_indices = np.argsort(probabilities)[-n_playoff_spots:]
    predictions[top_indices] = 1
    
    return predictions

# Get predictions for each conference
pred_conf_0 = predict_playoffs(conf_0_data, lr_conf_0, scaler_conf_0)
pred_conf_1 = predict_playoffs(conf_1_data, lr_conf_1, scaler_conf_1)

# Combine predictions
predictions = []
conf_0_idx = 0
conf_1_idx = 0

for idx, row in team_final_overall.iterrows():
    if row['confID'] == 0:
        predictions.append(pred_conf_0[conf_0_idx])
        conf_0_idx += 1
    else:
        predictions.append(pred_conf_1[conf_1_idx])
        conf_1_idx += 1

# Print results
results_df = pd.DataFrame({
    'Team_ID': team_final_overall['tmID'],
    'Conference': team_final_overall['confID'],
    'Overall': team_final_overall['overall_team_final'],
    'Predicted': predictions,
    'Actual': playoffs_10th_year['playoff']
})

print("\nPredictions vs Actual Results:")
print(results_df)

# Calculate and print all metrics
print("\nDetailed Classification Report:")
print(classification_report(playoffs_10th_year['playoff'], predictions))

accuracy = accuracy_score(playoffs_10th_year['playoff'], predictions) * 100
print(f"\nLogistic Regression Accuracy: {accuracy:.2f}%")


Predictions vs Actual Results:
    Team_ID  Conference  Overall  Predicted  Actual
0         2           0     7.90          1       1
1         0           0     7.97          1       0
2         4           0     8.06          1       0
3        19           0     8.18          1       1
4         7           0     8.21          0       1
5        11           0     8.28          0       1
6        15           1     8.38          1       0
7        10           1     8.45          1       0
8        17           1     8.54          1       1
9        13           1     8.68          1       0
10        5           0     8.95          0       1
11       16           1     9.13          0       1
12        8           1    11.17          0       1

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.38      0.38      0.38         8

    accuracy                           0.23

## Decision Trees

In [278]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the data
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Create separate models for each conference
def train_conference_dt(conf_data, conf_playoffs):
    X = conf_data[['overall_team_final']].values
    y = conf_playoffs['playoff'].values
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Try different Decision Tree configurations
    dt_configs = {
        'default': DecisionTreeClassifier(random_state=42),
        'balanced': DecisionTreeClassifier(class_weight='balanced', random_state=42),
        'min_samples': DecisionTreeClassifier(min_samples_split=3, random_state=42)
    }
    
    best_accuracy = 0
    best_model = None
    best_scaler = None
    
    # Select the best performing model
    for name, dt in dt_configs.items():
        dt.fit(X_scaled, y)
        accuracy = accuracy_score(y, dt.predict(X_scaled))
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = dt
            best_scaler = scaler
            
    return best_model, best_scaler

# Split data by conference
conf_0_data = team_final_overall[team_final_overall['confID'] == 0].copy()
conf_1_data = team_final_overall[team_final_overall['confID'] == 1].copy()

# Sort by overall_team_final to ensure we're considering ranking
conf_0_data = conf_0_data.sort_values('overall_team_final', ascending=False)
conf_1_data = conf_1_data.sort_values('overall_team_final', ascending=False)

conf_0_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_0_data['tmID'])].copy()
conf_1_playoffs = playoffs_10th_year[playoffs_10th_year['tmID'].isin(conf_1_data['tmID'])].copy()

# Train separate models for each conference
dt_conf_0, scaler_conf_0 = train_conference_dt(conf_0_data, conf_0_playoffs)
dt_conf_1, scaler_conf_1 = train_conference_dt(conf_1_data, conf_1_playoffs)

# Make predictions with probability threshold
def predict_playoffs(team_data, dt_model, scaler):
    X = team_data[['overall_team_final']].values
    X_scaled = scaler.transform(X)
    
    # Get the top 4 teams based on overall_team_final
    n_teams = len(team_data)
    n_playoff_spots = 4
    
    # Get predicted probabilities
    probabilities = dt_model.predict_proba(X_scaled)[:, 1]  # Probability of class 1
    
    # Create predictions based on top 4 scores
    predictions = np.zeros(n_teams, dtype=int)
    top_indices = np.argsort(probabilities)[-n_playoff_spots:]
    predictions[top_indices] = 1
    
    return predictions

# Get predictions for each conference
pred_conf_0 = predict_playoffs(conf_0_data, dt_conf_0, scaler_conf_0)
pred_conf_1 = predict_playoffs(conf_1_data, dt_conf_1, scaler_conf_1)

# Combine predictions
predictions = []
conf_0_idx = 0
conf_1_idx = 0

for idx, row in team_final_overall.iterrows():
    if row['confID'] == 0:
        predictions.append(pred_conf_0[conf_0_idx])
        conf_0_idx += 1
    else:
        predictions.append(pred_conf_1[conf_1_idx])
        conf_1_idx += 1

# Print results
results_df = pd.DataFrame({
    'Team_ID': team_final_overall['tmID'],
    'Conference': team_final_overall['confID'],
    'Overall': team_final_overall['overall_team_final'],
    'Predicted': predictions,
    'Actual': playoffs_10th_year['playoff']
})

print("\nPredictions vs Actual Results:")
print(results_df)

# Calculate and print all metrics
print("\nDetailed Classification Report:")
print(classification_report(playoffs_10th_year['playoff'], predictions))

accuracy = accuracy_score(playoffs_10th_year['playoff'], predictions) * 100
print(f"\nDecision Tree Accuracy: {accuracy:.2f}%")


Predictions vs Actual Results:
    Team_ID  Conference  Overall  Predicted  Actual
0         2           0     7.90          1       1
1         0           0     7.97          0       0
2         4           0     8.06          0       0
3        19           0     8.18          1       1
4         7           0     8.21          1       1
5        11           0     8.28          0       1
6        15           1     8.38          1       0
7        10           1     8.45          0       0
8        17           1     8.54          1       1
9        13           1     8.68          0       0
10        5           0     8.95          1       1
11       16           1     9.13          1       1
12        8           1    11.17          1       1

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.88      0.88      0.88         8

    accuracy                           0.85