# Imports

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load every cleaned table. The names on the left and the paths on the right
# are listed in the same order, so each name receives its own file.
(
    awards_players_cleaned,
    coaches_cleaned,
    players_cleaned,
    players_teams_cleaned,
    series_post_cleaned,
    teams_cleaned,
    teams_post_cleaned,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/basketballPlayoffs_cleaned/awards_players_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/coaches_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/players_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/series_post_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/teams_cleaned.csv',
        '../data/basketballPlayoffs_cleaned/teams_post_cleaned.csv',
    )
]

# Obtain data from only 9 years

In [49]:
def _without_year_10(frame):
    """Return the rows of ``frame`` whose 'year' value is not 10."""
    return frame.loc[frame['year'].ne(10)]


# Year 10 is held out of every table that carries a 'year' column.
awards_players_model = _without_year_10(awards_players_cleaned)
coaches_model = _without_year_10(coaches_cleaned)
players_teams_model = _without_year_10(players_teams_cleaned)
series_post_model = _without_year_10(series_post_cleaned)
teams_model = _without_year_10(teams_cleaned)
teams_post_model = _without_year_10(teams_post_cleaned)

os.makedirs('../data/basketballPlayoffs_model', exist_ok=True)

# players_cleaned is exported as-is: no year filter is applied to it.
for _frame, _csv_path in (
    (awards_players_model, '../data/basketballPlayoffs_model/awards_players_model.csv'),
    (coaches_model, '../data/basketballPlayoffs_model/coaches_model.csv'),
    (players_cleaned, '../data/basketballPlayoffs_model/players_model.csv'),
    (players_teams_model, '../data/basketballPlayoffs_model/players_teams_model.csv'),
    (series_post_model, '../data/basketballPlayoffs_model/series_post_model.csv'),
    (teams_model, '../data/basketballPlayoffs_model/teams_model.csv'),
    (teams_post_model, '../data/basketballPlayoffs_model/teams_post_model.csv'),
):
    _frame.to_csv(_csv_path, index=False)

# Overalls from only 9 years

In [50]:
# Mean OVERALL per player over the rows kept in players_teams_model,
# rounded to one decimal and stored under the name OVERALL_ALL_TIME.
players_overall_avg = (
    players_teams_model
    .groupby('playerID')['OVERALL']
    .mean()
    .round(1)
    .rename('OVERALL_ALL_TIME')
    .reset_index()
)

players_overall_avg.to_csv('../data/basketballPlayoffs_model/players_overall_all_time_model.csv', index=False)

In [51]:
# Rows flagged as rookie seasons (is_rookie == 1).
rookie_players = players_teams_model.loc[players_teams_model['is_rookie'].eq(1)]

# Single figure: the mean OVERALL of those rows, to one decimal.
rookie_overall_avg = np.round(rookie_players['OVERALL'].mean(), 1)

# Stored as a one-row, one-column table.
rookie_overall_avg_df = pd.Series([rookie_overall_avg], name='rookie_overall_avg').to_frame()

rookie_overall_avg_df.to_csv('../data/basketballPlayoffs_model/rookie_overall_avg_model.csv', index=False)

In [52]:
# Coach rows flagged as rookie seasons (is_rookie == 1).
rookie_coaches = coaches_model.loc[coaches_model['is_rookie'].eq(1)]

# Single figure: the mean OVERALL of those rows, to one decimal.
rookie_overall_coaches_avg = np.round(rookie_coaches['OVERALL'].mean(), 1)

# Stored as a one-row, one-column table.
rookie_overall_coaches_avg_df = pd.Series(
    [rookie_overall_coaches_avg], name='rookie_overall_coaches_avg'
).to_frame()

rookie_overall_coaches_avg_df.to_csv('../data/basketballPlayoffs_model/rookie_overall_coaches_avg_model.csv', index=False)

In [53]:
# Mean OVERALL per coach over the rows kept in coaches_model,
# rounded to one decimal and stored under the name OVERALL_ALL_TIME.
coaches_overall_avg = (
    coaches_model
    .groupby('coachID')['OVERALL']
    .mean()
    .round(1)
    .rename('OVERALL_ALL_TIME')
    .reset_index()
)

coaches_overall_avg.to_csv('../data/basketballPlayoffs_model/coaches_overall_all_time_model.csv', index=False)

# Obtain the 10th-year playoff results to compare against

In [54]:
# Team id and playoff flag for the year-10 rows only.
_teams_in_year_10 = teams_cleaned['year'].eq(10)
playoffs_10th_year = teams_cleaned.loc[_teams_in_year_10, ['tmID', 'playoff']]

playoffs_10th_year.to_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv', index=False)

# Obtain teams line up and coach from the 10th year

In [55]:
# Coach-to-team assignments for the year-10 rows only.
_coaches_in_year_10 = coaches_cleaned['year'].eq(10)
coaches_model_10th_year = coaches_cleaned.loc[_coaches_in_year_10, ['coachID', 'tmID']]

coaches_model_10th_year.to_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv', index=False)

In [56]:
# Player-to-team assignments for the year-10 rows only, ordered by team id.
players_model_10th_year = (
    players_teams_cleaned
    .loc[players_teams_cleaned['year'].eq(10), ['playerID', 'tmID']]
    .sort_values(by='tmID')
)

players_model_10th_year.to_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv', index=False)

# Add overalls calculated from 9 years to the 10th year players and coach

In [57]:
# 10th-year roster. Only the identifier columns are kept: this cell writes its
# result back to the same file, so on a re-run the file already contains an
# OVERALL column, which would otherwise end up duplicated after the rename below.
players_model_10th_year = pd.read_csv(
    '../data/basketballPlayoffs_model/players_model_10th_year.csv'
)[['playerID', 'tmID']]
players_overall_all_time_model = pd.read_csv('../data/basketballPlayoffs_model/players_overall_all_time_model.csv')

# Fallback rating for players that have no OVERALL_ALL_TIME entry. It is read
# from the stored rookie average rather than copied by hand, so it cannot drift
# away from the value computed from the data.
rookie_avg = pd.read_csv(
    '../data/basketballPlayoffs_model/rookie_overall_avg_model.csv'
)['rookie_overall_avg'].iloc[0]

# Left join keeps every roster row; players without a match receive rookie_avg.
players_model_10th_year = (
    players_model_10th_year
    .merge(
        players_overall_all_time_model[['playerID', 'OVERALL_ALL_TIME']],
        on='playerID',
        how='left',
    )
    .fillna({'OVERALL_ALL_TIME': rookie_avg})
    .rename(columns={'OVERALL_ALL_TIME': 'OVERALL'})
)

players_model_10th_year.to_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv', index=False)

In [58]:
# 10th-year coaches. Only the identifier columns are kept: this cell writes its
# result back to the same file, so on a re-run the file already contains an
# OVERALL column, which would otherwise end up duplicated after the rename below.
coaches_model_10th_year = pd.read_csv(
    '../data/basketballPlayoffs_model/coaches_model_10th_year.csv'
)[['coachID', 'tmID']]
coaches_overall_all_time_model = pd.read_csv('../data/basketballPlayoffs_model/coaches_overall_all_time_model.csv')

# Fallback rating for coaches that have no OVERALL_ALL_TIME entry. It is read
# from the stored rookie-coach average rather than copied by hand, so it cannot
# drift away from the value computed from the data.
rookie_avg_coach = pd.read_csv(
    '../data/basketballPlayoffs_model/rookie_overall_coaches_avg_model.csv'
)['rookie_overall_coaches_avg'].iloc[0]

# Left join keeps every coach row; coaches without a match receive rookie_avg_coach.
coaches_model_10th_year = (
    coaches_model_10th_year
    .merge(
        coaches_overall_all_time_model[['coachID', 'OVERALL_ALL_TIME']],
        on='coachID',
        how='left',
    )
    .fillna({'OVERALL_ALL_TIME': rookie_avg_coach})
    .rename(columns={'OVERALL_ALL_TIME': 'OVERALL'})
)

coaches_model_10th_year.to_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv', index=False)


# Given each team's roster and the players' respective overalls, calculate the team overall (mean of all players)

In [59]:
players_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/players_model_10th_year.csv')

# One row per team: the mean of its players' OVERALL, to one decimal,
# stored as 'overall_team'.
team_overall = (
    players_model_10th_year
    .groupby('tmID')['OVERALL']
    .mean()
    .round(1)
    .rename('overall_team')
    .reset_index()
)

team_overall.to_csv('../data/basketballPlayoffs_model/team_overall.csv', index=False)

# Given each team's coaches and their respective overalls, calculate the coach overall (mean over the team's coaches, whether one or more than one)

In [60]:
coaches_model_10th_year = pd.read_csv('../data/basketballPlayoffs_model/coaches_model_10th_year.csv')

# One row per team: the mean of its coaches' OVERALL, to one decimal,
# stored as 'overall_team_coach'.
team_overall_coaches = (
    coaches_model_10th_year
    .groupby('tmID')['OVERALL']
    .mean()
    .round(1)
    .rename('overall_team_coach')
    .reset_index()
)

team_overall_coaches.to_csv('../data/basketballPlayoffs_model/team_overall_coaches.csv', index=False)

# Create team final overall (90% team + 10% coach)

In [61]:
# Share of the roster rating and of the coaching rating in the blended score.
TEAM_WEIGHT = 0.9
COACH_WEIGHT = 0.1

team_overall = pd.read_csv('../data/basketballPlayoffs_model/team_overall.csv')
team_overall_coach = pd.read_csv('../data/basketballPlayoffs_model/team_overall_coaches.csv')
teams_model = pd.read_csv('../data/basketballPlayoffs_model/teams_model.csv')

# Conference lookup with one row per team. teams_model can list the same tmID
# several times (presumably once per season), so it is reduced to the first
# occurrence of each team *before* joining; joining first would multiply every
# team row by the number of times the team appears.
conference_by_team = teams_model[['tmID', 'confID']].drop_duplicates(subset='tmID')

# Inner join: only teams that have both a roster rating and a coach rating remain.
merged_df = pd.merge(team_overall, team_overall_coach, on='tmID')

merged_df['overall_team_final'] = (
    (merged_df['overall_team'] * TEAM_WEIGHT) + (merged_df['overall_team_coach'] * COACH_WEIGHT)
).round(2)

# Left join: a team with no entry in teams_model keeps its row, with a missing confID.
merged_df = pd.merge(merged_df, conference_by_team, on='tmID', how='left')

final_df = (
    merged_df[['tmID', 'confID', 'overall_team_final']]
    .sort_values(by='overall_team_final', ascending=True)
)

final_df.to_csv('../data/basketballPlayoffs_model/team_final_overall.csv', index=False)

# Accuracy

In [62]:
team_final_overall = pd.read_csv('../data/basketballPlayoffs_model/team_final_overall.csv')
playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Rank all teams from best to worst, then keep the first four of each conference.
_ranked_teams = team_final_overall.sort_values(by='overall_team_final', ascending=False)
top_teams_by_conf = _ranked_teams.groupby('confID').head(4)

print("Teams that go to the playoffs acording to the previsions:\n")
print(top_teams_by_conf)

# Attach the real playoff flag to each predicted team.
comparison = pd.merge(top_teams_by_conf, playoffs_10th_year, how='left', on='tmID')

# Predicted teams whose real playoff flag is 1.
correct_predictions = comparison.loc[comparison['playoff'].eq(1)]

print("\nTeams that are in the playoffs and in the previsions:\n")
print(correct_predictions)

# Share of the predicted teams that really reached the playoffs, in percent.
_hits = len(correct_predictions)
_picks = len(top_teams_by_conf)
accuracy = _hits / _picks * 100
print(f"\nThe accuracy is {accuracy:.2f}%")


Teams that go to the playoffs acording to the previsions:

    tmID  confID  overall_team_final
12     8       1               11.17
11    16       1                9.13
10     5       0                8.95
9     13       1                8.68
8     17       1                8.54
5     11       0                8.28
4      7       0                8.21
3     19       0                8.18

Teams that are in the playoffs and in the previsions:

   tmID  confID  overall_team_final  playoff
0     8       1               11.17        1
1    16       1                9.13        1
2     5       0                8.95        1
3    13       1                8.68        1
4    17       1                8.54        1
6     7       0                8.21        1
7    19       0                8.18        1

The accuracy is 87.50%


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Columns of the teams table used as model inputs.
FEATURES = [
    'o_pts', 'd_pts',
    'won', 'lost',
    'homeW', 'homeL',
    'awayW', 'awayL',
    'confW', 'confL',
    'tmORB', 'tmDRB', 'tmTRB',
]


def prepare_data(teams_model):
    """Split a teams table into model inputs and the target.

    Parameters
    ----------
    teams_model : pandas.DataFrame
        Table containing every column named in ``FEATURES`` plus a
        ``'playoff'`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the ``FEATURES`` columns, in that order,
        and ``y`` is the ``'playoff'`` column.
    """
    target = teams_model['playoff']
    predictors = teams_model.loc[:, FEATURES]
    return predictors, target

def train_knn_model(X_train, y_train, n_neighbors=5):
    """Fit a standardising scaler and a k-nearest-neighbours classifier.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels, one per row of ``X_train``.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNeighborsClassifier``. The default
        keeps the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(knn, scaler)``: the fitted classifier and the fitted
        ``StandardScaler``. The scaler must be applied to any data before it
        is passed to the classifier, because the classifier was fitted on
        scaled features.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train_scaled, y_train)

    return knn, scaler

def predict_playoffs(teams_10th_year, knn_model, scaler):
    """Predict the playoff label for each row of a teams table.

    Parameters
    ----------
    teams_10th_year : pandas.DataFrame
        Table containing every column named in ``FEATURES``.
    knn_model : object
        Fitted model exposing ``predict``.
    scaler : object
        Fitted transformer exposing ``transform``; applied to the feature
        columns before prediction.

    Returns
    -------
    The value returned by ``knn_model.predict`` for the scaled features.
    """
    scaled_features = scaler.transform(teams_10th_year[FEATURES])
    predictions = knn_model.predict(scaled_features)
    return predictions

# Training data: the model-years table. Evaluation data: the year-10 rows.
teams_model = pd.read_csv('../data/basketballPlayoffs_model/teams_model.csv')
teams_10th_year = teams_cleaned[teams_cleaned['year'] == 10]

X, y = prepare_data(teams_model)

# Hold out 20% of the model-years rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn_model, scaler = train_knn_model(X_train, y_train)

# The held-out rows are scaled with the scaler fitted on the training rows only.
X_test_scaled = scaler.transform(X_test)
y_pred_test = knn_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test set accuracy: {test_accuracy*100:.2f}%")

# NOTE(review): the feature columns used here (e.g. 'won', 'lost') are the
# year-10 values taken from teams_cleaned. If these are end-of-season totals
# they already reflect the outcome being predicted. Confirm this is intended.
predictions_10th = predict_playoffs(teams_10th_year, knn_model, scaler)

playoffs_10th_year = pd.read_csv('../data/basketballPlayoffs_model/playoffs_10th_year.csv')

# Pair each prediction with the real outcome of the *same team*. The outcomes
# come from a separate file, so they are matched on tmID instead of relying on
# both tables happening to be in the same row order. A team missing from the
# file yields a missing value, which makes the metrics below fail loudly
# rather than score a misaligned comparison.
actual_playoffs = (
    teams_10th_year[['tmID']]
    .merge(playoffs_10th_year, on='tmID', how='left', validate='one_to_one')['playoff']
    .values
)

accuracy_10th = accuracy_score(actual_playoffs, predictions_10th)
print(f"\nAccuracy on 10th year predictions: {accuracy_10th*100:.2f}%")

# One row per year-10 team: prediction next to the real outcome.
results_df = pd.DataFrame({
    'Team_ID': teams_10th_year['tmID'].values,
    'Conference': teams_10th_year['confID'].values,
    'Predicted': predictions_10th,
    'Actual': actual_playoffs
})

print("\nOverall Predictions vs Actual Results:")
print(results_df)

print("\nTeams Predicted to Make Playoffs by Conference:")
for conf in results_df['Conference'].unique():
    print(f"\nConference {conf}:")
    conf_teams = results_df[results_df['Conference'] == conf]
    playoff_teams_conf = conf_teams[conf_teams['Predicted'] == 1]
    print(playoff_teams_conf[['Team_ID', 'Predicted', 'Actual']])

print("\nDetailed Classification Report:")
print(classification_report(actual_playoffs, predictions_10th))

# Same quantity as accuracy_10th, recomputed from the results table.
correct_predictions = results_df[results_df['Predicted'] == results_df['Actual']]
accuracy = len(correct_predictions) / len(results_df) * 100
print(f"\nOverall Accuracy: {accuracy:.2f}%")

Test set accuracy: 92.31%

Accuracy on 10th year predictions: 76.92%

Overall Predictions vs Actual Results:
    Team_ID  Conference  Predicted  Actual
0         0           0          0       1
1         2           0          1       0
2         4           0          1       0
3         5           0          1       1
4         7           0          1       1
5         8           1          1       1
6        10           1          0       0
7        11           0          0       0
8        13           1          1       1
9        15           1          0       0
10       16           1          1       1
11       17           1          1       1
12       19           0          1       1

Teams Predicted to Make Playoffs by Conference:

Conference 0:
    Team_ID  Predicted  Actual
1         2          1       0
2         4          1       0
3         5          1       1
4         7          1       1
12       19          1       1

Conference 1:
    Team_ID  Predicted  