# **VI - Model Development for 11th Year**

## **V.1 Import Libraries and Load Data**

We start by importing all the necessary files to be used in the models predictions. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

coaches_11th = pd.read_csv('../data/Season_11/coaches.csv')
players_teams_11th = pd.read_csv('../data/Season_11/players_teams.csv')
teams_11th = pd.read_csv('../data/Season_11/teams.csv')

## **V.2 Cleaning Data**

We eliminated unnecessary columns for the new predictions.

In [2]:
# coaches_11th = coaches_11th.drop(columns=['stint', 'lgID', 'year'])
# players_teams_11th = players_teams_11th.drop(columns=['stint', 'lgID', 'year'])
# teams_11th = teams_11th.drop(columns=['lgID', 'franchID', 'year', 'name', 'arena'])

# coaches_11th.to_csv('../data/Season_11/coaches.csv', index=False)
# players_teams_11th.to_csv('../data/Season_11/players_teams.csv', index=False)
# teams_11th.to_csv('../data/Season_11/teams.csv', index=False)

## **V.3 Mapping Categorical Values**

We mapped values such as teams and conferences for a better and faster evaluation of our models.

In [3]:
# map_teams = {
#     'ATL': 0, 'CHI': 2, 'CON': 4, 'IND': 7, 
#     'LAS': 8, 'MIN': 10, 'NYL': 11, 'PHO': 13, 
#     'SAS': 16, 'SEA': 17, 'WAS': 19, 'TUL': 20
# }

# map_conf = {
#   'EA': 0, 'WE': 1
# }

# coaches_11th['tmID'] = coaches_11th['tmID'].map(map_teams)
# players_teams_11th['tmID'] = players_teams_11th['tmID'].map(map_teams)
# teams_11th['tmID'] = teams_11th['tmID'].map(map_teams)
# teams_11th['confID'] = teams_11th['confID'].map(map_conf)

# coaches_11th.to_csv('../data/Season_11/coaches.csv', index=False)
# players_teams_11th.to_csv('../data/Season_11/players_teams.csv', index=False)
# teams_11th.to_csv('../data/Season_11/teams.csv', index=False)

## **V.4 Import data from the past 10 years**

We imported previously cleaned data containing data from the past 10 seasons and also verified that there was no data beyond 10 years to avoid future errors.

In [4]:
awards_players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/awards_players_cleaned.csv')
coaches_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/coaches_cleaned.csv')
players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_cleaned.csv')
players_teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv')
series_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/series_post_cleaned.csv')
teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_cleaned.csv')
teams_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_post_cleaned.csv')

In [5]:
awards_players_10_years = awards_players_cleaned[awards_players_cleaned['year'] != 11]
coaches__10_years = coaches_cleaned[coaches_cleaned['year'] != 11]
players_teams_10_years = players_teams_cleaned[players_teams_cleaned['year'] != 11]
series_post_10_years = series_post_cleaned[series_post_cleaned['year'] != 11]
teams_10_years = teams_cleaned[teams_cleaned['year'] != 11]
teams_post_10_years = teams_post_cleaned[teams_post_cleaned['year'] != 11]

for df, name in [(awards_players_10_years, 'awards'), 
                 (coaches__10_years, 'coaches'),
                 (players_teams_10_years, 'players'),
                 (series_post_10_years, 'series'),
                 (teams_10_years, 'teams'),
                 (teams_post_10_years, 'teams_post')]:
    if df['year'].max() != 10:
        print(f"Warning: {name} contains data beyond year 10")

awards_players_10_years.to_csv('../data/Season_11/awards_players_10_years.csv', index=False)
coaches__10_years.to_csv('../data/Season_11/coaches_10_years.csv', index=False)
players_cleaned.to_csv('../data/Season_11/players_10_years.csv', index=False)
players_teams_10_years.to_csv('../data/Season_11/players_teams_10_years.csv', index=False)
series_post_10_years.to_csv('../data/Season_11/series_post_10_years.csv', index=False)
teams_10_years.to_csv('../data/Season_11/teams_10_years.csv', index=False)
teams_post_10_years.to_csv('../data/Season_11/teams_post_10_years.csv', index=False)

## **V.5 Calculated Overalls from 10 years**

We calculated the all-time overall averages for players, coaches over the last 10 years by grouping the data by playerID/coachID and computing the mean of their OVERALL ratings. This provided a better way to visualize all performance data received. We also calculated an average overall for rookies and an average overall for rookie coaches.

In [6]:
players_overall_avg = players_teams_10_years.groupby('playerID')['OVERALL'].mean().reset_index()

players_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

players_overall_avg['OVERALL_ALL_TIME'] = players_overall_avg['OVERALL_ALL_TIME'].round(1)

players_overall_avg.to_csv('../data/Season_11/players_overall_all_time_10_years.csv', index=False)

In [7]:
rookie_players = players_teams_10_years[players_teams_10_years['is_rookie'] == 1]

rookie_overall_avg = rookie_players['OVERALL'].mean().round(1)

rookie_overall_avg_df = pd.DataFrame({'rookie_overall_avg': [rookie_overall_avg]})

rookie_overall_avg_df.to_csv('../data/Season_11/rookie_overall_avg_10_years.csv', index=False)

In [8]:
rookie_coaches = coaches__10_years[coaches__10_years['is_rookie'] == 1]

rookie_overall_coaches_avg = rookie_coaches['OVERALL'].mean().round(1)

rookie_overall_coaches_avg_df = pd.DataFrame({'rookie_overall_coaches_avg': [rookie_overall_coaches_avg]})

rookie_overall_coaches_avg_df.to_csv('../data/Season_11/rookie_overall_coaches_avg_10_years.csv', index=False)

In [9]:
coaches_overall_avg = coaches__10_years.groupby('coachID')['OVERALL'].mean().reset_index()

coaches_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

coaches_overall_avg['OVERALL_ALL_TIME'] = coaches_overall_avg['OVERALL_ALL_TIME'].round(1)

coaches_overall_avg.to_csv('../data/Season_11/coaches_overall_all_time_10_years.csv', index=False)

## **V.6 Add overalls calculated from 10 years to the 11th year players and coach**

Afterward, given the lineups for Year 11 and the overalls of all players/coaches from the previous 10 years, we added the overalls to the lineups to provide an overview of the team quality in Year 11.

In [10]:
# players_teams = pd.read_csv('../data/Season_11/players_teams.csv')
# players_overall_all_time_10_years = pd.read_csv('../data/Season_11/players_overall_all_time_10_years.csv')

# rookie_avg = 5.6 

# players_teams = players_teams.merge(
#     players_overall_all_time_10_years[['playerID', 'OVERALL_ALL_TIME']],
#     on='playerID',
#     how='left'
# ).fillna({'OVERALL_ALL_TIME': rookie_avg})

# players_teams.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

# players_teams.to_csv('../data/Season_11/players_teams.csv', index=False)

In [11]:
# coaches = pd.read_csv('../data/Season_11/coaches.csv')
# coaches_overall_all_time_10_years = pd.read_csv('../data/Season_11/coaches_overall_all_time_10_years.csv')

# rookie_avg_coach = 9.0 

# coaches = coaches.merge(
#     coaches_overall_all_time_10_years[['coachID', 'OVERALL_ALL_TIME']],
#     on='coachID',
#     how='left'
# ).fillna({'OVERALL_ALL_TIME': rookie_avg_coach})

# coaches.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

# coaches.to_csv('../data/Season_11/coaches.csv', index=False)

## **V.7 Given the teams players composition and their respective overalls calculate the team overall (mean of all players)**

Next, using the player lineups and their respective overalls, we decided to create a team_overall that summarizes the average overall of each team.

In [12]:
players = pd.read_csv('../data/Season_11/players_teams.csv')

team_overall = players.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall['overall_team'] = team_overall['OVERALL'].round(1)

team_overall = team_overall.drop(columns=['OVERALL'])

team_overall.to_csv('../data/Season_11/team_overall.csv', index=False)

## **V.8 Given the team coaches and their respective overalls calculate the coach overall (mean of all coaches(1 or more than 1))**

Then, we did the same for coaches, as a team may have had more than one coach throughout the year.

In [13]:
coaches = pd.read_csv('../data/Season_11/coaches.csv')

team_overall_coaches = coaches.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall_coaches['overall_team_coach'] = team_overall_coaches['OVERALL'].round(1)

team_overall_coaches = team_overall_coaches.drop(columns=['OVERALL'])

team_overall_coaches.to_csv('../data/Season_11/team_overall_coaches.csv', index=False)

## **V.9 Implementing Models**

We trained several models (SVM, Decision Tree, Random Forest, Logistic Regression, and KNN) to predict playoff teams for Year 11 based on data from the past 10 years. Features like win rates, points, and coach/player overalls were used. After training, the best model was selected based on accuracy, and its adjusted predictions for playoff were outputted, sorted by team ID.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

teams_10_years = pd.read_csv('../data/Season_11/teams_10_years.csv')
players_teams_10_years = pd.read_csv('../data/Season_11/players_teams_10_years.csv')
coaches_10_years = pd.read_csv('../data/Season_11/coaches_10_years.csv')
teams_year_11 = pd.read_csv('../data/Season_11/teams.csv')
players_teams_11 = pd.read_csv('../data/Season_11/players_teams.csv')
coaches_11 = pd.read_csv('../data/Season_11/coaches.csv')

def create_features_for_team(team_data, teams_history, players_teams_data, coaches_data, year):
    features = {
        'won': team_data['won'] if 'won' in team_data else 0,
        'lost': team_data['lost'] if 'lost' in team_data else 0,
        'o_pts': team_data['o_pts'] if 'o_pts' in team_data else 0,
        'd_pts': team_data['d_pts'] if 'd_pts' in team_data else 0,
        'o_reb': team_data['o_reb'] if 'o_reb' in team_data else 0,
        'd_reb': team_data['d_reb'] if 'd_reb' in team_data else 0,
        'confID': team_data['confID']
    }
    
    team_history_data = teams_history[
        (teams_history['tmID'] == team_data['tmID']) & 
        (teams_history['year'] <= year)
    ].sort_values('year')
    
    if len(team_history_data) > 0:
        features['win_rate_3yr'] = team_history_data['won'].tail(3).mean()
        features['playoff_rate_3yr'] = team_history_data['playoff'].tail(3).mean()
        features['points_diff_3yr'] = (team_history_data['o_pts'] - team_history_data['d_pts']).tail(3).mean()
    else:
        features['win_rate_3yr'] = 0
        features['playoff_rate_3yr'] = 0
        features['points_diff_3yr'] = 0
    
    team_players = players_teams_data[
        players_teams_data['tmID'] == team_data['tmID']
    ]
    features['player_overall_avg'] = team_players['OVERALL'].mean() if len(team_players) > 0 else 0
    
    team_coach = coaches_data[
        coaches_data['tmID'] == team_data['tmID']
    ]
    features['coach_overall'] = team_coach['OVERALL'].mean() if len(team_coach) > 0 else 0
    
    return list(features.values())

X_train = []
y_train = []

for year in range(1, 10):
    current_year_teams = teams_10_years[teams_10_years['year'] == year]
    next_year_teams = teams_10_years[teams_10_years['year'] == year + 1]
    
    for _, team in current_year_teams.iterrows():
        if team['tmID'] in next_year_teams['tmID'].values:
            features = create_features_for_team(
                team, 
                teams_10_years, 
                players_teams_10_years[players_teams_10_years['year'] == year],
                coaches_10_years[coaches_10_years['year'] == year],
                year
            )
            X_train.append(features)
            next_year_playoff = next_year_teams[next_year_teams['tmID'] == team['tmID']]['playoff'].iloc[0]
            y_train.append(next_year_playoff)

year_10_teams = teams_10_years[teams_10_years['year'] == 10]
for _, team in year_10_teams.iterrows():
    features = create_features_for_team(
        team,
        teams_10_years,
        players_teams_10_years[players_teams_10_years['year'] == 10],
        coaches_10_years[coaches_10_years['year'] == 10],
        10
    )
    X_train.append(features)
    y_train.append(team['playoff'])

X_train = np.array(X_train)
y_train = np.array(y_train)

X_predict = []
for _, team in teams_year_11.iterrows():
    features = create_features_for_team(
        team,
        teams_10_years,
        players_teams_11,
        coaches_11,
        11
    )
    X_predict.append(features)
X_predict = np.array(X_predict)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_predict_scaled = scaler.transform(X_predict)

models = {
    'SVM': SVC(kernel='rbf', probability=True, C=0.5, random_state=42),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=3,
        min_samples_leaf=4,
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        min_samples_split=5,
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        C=0.8,
        max_iter=1000, 
        random_state=42
    ),
    'KNN': KNeighborsClassifier(n_neighbors=7)
}

model_predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    model.fit(X_train_scaled, y_train)
    
    train_accuracy = accuracy_score(y_train, model.predict(X_train_scaled)) * 100
    
    probs = model.predict_proba(X_predict_scaled)[:, 1]
    
    probs_adjusted = 8 * (1 - probs) / np.sum(1 - probs)

    model_predictions[name] = {
        'training_accuracy': train_accuracy,
        'probabilities': dict(zip(teams_year_11['tmID'], probs_adjusted))
    }
    
    print(f"Training Accuracy: {train_accuracy:.2f}%")
    print(f"Sum of probabilities: {np.sum(probs_adjusted):.2f}")

best_model = max(model_predictions.items(), key=lambda x: x[1]['training_accuracy'])
print(f"\nBest Model: {best_model[0]}")
print(f"Training Accuracy: {best_model[1]['training_accuracy']:.2f}%")

final_predictions = pd.DataFrame({
    'tmID': teams_year_11['tmID'],
    'Playoff': [best_model[1]['probabilities'][tmID] for tmID in teams_year_11['tmID']]
})

final_predictions = final_predictions.sort_values('tmID')
final_predictions['Playoff'] = final_predictions['Playoff'].apply(lambda x: "{:.2f}".format(x))

print("\nFinal predictions (sorted by tmID):")
print(final_predictions.to_string(index=False))

prob_sum = sum(float(x) for x in final_predictions['Playoff'])
print(f"\nSum of probabilities: {prob_sum:.2f}")


Training SVM...
Training Accuracy: 76.30%
Sum of probabilities: 8.00

Training Decision Tree...
Training Accuracy: 78.52%
Sum of probabilities: 8.00

Training Random Forest...
Training Accuracy: 93.33%
Sum of probabilities: 8.00

Training Logistic Regression...
Training Accuracy: 71.11%
Sum of probabilities: 8.00

Training KNN...
Training Accuracy: 73.33%
Sum of probabilities: 8.00

Best Model: Random Forest
Training Accuracy: 93.33%

Final predictions (sorted by tmID):
 tmID Playoff
    0    0.57
    2    0.57
    4    0.76
    7    0.76
    8    0.63
   10    0.52
   11    0.70
   13    0.77
   16    0.73
   17    0.75
   19    0.63
   20    0.62

Sum of probabilities: 8.01


In this section, we decided to use the SVM model, a new model different from the previously best-performing model. We incorporated team and coach overalls, average win/loss ratio from the past 10 years, total playoff appearances, and average rank (with higher ranks being worse for the team). Afterward, we calculated binary playoff probabilities and saved the output to a CSV file.

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

data_team = pd.read_csv('../data/Season_11/team_overall.csv')
data_coach = pd.read_csv('../data/Season_11/team_overall_coaches.csv')
data_conference = pd.read_csv('../data/Season_11/teams.csv')
data_historic = pd.read_csv('../data/Season_11/teams_10_years.csv')

data = data_team.merge(data_coach, on="tmID").merge(data_conference, on="tmID")

data_historic['win_loss_ratio'] = data_historic['won'] / (data_historic['won'] + data_historic['lost'])
win_loss_avg = data_historic.groupby('tmID')['win_loss_ratio'].mean().reset_index()
win_loss_avg.rename(columns={'win_loss_ratio': 'avg_win_loss_ratio'}, inplace=True)

data_historic['playoff_count'] = data_historic['playoff'].apply(lambda x: 1 if x == 1 else 0)
playoff_count = data_historic.groupby('tmID')['playoff_count'].sum().reset_index()
playoff_count.rename(columns={'playoff_count': 'total_playoffs'}, inplace=True)

rank_avg = data_historic.groupby('tmID')['rank'].mean().reset_index()
rank_avg.rename(columns={'rank': 'avg_rank'}, inplace=True)

data = data.merge(win_loss_avg, on='tmID', how='left')\
           .merge(playoff_count, on='tmID', how='left')\
           .merge(rank_avg, on='tmID', how='left')

data['avg_win_loss_ratio'] = data['avg_win_loss_ratio'].fillna(0)
data['total_playoffs'] = data['total_playoffs'].fillna(0)
data['avg_rank'] = data['avg_rank'].fillna(0)

data["weighted_overall"] = (
    8 * data["overall_team"] + 
    2 * data["overall_team_coach"] + 
    1 * data["avg_win_loss_ratio"] +  
    1 * data["total_playoffs"] +      
    -2 * data["avg_rank"]              
)

data["Playoff"] = 0
for conf in data["confID"].unique():
    conf_teams = data[data["confID"] == conf]
    top_teams = conf_teams.nlargest(4, "weighted_overall")
    data.loc[top_teams.index, "Playoff"] = 1

X = data[["overall_team", "overall_team_coach", "confID", "avg_win_loss_ratio", "total_playoffs", "avg_rank"]]
y = data["Playoff"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(probability=True, random_state=42))
])

print("Training SVM...")
model.fit(X_train, y_train)
print("Training complete.")

probs = model.predict_proba(X)[:, 1]
data['Playoff_Probability'] = probs

data["Playoff"] = 0

for conf in data["confID"].unique():
    conf_teams = data[data["confID"] == conf]
    top_teams = conf_teams.nlargest(4, "Playoff_Probability")
    data.loc[top_teams.index, "Playoff"] = 1

final_result = data[["tmID", "confID", "weighted_overall", "Playoff"]].sort_values(
    by=["confID", "Playoff"], ascending=[True, False]
)

print("\nConference classification (Binary Playoff):")
print(final_result[["tmID", "Playoff"]])

final_output = final_result[["tmID", "Playoff"]]
final_output.to_csv('../data/Season_11/playoff_predictions.csv', index=False)

Training SVM...
Training complete.

Conference classification (Binary Playoff):
    tmID  Playoff
2      4        1
3      7        1
6     11        1
10    19        1
0      0        0
1      2        0
5     10        1
7     13        1
8     16        1
9     17        1
4      8        0
11    20        0


Finally, we performed a reverse mapping to format the output CSV as required for the Kaggle competition.

In [15]:
map_teams = {
     'ATL': 0, 'CHI': 2, 'CON': 4, 'IND': 7, 
     'LAS': 8, 'MIN': 10, 'NYL': 11, 'PHO': 13, 
     'SAS': 16, 'SEA': 17, 'WAS': 19, 'TUL': 20
}

reverse_map_teams = {v: k for k, v in map_teams.items()}

final_output = pd.read_csv('../data/Season_11/playoff_predictions.csv')

final_output["tmID"] = final_output["tmID"].map(reverse_map_teams)
final_output = final_output.sort_values(by="tmID")

final_output.to_csv('../data/Season_11/playoff_predictions.csv', index=False)

We obtained a prediction error of 4 during the 5 days of the competition. Considering the theoretical maximum error is 12 and the best is 0, we are happy with the results we achieved. 