In [613]:
import pandas as pd

In [614]:
# Input tables, read once for the whole notebook: feature-selected team
# seasons, player bios, per-season player lines and coach seasons.
teams, players, players_teams, coaches = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/selection/selected_features_teams.csv',
        'data/clean/cleaned_players.csv',
        'data/clean/cleaned_players_teams.csv',
        'data/clean/cleaned_coaches.csv',
    )
)

In [615]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    """Score the coaching experience available to ``team_id`` in season ``year``.

    Every coach listed for the team in ``year`` gets an individual score built
    only from seasons strictly before ``year``: the recency-weighted sum of
    their ``winrate`` values plus the sum of their ``TotalAwards``. The team
    score is the average of those individual scores, weighted by each coach's
    share of the team's games (``won + lost``) in ``year``.

    Parameters
    ----------
    coaches : pandas.DataFrame
        Must provide the columns ``tmID``, ``year``, ``coachID``, ``won``,
        ``lost``, ``winrate`` and ``TotalAwards``.
    team_id : value compared against ``coaches['tmID']``
    year : int
        Season being scored; also the largest recency weight.

    Returns
    -------
    number
        ``0`` when the team has no coach rows for ``year`` or when those rows
        account for zero games.
    """
    staff = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    games_all = staff['won'].sum() + staff['lost'].sum()

    # Recency weights: the most recent prior season is multiplied by `year`,
    # the one before by `year - 1`, and so on down to 1.
    recency = list(range(year, 0, -1))

    experience = 0
    for _, member in staff.iterrows():
        is_past_season = (coaches['coachID'] == member['coachID']) & (coaches['year'] < year)
        past = coaches[is_past_season].sort_values(by='year', ascending=False)
        past = past.head(year)  # never more rows than there are weights

        # Built-in sum (not Series.sum) to keep the original accumulation order.
        score = sum(past['winrate'] * recency[:len(past)]) + past['TotalAwards'].sum()

        if games_all > 0:
            share = (member['won'] + member['lost']) / games_all
        else:
            share = 0
        experience += score * share

    return experience


### Predict the Team Year Stats 

In [616]:
def predict_team_year_stats(team_id, year):
    """Build the one-row feature frame describing ``team_id`` going into ``year``.

    Only information from seasons before ``year`` is used for statistics; the
    roster and coaching staff are those listed for ``year`` itself.

    * Team columns come from the team's most recent season before ``year``;
      if there is none, from the ``'average_rookie_team'`` row(s) of ``teams``.
    * Each player listed for the team in ``year`` contributes their most
      recent earlier season line from ``players_teams``; a player without one
      contributes the ``average_rookie_<year>`` line instead.
    * ``player_total_*`` columns are sums over those lines,
      ``player_average_height`` / ``player_average_weight`` are means over
      the matching rows of ``players``.
    * ``coach_experience`` comes from ``calculate_coach_experience_for_team``.
    * The outcome columns ``playoff``, ``firstRound``, ``semis`` and
      ``finals`` are blanked to ``""``.

    Reads the notebook-level frames ``teams``, ``players``, ``players_teams``
    and ``coaches``. Prints progress messages and the team baseline row.

    Parameters
    ----------
    team_id : value compared against the ``tmID`` columns
    year : int
        Season to build features for.

    Returns
    -------
    pandas.DataFrame
        The prepared feature row(s) with ``year`` and ``tmID`` overwritten.

    Raises
    ------
    IndexError
        If ``team_id`` never appears in ``teams`` (no ``franchID`` to copy).
    """
    # Player-season columns that are summed into `player_total_<column>`.
    summed_columns = (
        'GP', 'GS', 'points', 'oRebounds', 'dRebounds', 'rebounds', 'assists',
        'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade',
        'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade', 'dq',
        'PostGP', 'PostGS', 'PostMinutes', 'PostPoints', 'PostoRebounds',
        'PostdRebounds', 'PostRebounds', 'PostAssists', 'PostSteals',
        'PostBlocks', 'PostTurnovers', 'PostPF', 'PostfgAttempted',
        'PostfgMade', 'PostftAttempted', 'PostftMade', 'PostthreeAttempted',
        'PostthreeMade', 'PostDQ',
    )

    # --- Team baseline: latest earlier season, else the average rookie team.
    earlier_seasons = teams[(teams['tmID'] == team_id) & (teams['year'] < year)]
    if not earlier_seasons.empty:
        baseline = earlier_seasons.sort_values('year', ascending=False).head(1)
    else:
        print(f"Team {team_id} is new in year {year}. Using average rookie team stats.")
        baseline = teams[teams['tmID'] == 'average_rookie_team']

    # concat returns a fresh frame, so the assignments below never touch `teams`.
    predicted_stats = pd.concat([baseline], ignore_index=True)
    predicted_stats['year'] = year
    predicted_stats['tmID'] = team_id
    predicted_stats['franchID'] = teams[teams['tmID'] == team_id]['franchID'].iloc[0]

    print(predicted_stats)

    # --- Roster listed for the team in `year`.
    on_roster = (players_teams['tmID'] == team_id) & (players_teams['year'] == year)
    roster_ids = players_teams[on_roster]['playerID']
    team_players_bio = players[players['bioID'].isin(roster_ids)]

    # --- One season line per rostered player.
    season_lines = []
    for player_id in roster_ids:
        career = players_teams[players_teams['playerID'] == player_id]
        if career.empty:
            # Only reachable for ids that do not compare equal to themselves
            # (e.g. NaN); such entries are skipped, as before.
            continue
        latest = career[career['year'] < year].sort_values('year', ascending=False).head(1)
        if latest.empty:
            print(f'No stats for player {player_id}')
            latest = players_teams[players_teams['playerID'] == f"average_rookie_{year}"]
        season_lines.append(latest)

    if season_lines:
        team_players = pd.concat(season_lines, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; an empty roster should simply give
        # zero totals, so fall back to a zero-row frame with the same columns.
        team_players = players_teams.iloc[0:0]

    # --- Aggregate the roster into team-level player features.
    predicted_stats['player_average_height'] = team_players_bio['height'].mean()
    predicted_stats['player_average_weight'] = team_players_bio['weight'].mean()
    for column in summed_columns:
        predicted_stats[f'player_total_{column}'] = team_players[column].sum()
    predicted_stats['player_total_awards'] = team_players['TotalAwards'].sum()

    predicted_stats['coach_experience'] = calculate_coach_experience_for_team(coaches, team_id, year)

    # Outcomes of the copied season must not leak into the target season.
    for outcome in ('playoff', 'firstRound', 'semis', 'finals'):
        predicted_stats[outcome] = ""

    return predicted_stats


def get_year_predictions(year):
    """Assemble the modelling frame for predicting season ``year``.

    Walks the notebook-level ``teams`` frame in its stored order and keeps

    * every row whose ``year`` is earlier than ``year`` unchanged, and
    * for every row whose ``year`` equals ``year``, the frame returned by
      ``predict_team_year_stats(tmID, year)`` in its place.

    Rows from later seasons are left out. Each historical row is taken by
    position, so it appears exactly once even if a (tmID, year) pair is
    repeated in ``teams``.

    Parameters
    ----------
    year : int
        Season to predict.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the kept rows with a fresh 0..n-1 index. When no row
        qualifies, an empty frame with the columns of ``teams``.
    """
    frames = []
    for position, (team_id, season) in enumerate(zip(teams['tmID'], teams['year'])):
        if season < year:
            # Positional one-row slice: keeps dtypes and avoids re-filtering
            # the whole frame for every row.
            frames.append(teams.iloc[[position]])
        elif season == year:
            frames.append(predict_team_year_stats(team_id, year))

    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return teams.iloc[0:0].copy()

    return pd.concat(frames, ignore_index=True)

# Build the modelling frame for season 9: the rows of earlier seasons as stored
# in `teams`, plus one predicted feature row per team listed for season 9.

year_prediction = get_year_predictions(9)


Team ATL is new in year 9. Using average rookie team stats.
   year tmID franchID confID  rank playoff firstRound semis finals        won  \
0     9  ATL      ATL    NaN   NaN     NaN        NaN   NaN    NaN  14.722222   

   ...  o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  \
0  ...  0.737594  0.326752    0.325021    0.674979  0.434235  0.740219   

   d_3p_pct  d_oreb_pct  d_dreb_pct   pts_diff  
0  0.335249    0.322985    0.677015 -39.444444  

[1 rows x 75 columns]
No stats for player nnamach01w
No stats for player youngta01w
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0     9  CHI      CHI     EA   6.0       N        NaN   NaN    NaN  14.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.676349  0.341912    0.328755    0.671245  0.429336  0.780374  0.382536   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.309388    0.690612     -86.0  

[1 rows x 75 columns]
No stats for player chan

### Model: Random Forest Classifier

In [617]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_RFC_model(year, year_prediction):
    """Train a random forest on seasons before ``year`` and score it on ``year``.

    Training rows are those of ``year_prediction`` with ``year <= year - 1``
    and a non-missing ``playoff``; identifier and outcome columns are removed
    from the features. Predictions for the rows of season ``year`` are
    compared with the true ``playoff`` values of that season in the
    notebook-level ``teams`` frame (row order is assumed to match).

    Side effects: prints the metrics, overwrites ``playoff`` for the season
    ``year`` rows of ``year_prediction`` *in place* with the predictions, and
    writes the whole frame to ``data/clean/year_prediction1.csv``.

    Parameters
    ----------
    year : int
        Season to predict and evaluate.
    year_prediction : pandas.DataFrame
        Frame produced by ``get_year_predictions(year)``.

    Returns
    -------
    None
    """
    # Columns that identify a row or encode the season outcome.
    non_features = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    print("Shape of the data: ", year_prediction.shape)
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_features)
    y = train_data['playoff']

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    target_rows = year_prediction['year'] == year
    target_features = year_prediction[target_rows].drop(columns=non_features)
    predictions = model.predict(target_features)

    # Deliberate in-place write: the saved CSV below carries the predictions.
    year_prediction.loc[target_rows, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Random Forest Classifier with n_estimators=100 and random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # `labels` pins the two classes so the report does not fail with a
    # target_names length mismatch when only one class shows up.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

    year_prediction.to_csv('data/clean/year_prediction1.csv', index=False)

# Score the random forest on season 9; this also overwrites the season-9 'playoff' values of year_prediction.
evaluate_RFC_model(9, year_prediction)


Shape of the data:  (130, 75)

Results for year 9:
Model: Random Forest Classifier with n_estimators=100 and random_state=42
Accuracy:  0.7142857142857143
Precision:  0.6666666666666666
Recall:  1.0
F1 Score:  0.8

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.33      0.50         6
           Y       0.67      1.00      0.80         8

    accuracy                           0.71        14
   macro avg       0.83      0.67      0.65        14
weighted avg       0.81      0.71      0.67        14



### Model: Logistic Regression

In [618]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_LR_model(year, year_prediction):
    """Train a logistic regression on seasons before ``year`` and score it on ``year``.

    Training rows are those of ``year_prediction`` with ``year <= year - 1``
    and a non-missing ``playoff``; identifier and outcome columns are removed
    from the features. Predictions for the rows of season ``year`` are
    compared with the true ``playoff`` values of that season in the
    notebook-level ``teams`` frame (row order is assumed to match).

    Side effects: prints the metrics and overwrites ``playoff`` for the season
    ``year`` rows of ``year_prediction`` *in place* with the predictions.

    Parameters
    ----------
    year : int
        Season to predict and evaluate.
    year_prediction : pandas.DataFrame
        Frame produced by ``get_year_predictions(year)``.

    Returns
    -------
    None
    """
    # Columns that identify a row or encode the season outcome.
    non_features = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    print("Shape of the data: ", year_prediction.shape)
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_features)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled; scaling may help this model
    # converge, but it would change the reported metrics.
    model = LogisticRegression(random_state=42)
    model.fit(X, y)

    target_rows = year_prediction['year'] == year
    target_features = year_prediction[target_rows].drop(columns=non_features)
    predictions = model.predict(target_features)

    # In-place write, kept for parity with the other evaluate_* functions.
    year_prediction.loc[target_rows, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Logistic Regression with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # `labels` pins the two classes so the report does not fail with a
    # target_names length mismatch when only one class shows up.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))



# Score logistic regression on season 9; this also overwrites the season-9 'playoff' values of year_prediction.
evaluate_LR_model(9, year_prediction)


Shape of the data:  (130, 75)

Results for year 9:
Model: Logistic Regression with random_state=42
Accuracy:  0.8571428571428571
Precision:  0.8
Recall:  1.0
F1 Score:  0.8888888888888888

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.67      0.80         6
           Y       0.80      1.00      0.89         8

    accuracy                           0.86        14
   macro avg       0.90      0.83      0.84        14
weighted avg       0.89      0.86      0.85        14



### Model: SVM (Support Vector Machine)

In [619]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_SVM_model(year, year_prediction):
    """Train a support vector classifier on seasons before ``year`` and score it on ``year``.

    Training rows are those of ``year_prediction`` with ``year <= year - 1``
    and a non-missing ``playoff``; identifier and outcome columns are removed
    from the features. Predictions for the rows of season ``year`` are
    compared with the true ``playoff`` values of that season in the
    notebook-level ``teams`` frame (row order is assumed to match).

    Side effects: prints the metrics and overwrites ``playoff`` for the season
    ``year`` rows of ``year_prediction`` *in place* with the predictions.

    Parameters
    ----------
    year : int
        Season to predict and evaluate.
    year_prediction : pandas.DataFrame
        Frame produced by ``get_year_predictions(year)``.

    Returns
    -------
    None
    """
    # Columns that identify a row or encode the season outcome.
    non_features = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    print("Shape of the data: ", year_prediction.shape)
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_features)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled; kernel methods are usually
    # scale-sensitive, but scaling would change the reported metrics.
    model = SVC(random_state=42)
    model.fit(X, y)

    target_rows = year_prediction['year'] == year
    target_features = year_prediction[target_rows].drop(columns=non_features)
    predictions = model.predict(target_features)

    # In-place write, kept for parity with the other evaluate_* functions.
    year_prediction.loc[target_rows, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Support Vector Machine with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # `labels` pins the two classes so the report does not fail with a
    # target_names length mismatch when only one class shows up.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Score the SVM on season 9; this also overwrites the season-9 'playoff' values of year_prediction.
evaluate_SVM_model(9, year_prediction)

Shape of the data:  (130, 75)

Results for year 9:
Model: Support Vector Machine with random_state=42
Accuracy:  0.8571428571428571
Precision:  0.875
Recall:  0.875
F1 Score:  0.875

Classification Report:
              precision    recall  f1-score   support

           N       0.83      0.83      0.83         6
           Y       0.88      0.88      0.88         8

    accuracy                           0.86        14
   macro avg       0.85      0.85      0.85        14
weighted avg       0.86      0.86      0.86        14



### Model: KNN (K-Nearest Neighbors)

In [620]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_KNN_model(year, year_prediction):
    """Train a 3-nearest-neighbours classifier on seasons before ``year`` and score it on ``year``.

    Training rows are those of ``year_prediction`` with ``year <= year - 1``
    and a non-missing ``playoff``; identifier and outcome columns are removed
    from the features. Predictions for the rows of season ``year`` are
    compared with the true ``playoff`` values of that season in the
    notebook-level ``teams`` frame (row order is assumed to match).

    Side effects: prints the metrics and overwrites ``playoff`` for the season
    ``year`` rows of ``year_prediction`` *in place* with the predictions.

    Parameters
    ----------
    year : int
        Season to predict and evaluate.
    year_prediction : pandas.DataFrame
        Frame produced by ``get_year_predictions(year)``.

    Returns
    -------
    None
    """
    # Columns that identify a row or encode the season outcome.
    non_features = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    print("Shape of the data: ", year_prediction.shape)
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_features)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled; distance-based models are
    # usually scale-sensitive, but scaling would change the reported metrics.
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)

    target_rows = year_prediction['year'] == year
    target_features = year_prediction[target_rows].drop(columns=non_features)
    predictions = model.predict(target_features)

    # In-place write, kept for parity with the other evaluate_* functions.
    year_prediction.loc[target_rows, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: K-Nearest Neighbors with n_neighbors=3")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # `labels` pins the two classes so the report does not fail with a
    # target_names length mismatch when only one class shows up.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Score KNN on season 9; this also overwrites the season-9 'playoff' values of year_prediction.
evaluate_KNN_model(9, year_prediction)


Shape of the data:  (130, 75)

Results for year 9:
Model: K-Nearest Neighbors with n_neighbors=3
Accuracy:  0.7857142857142857
Precision:  0.7272727272727273
Recall:  1.0
F1 Score:  0.8421052631578947

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.50      0.67         6
           Y       0.73      1.00      0.84         8

    accuracy                           0.79        14
   macro avg       0.86      0.75      0.75        14
weighted avg       0.84      0.79      0.77        14



### Model: Decision Tree

In [621]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_DTC_model(year, year_prediction):
    """Train a decision tree on seasons before ``year`` and score it on ``year``.

    Training rows are those of ``year_prediction`` with ``year <= year - 1``
    and a non-missing ``playoff``; identifier and outcome columns are removed
    from the features. Predictions for the rows of season ``year`` are
    compared with the true ``playoff`` values of that season in the
    notebook-level ``teams`` frame (row order is assumed to match).

    Side effects: prints the metrics and overwrites ``playoff`` for the season
    ``year`` rows of ``year_prediction`` *in place* with the predictions.

    Parameters
    ----------
    year : int
        Season to predict and evaluate.
    year_prediction : pandas.DataFrame
        Frame produced by ``get_year_predictions(year)``.

    Returns
    -------
    None
    """
    # Columns that identify a row or encode the season outcome.
    non_features = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    print("Shape of the data: ", year_prediction.shape)
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_features)
    y = train_data['playoff']

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X, y)

    target_rows = year_prediction['year'] == year
    target_features = year_prediction[target_rows].drop(columns=non_features)
    predictions = model.predict(target_features)

    # In-place write, kept for parity with the other evaluate_* functions.
    year_prediction.loc[target_rows, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Decision Tree Classifier with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # `labels` pins the two classes so the report does not fail with a
    # target_names length mismatch when only one class shows up.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Score the decision tree on season 9; this also overwrites the season-9 'playoff' values of year_prediction.
evaluate_DTC_model(9, year_prediction)


Shape of the data:  (130, 75)

Results for year 9:
Model: Decision Tree Classifier with random_state=42
Accuracy:  0.6428571428571429
Precision:  0.6153846153846154
Recall:  1.0
F1 Score:  0.7619047619047619

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.17      0.29         6
           Y       0.62      1.00      0.76         8

    accuracy                           0.64        14
   macro avg       0.81      0.58      0.52        14
weighted avg       0.78      0.64      0.56        14

