In [436]:
import pandas as pd

In [437]:
# Load the input tables used by every cell below (paths are relative to the
# notebook's working directory).
# Team-level rows; later code reads 'tmID', 'year', 'franchID' and 'playoff' from it.
teams = pd.read_csv('data/selection/selected_features_teams.csv')
# Player bio table; later code reads 'bioID', 'height' and 'weight' from it.
players = pd.read_csv('data/clean/cleaned_players.csv')
# Per-player stat rows keyed by 'playerID', 'tmID' and 'year' (presumably one row per stint — verify).
players_teams = pd.read_csv('data/clean/cleaned_players_teams.csv')
# Coach rows; later code reads 'coachID', 'tmID', 'year', 'won', 'lost', 'winrate' and 'TotalAwards'.
coaches = pd.read_csv('data/clean/cleaned_coaches.csv')

In [438]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    """Score the prior experience of the coaches a team used in a given year.

    Each coach who has a row for (`team_id`, `year`) gets an individual score
    built only from that coach's rows with an earlier year:

    * the win rates of those rows, newest first, multiplied by the weights
      ``year, year - 1, ...`` and summed, plus
    * the sum of ``TotalAwards`` over the same rows.

    The individual scores are then blended, each weighted by the share of the
    team's games (``won + lost``) that the coach was in charge of that year.

    Parameters:
        coaches: DataFrame with the columns 'coachID', 'tmID', 'year', 'won',
            'lost', 'winrate' and 'TotalAwards'.
        team_id: value matched against ``coaches['tmID']``.
        year: season being scored; only rows with a smaller year count as history.

    Returns:
        The blended score, or 0 when the team has no coach rows for `year`.
    """
    stints = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    games_all_coaches = stints['won'].sum() + stints['lost'].sum()

    blended_score = 0

    for _, stint in stints.iterrows():
        same_coach_earlier = (coaches['coachID'] == stint['coachID']) & (coaches['year'] < year)
        # Newest season first, capped at `year` rows so it never outruns the weights.
        history = coaches[same_coach_earlier].sort_values(by='year', ascending=False).head(year)

        # The newest past season carries weight `year`, the next `year - 1`, and so on.
        recency_weights = list(range(year, 0, -1))
        weighted_rates = history['winrate'] * recency_weights[:len(history)]
        individual_score = sum(weighted_rates) + history['TotalAwards'].sum()

        games_this_coach = stint['won'] + stint['lost']
        if games_all_coaches > 0:
            share_of_games = games_this_coach / games_all_coaches
        else:
            share_of_games = 0
        blended_score += individual_score * share_of_games

    return blended_score


### Predict the Team Year Stats 

In [439]:
# Columns of `players_teams` that are summed over the roster; each one is
# written to the prediction row as 'player_total_<column>'.
_PLAYER_SUM_COLUMNS = (
    'GP', 'GS', 'points', 'oRebounds', 'dRebounds', 'rebounds', 'assists',
    'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade',
    'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade', 'dq',
    'PostGP', 'PostGS', 'PostMinutes', 'PostPoints', 'PostoRebounds',
    'PostdRebounds', 'PostRebounds', 'PostAssists', 'PostSteals',
    'PostBlocks', 'PostTurnovers', 'PostPF', 'PostfgAttempted', 'PostfgMade',
    'PostftAttempted', 'PostftMade', 'PostthreeAttempted', 'PostthreeMade',
    'PostDQ',
)


def predict_team_year_stats(team_id, year):
    """Build the feature row used to predict `team_id`'s outcome in `year`.

    Only information from before `year` is used for the statistics:

    * Team stats are copied from the team's most recent earlier row in the
      module-level `teams` frame, or from the 'average_rookie_team' row when
      the team has no earlier row.
    * Player totals are summed over the players listed for (`team_id`, `year`)
      in `players_teams`, using each player's most recent earlier row, or the
      'average_rookie' row when the player has none.
    * Average height/weight come from `players` for that same roster.
    * 'coach_experience' comes from `calculate_coach_experience_for_team`.

    The outcome columns 'playoff', 'firstRound', 'semis' and 'finals' are
    blanked to "" so the row carries no result for `year`.

    Parameters:
        team_id: team identifier matched against the 'tmID' columns.
        year: season to build the row for.

    Returns:
        DataFrame holding the baseline row(s) (normally exactly one) with the
        columns described above overwritten.

    Raises:
        IndexError: if `team_id` has no row at all in `teams`, because its
            'franchID' cannot be looked up.
    """
    # Team stats: most recent earlier season, else the average rookie team.
    earlier_seasons = teams[(teams['tmID'] == team_id) & (teams['year'] < year)]
    if not earlier_seasons.empty:
        baseline = earlier_seasons.sort_values('year', ascending=False).head(1)
    else:
        print(f"Team {team_id} is new in year {year}. Using average rookie team stats.")
        baseline = teams[teams['tmID'] == 'average_rookie_team']

    # concat returns a fresh frame, so the assignments below never write into `teams`.
    predicted_stats = pd.concat([baseline], ignore_index=True)

    predicted_stats['year'] = year
    predicted_stats['tmID'] = team_id
    predicted_stats['franchID'] = teams[teams['tmID'] == team_id]['franchID'].iloc[0]

    print(predicted_stats)

    # Roster for the target season.
    on_roster = (players_teams['tmID'] == team_id) & (players_teams['year'] == year)
    players_ids = players_teams[on_roster]['playerID']
    team_players_bio = players[players['bioID'].isin(players_ids)]

    # Player stats: most recent earlier season, else the average rookie line.
    rookie_line = players_teams[players_teams['playerID'] == 'average_rookie']
    last_known_seasons = []
    for player_id in players_ids:
        earlier_rows = players_teams[
            (players_teams['playerID'] == player_id) & (players_teams['year'] < year)
        ]
        most_recent = earlier_rows.sort_values('year', ascending=False).head(1)
        if most_recent.empty:
            print(f'No stats for player {player_id}')
            last_known_seasons.append(rookie_line)
        else:
            last_known_seasons.append(most_recent)

    if last_known_seasons:
        team_players = pd.concat(last_known_seasons, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; an empty slice keeps
        # the columns so every total below comes out as 0.
        print(f'No players listed for team {team_id} in year {year}')
        team_players = players_teams.iloc[0:0]

    # Roster-level aggregates.
    predicted_stats['player_average_height'] = team_players_bio['height'].mean()
    predicted_stats['player_average_weight'] = team_players_bio['weight'].mean()
    for stat_column in _PLAYER_SUM_COLUMNS:
        predicted_stats[f'player_total_{stat_column}'] = team_players[stat_column].sum()
    predicted_stats['player_total_awards'] = team_players['TotalAwards'].sum()

    predicted_stats['coach_experience'] = calculate_coach_experience_for_team(coaches, team_id, year)

    # Blank the outcomes copied over from the baseline season.
    for outcome_column in ('playoff', 'firstRound', 'semis', 'finals'):
        predicted_stats[outcome_column] = ""

    return predicted_stats


def get_year_predictions(year):
    """Return the real team rows before `year` plus predicted rows for `year`.

    Walks the module-level `teams` frame in its stored order. Rows with an
    earlier year are passed through unchanged; each row whose year equals
    `year` is replaced by the output of `predict_team_year_stats` for that
    team; rows with a later year are dropped.

    Each past row is taken by position, so it appears exactly once even if a
    (tmID, year) pair is repeated in `teams`, and no full-table scan is done
    per row.

    Parameters:
        year: the season to predict.

    Returns:
        DataFrame with a fresh 0..n-1 index, in the same row order as `teams`.
        An empty frame with the columns of `teams` is returned when no row
        qualifies.
    """
    collected_frames = []
    season_and_team = zip(teams['year'], teams['tmID'])
    for position, (row_year, row_team) in enumerate(season_and_team):
        if row_year < year:
            # Double brackets keep the result a one-row DataFrame with its dtypes.
            collected_frames.append(teams.iloc[[position]])
        elif row_year == year:
            collected_frames.append(predict_team_year_stats(row_team, year))

    if not collected_frames:
        # pd.concat raises ValueError on an empty list.
        return teams.iloc[0:0].copy()

    return pd.concat(collected_frames, ignore_index=True)

# Build the year-10 dataset

# Real rows for the years before 10 plus predicted rows for year 10.
# Nothing is written to disk here; the CSV is saved later inside evaluate_RFC_model.
year_10_predictions = get_year_predictions(10)


   year tmID franchID confID  rank playoff firstRound semis finals  won  ...  \
0    10  ATL      ATL     EA   7.0       N        NaN   NaN    NaN  4.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.747586  0.337793    0.315692    0.684308  0.449867  0.739651  0.342629   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.316996    0.683004    -345.0  

[1 rows x 75 columns]
No stats for player lehnish01w
No stats for player mccouan01w
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    10  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  12.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.694581  0.322581     0.31677     0.68323  0.416251  0.772519  0.334646   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.315517    0.684483     -38.0  

[1 rows x 75 columns]
No stats for player nanch01w
No stats for player tolivkr01w
   year tmID franchID confID  rank pla

### Model: Random Forest Classifier

In [440]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_RFC_model(year, year_10_predictions):
    """Fit a random forest on the years before `year` and score its picks for `year`.

    Training rows are those with ``year <= year - 1`` and a non-null 'playoff'.
    The rows for `year` are predicted and compared with the real 'playoff'
    values for that year in the module-level `teams` frame; this relies on both
    frames listing that year's teams in the same order.

    Side effects:
        * Overwrites 'playoff' for the `year` rows of `year_10_predictions`
          in place with the predicted labels.
        * Writes the whole frame to 'data/clean/year_<year>_predictions1.csv'.
        * Prints the metrics and the classification report.

    Parameters:
        year: season to evaluate.
        year_10_predictions: frame produced by `get_year_predictions` (despite
            the name it may be built for any year).
    """
    print("Shape of the data: ", year_10_predictions.shape)

    # Identifier and outcome columns that must not be used as features.
    non_feature_columns = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    train_data = year_10_predictions[year_10_predictions['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_feature_columns)
    y = train_data['playoff']

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    is_target_year = year_10_predictions['year'] == year
    target_features = year_10_predictions[is_target_year].drop(columns=non_feature_columns)
    predictions = model.predict(target_features)

    year_10_predictions.loc[is_target_year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Random Forest Classifier with n_estimators=100 and random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # Passing `labels` pins each display name to its class and avoids the
    # ValueError raised when one class is missing from both truth and predictions.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

    # The file name follows `year`; for year 10 it is unchanged.
    year_10_predictions.to_csv(f'data/clean/year_{year}_predictions1.csv', index=False)

# Fit and score the random forest for year 10; this call also writes data/clean/year_10_predictions1.csv.
evaluate_RFC_model(10, year_10_predictions)


Shape of the data:  (143, 75)

Results for year 10:
Model: Random Forest Classifier with n_estimators=100 and random_state=42
Accuracy:  0.6153846153846154
Precision:  0.6666666666666666
Recall:  0.75
F1 Score:  0.7058823529411765

Classification Report:
              precision    recall  f1-score   support

           N       0.50      0.40      0.44         5
           Y       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



### Model: Logistic Regression

In [441]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_LR_model(year, year_10_predictions):
    """Fit logistic regression on the years before `year` and score its picks for `year`.

    Training rows are those with ``year <= year - 1`` and a non-null 'playoff'.
    The rows for `year` are predicted and compared with the real 'playoff'
    values for that year in the module-level `teams` frame; this relies on both
    frames listing that year's teams in the same order.

    Side effects:
        * Overwrites 'playoff' for the `year` rows of `year_10_predictions`
          in place with the predicted labels.
        * Prints the metrics and the classification report.

    Parameters:
        year: season to evaluate.
        year_10_predictions: frame produced by `get_year_predictions` (despite
            the name it may be built for any year).
    """
    print("Shape of the data: ", year_10_predictions.shape)

    # Identifier and outcome columns that must not be used as features.
    non_feature_columns = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    train_data = year_10_predictions[year_10_predictions['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_feature_columns)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled and max_iter is left at its
    # default; check for a ConvergenceWarning and consider scaling — TODO confirm.
    model = LogisticRegression(random_state=42)
    model.fit(X, y)

    is_target_year = year_10_predictions['year'] == year
    target_features = year_10_predictions[is_target_year].drop(columns=non_feature_columns)
    predictions = model.predict(target_features)

    year_10_predictions.loc[is_target_year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Logistic Regression with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # Passing `labels` pins each display name to its class and avoids the
    # ValueError raised when one class is missing from both truth and predictions.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))



# Fit and score logistic regression for year 10 on the same dataset.
evaluate_LR_model(10, year_10_predictions)


Shape of the data:  (143, 75)

Results for year 10:
Model: Logistic Regression with random_state=42
Accuracy:  0.6153846153846154
Precision:  0.6666666666666666
Recall:  0.75
F1 Score:  0.7058823529411765

Classification Report:
              precision    recall  f1-score   support

           N       0.50      0.40      0.44         5
           Y       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



### Model: SVM (Support Vector Machine)

In [442]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_SVM_model(year, year_10_predictions):
    """Fit a support vector classifier on the years before `year` and score its picks for `year`.

    Training rows are those with ``year <= year - 1`` and a non-null 'playoff'.
    The rows for `year` are predicted and compared with the real 'playoff'
    values for that year in the module-level `teams` frame; this relies on both
    frames listing that year's teams in the same order.

    Side effects:
        * Overwrites 'playoff' for the `year` rows of `year_10_predictions`
          in place with the predicted labels.
        * Prints the metrics and the classification report.

    Parameters:
        year: season to evaluate.
        year_10_predictions: frame produced by `get_year_predictions` (despite
            the name it may be built for any year).
    """
    print("Shape of the data: ", year_10_predictions.shape)

    # Identifier and outcome columns that must not be used as features.
    non_feature_columns = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    train_data = year_10_predictions[year_10_predictions['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_feature_columns)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled; SVC is sensitive to feature
    # scale, so standardising first may be worth trying — TODO confirm.
    model = SVC(random_state=42)
    model.fit(X, y)

    is_target_year = year_10_predictions['year'] == year
    target_features = year_10_predictions[is_target_year].drop(columns=non_feature_columns)
    predictions = model.predict(target_features)

    year_10_predictions.loc[is_target_year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Support Vector Machine with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # Passing `labels` pins each display name to its class and avoids the
    # ValueError raised when one class is missing from both truth and predictions.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Fit and score the support vector classifier for year 10 on the same dataset.
evaluate_SVM_model(10, year_10_predictions)

Shape of the data:  (143, 75)

Results for year 10:
Model: Support Vector Machine with random_state=42
Accuracy:  0.6153846153846154
Precision:  0.6666666666666666
Recall:  0.75
F1 Score:  0.7058823529411765

Classification Report:
              precision    recall  f1-score   support

           N       0.50      0.40      0.44         5
           Y       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



### Model: KNN (K-Nearest Neighbors)

In [443]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_KNN_model(year, year_10_predictions):
    """Fit a 3-nearest-neighbours classifier on the years before `year` and score its picks for `year`.

    Training rows are those with ``year <= year - 1`` and a non-null 'playoff'.
    The rows for `year` are predicted and compared with the real 'playoff'
    values for that year in the module-level `teams` frame; this relies on both
    frames listing that year's teams in the same order.

    Side effects:
        * Overwrites 'playoff' for the `year` rows of `year_10_predictions`
          in place with the predicted labels.
        * Prints the metrics and the classification report.

    Parameters:
        year: season to evaluate.
        year_10_predictions: frame produced by `get_year_predictions` (despite
            the name it may be built for any year).
    """
    print("Shape of the data: ", year_10_predictions.shape)

    # Identifier and outcome columns that must not be used as features.
    non_feature_columns = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    train_data = year_10_predictions[year_10_predictions['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_feature_columns)
    y = train_data['playoff']

    # NOTE(review): features are passed unscaled; neighbour distances will be
    # dominated by the largest-magnitude columns — TODO confirm this is intended.
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)

    is_target_year = year_10_predictions['year'] == year
    target_features = year_10_predictions[is_target_year].drop(columns=non_feature_columns)
    predictions = model.predict(target_features)

    year_10_predictions.loc[is_target_year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: K-Nearest Neighbors with n_neighbors=3")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # Passing `labels` pins each display name to its class and avoids the
    # ValueError raised when one class is missing from both truth and predictions.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Fit and score the k-nearest-neighbours classifier for year 10 on the same dataset.
evaluate_KNN_model(10, year_10_predictions)


Shape of the data:  (143, 75)

Results for year 10:
Model: K-Nearest Neighbors with n_neighbors=3
Accuracy:  0.5384615384615384
Precision:  0.625
Recall:  0.625
F1 Score:  0.625

Classification Report:
              precision    recall  f1-score   support

           N       0.40      0.40      0.40         5
           Y       0.62      0.62      0.62         8

    accuracy                           0.54        13
   macro avg       0.51      0.51      0.51        13
weighted avg       0.54      0.54      0.54        13



### Model: Decision Tree

In [444]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_DTC_model(year, year_10_predictions):
    """Fit a decision tree on the years before `year` and score its picks for `year`.

    Training rows are those with ``year <= year - 1`` and a non-null 'playoff'.
    The rows for `year` are predicted and compared with the real 'playoff'
    values for that year in the module-level `teams` frame; this relies on both
    frames listing that year's teams in the same order.

    Side effects:
        * Overwrites 'playoff' for the `year` rows of `year_10_predictions`
          in place with the predicted labels.
        * Prints the metrics and the classification report.

    Parameters:
        year: season to evaluate.
        year_10_predictions: frame produced by `get_year_predictions` (despite
            the name it may be built for any year).
    """
    print("Shape of the data: ", year_10_predictions.shape)

    # Identifier and outcome columns that must not be used as features.
    non_feature_columns = ['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank']
    class_labels = ['N', 'Y']

    train_data = year_10_predictions[year_10_predictions['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=non_feature_columns)
    y = train_data['playoff']

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X, y)

    is_target_year = year_10_predictions['year'] == year
    target_features = year_10_predictions[is_target_year].drop(columns=non_feature_columns)
    predictions = model.predict(target_features)

    year_10_predictions.loc[is_target_year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    # Calculate the metrics
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Decision Tree Classifier with random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    # Passing `labels` pins each display name to its class and avoids the
    # ValueError raised when one class is missing from both truth and predictions.
    print(classification_report(real_values, predictions, labels=class_labels, target_names=class_labels))

# Fit and score the decision tree for year 10 on the same dataset.
evaluate_DTC_model(10, year_10_predictions)


Shape of the data:  (143, 75)

Results for year 10:
Model: Decision Tree Classifier with random_state=42
Accuracy:  0.5384615384615384
Precision:  0.6
Recall:  0.75
F1 Score:  0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           N       0.33      0.20      0.25         5
           Y       0.60      0.75      0.67         8

    accuracy                           0.54        13
   macro avg       0.47      0.47      0.46        13
weighted avg       0.50      0.54      0.51        13

