In [76]:
import pandas as pd

In [77]:
teams = pd.read_csv('data/selection/selected_features_teams.csv')
players = pd.read_csv('data/clean/cleaned_players.csv')
players_teams = pd.read_csv('data/clean/cleaned_players_teams.csv')
coaches = pd.read_csv('data/clean/cleaned_coaches.csv')

In [78]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    team_coaches = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    total_games = team_coaches['won'].sum() + team_coaches['lost'].sum()
    
    total_coach_experience = 0
    
    for _, coach in team_coaches.iterrows():
        coach_history = coaches[(coaches['coachID'] == coach['coachID']) & (coaches['year'] < year)]
        coach_history = coach_history.sort_values(by='year', ascending=False).head(year)

        weights = list(range(year, 0, -1)) 
        weighted_winrate = sum(coach_history['winrate'] * weights[:len(coach_history)])
        total_awards = coach_history['TotalAwards'].sum()
        coach_experience = weighted_winrate + total_awards
        
        coach_games = coach['won'] + coach['lost']
        coach_weight = coach_games / total_games if total_games > 0 else 0
        total_coach_experience += coach_experience * coach_weight
    
    return total_coach_experience


### Predict the Team Year Stats 

In [89]:
def predict_team_year_stats(team_id, year): 

    predicted_stats = teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year - 1)].copy()
    predicted_stats['year'] = year

    if teams[(teams['tmID'] == team_id) & (teams['year'] == year - 1)].empty:
       print(f"Team {team_id} is new in year {year}. Using average rookie team stats.")
       predicted_stats = teams.loc[(teams['tmID'] == 'average_rookie_team')].copy()
       predicted_stats['year'] = year
       predicted_stats['franchID'] = teams[teams['tmID'] == team_id]['franchID'].iloc[0]
       predicted_stats['year'] = year
       predicted_stats['tmID'] = team_id
       print(predicted_stats)

    # Select player ids for the team for that year
    players_ids = players_teams[(players_teams['tmID'] == team_id) & (players_teams['year'] == year)]['playerID']
    # Select player stats for last year
    team_players = players_teams[(players_teams['playerID'].isin(players_ids)) & (players_teams['year'] == year - 1)]
    team_players_bio = players[(players['bioID'].isin(players_ids))]
    
    # Selecionar stats para jogadores considerando o ano mais recente possível
    team_players = []
    for player_id in players_ids:
        player_stats = players_teams[players_teams['playerID'] == player_id]
        if not player_stats.empty:
            recent_stats = player_stats[player_stats['year'] < year].sort_values('year', ascending=False).head(1)
            if not recent_stats.empty:
                team_players.append(recent_stats)
            else:
                print(f'No stats for player {player_id}')
                team_players.append(players_teams[players_teams['playerID'] == 'average_rookie'])         


    
    # Combinar as estatísticas de todos os jogadores
    team_players = pd.concat(team_players, ignore_index=True)
    
    
    # Calculate the player stats
    predicted_stats['player_average_height'] = team_players_bio['height'].mean()
    predicted_stats['player_average_weight'] = team_players_bio['weight'].mean()
    predicted_stats['player_total_GP'] = team_players['GP'].sum()
    predicted_stats['player_total_GS'] = team_players['GS'].sum()
    predicted_stats['player_total_points'] = team_players['points'].sum()
    predicted_stats['player_total_oRebounds'] = team_players['oRebounds'].sum()
    predicted_stats['player_total_dRebounds'] = team_players['dRebounds'].sum()
    predicted_stats['player_total_rebounds'] = team_players['rebounds'].sum()
    predicted_stats['player_total_assists'] = team_players['assists'].sum()
    predicted_stats['player_total_steals'] = team_players['steals'].sum()
    predicted_stats['player_total_blocks'] = team_players['blocks'].sum()
    predicted_stats['player_total_turnovers'] = team_players['turnovers'].sum()
    predicted_stats['player_total_PF'] = team_players['PF'].sum()
    predicted_stats['player_total_fgAttempted'] = team_players['fgAttempted'].sum()
    predicted_stats['player_total_fgMade'] = team_players['fgMade'].sum()
    predicted_stats['player_total_ftAttempted'] = team_players['ftAttempted'].sum()
    predicted_stats['player_total_ftMade'] = team_players['ftMade'].sum()
    predicted_stats['player_total_threeAttempted'] = team_players['threeAttempted'].sum()
    predicted_stats['player_total_threeMade'] = team_players['threeMade'].sum()
    predicted_stats['player_total_dq'] = team_players['dq'].sum()
    predicted_stats['player_total_PostGP'] = team_players['PostGP'].sum()
    predicted_stats['player_total_PostGS'] = team_players['PostGS'].sum()
    predicted_stats['player_total_PostMinutes'] = team_players['PostMinutes'].sum()
    predicted_stats['player_total_PostPoints'] = team_players['PostPoints'].sum()
    predicted_stats['player_total_PostoRebounds'] = team_players['PostoRebounds'].sum()
    predicted_stats['player_total_PostdRebounds'] = team_players['PostdRebounds'].sum()
    predicted_stats['player_total_PostRebounds'] = team_players['PostRebounds'].sum()
    predicted_stats['player_total_PostAssists'] = team_players['PostAssists'].sum()
    predicted_stats['player_total_PostSteals'] = team_players['PostSteals'].sum()
    predicted_stats['player_total_PostBlocks'] = team_players['PostBlocks'].sum()
    predicted_stats['player_total_PostTurnovers'] = team_players['PostTurnovers'].sum()
    predicted_stats['player_total_PostPF'] = team_players['PostPF'].sum()
    predicted_stats['player_total_PostfgAttempted'] = team_players['PostfgAttempted'].sum()
    predicted_stats['player_total_PostfgMade'] = team_players['PostfgMade'].sum()
    predicted_stats['player_total_PostftAttempted'] = team_players['PostftAttempted'].sum()
    predicted_stats['player_total_PostftMade'] = team_players['PostftMade'].sum()
    predicted_stats['player_total_PostthreeAttempted'] = team_players['PostthreeAttempted'].sum()
    predicted_stats['player_total_PostthreeMade'] = team_players['PostthreeMade'].sum()
    predicted_stats['player_total_PostDQ'] = team_players['PostDQ'].sum()
    predicted_stats['player_total_awards'] = team_players['TotalAwards'].sum()
    
    coach_experience = calculate_coach_experience_for_team(coaches, team_id, year)
    predicted_stats['coach_experience'] = coach_experience

    
    predicted_stats['playoff'] = ""
    predicted_stats['firstRound'] = ""
    predicted_stats['semis'] = ""
    predicted_stats['finals'] = ""
    
    return predicted_stats


# Function that returns a dataframe with all team stats for every year from 1 to year-1 plus the predicted stats for year
def get_year_predictions(year):
    team_predictions = []  # Use a list to collect rows
    for index, row in teams.iterrows():
        if row['year'] < year:
            team_predictions.append(
                teams.loc[(teams['tmID'] == row['tmID']) & (teams['year'] == row['year'])]
            )
        elif row['year'] == year:
            predicted_stats = predict_team_year_stats(row['tmID'], year)
            team_predictions.append(predicted_stats)
            
    return pd.concat(team_predictions, ignore_index=True)

# save 10 year

# Get the predictions for year 10 and save them to data/clean/year_7_predictions.csv
year_10_predictions = get_year_predictions(8)


No stats for player coggicl01w
No stats for player joensca01w
No stats for player pricear01w
No stats for player raymost01w
No stats for player thomaca01w
No stats for player chambco01w
No stats for player hairska01w
No stats for player maltsev01w
No stats for player lattaiv01w
No stats for player sandeam01w
No stats for player smithty01w
No stats for player davisde01w
No stats for player granter01w
No stats for player shielas01w
No stats for player balesal01w
No stats for player fernama01w
No stats for player spencsi01w
No stats for player hardili01w
No stats for player moorena01w
No stats for player murphsh01w
No stats for player quinnno01w
No stats for player bowenli01w
No stats for player davenje01w
No stats for player doronsh01w
No stats for player jacksti02w
No stats for player weberma01w
No stats for player littlca01w
No stats for player gearlka01w
No stats for player goringi01w
No stats for player mosbybe01w


### Model: Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("Shape of the data: ", year_10_predictions.shape)
train_data = year_10_predictions[year_10_predictions['year'] <= 7].dropna(subset=['playoff'])
X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
y = train_data['playoff'] 

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)
year_10_data = year_10_predictions[year_10_predictions['year'] == 8].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
predictions = model.predict(year_10_data)

year_10_predictions.loc[year_10_predictions['year'] == 8, 'playoff'] = predictions

real_values = teams[teams['year'] == 8]['playoff']

# Calculate the metrics
accuracy = accuracy_score(real_values, predictions)
precision = precision_score(real_values, predictions, pos_label='Y')  
recall = recall_score(real_values, predictions, pos_label='Y')
f1 = f1_score(real_values, predictions, pos_label='Y')

print("Model: Random Forest Classifier")
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

print("\nClassification Report:")
print(classification_report(real_values, predictions, target_names=['N', 'Y']))

year_10_predictions.to_csv('data/clean/year_10_predictions1.csv', index=False)


Shape of the data:  (116, 75)
Model: Random Forest Classifier
Accuracy:  0.5384615384615384
Precision:  0.6
Recall:  0.75
F1 Score:  0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           N       0.33      0.20      0.25         5
           Y       0.60      0.75      0.67         8

    accuracy                           0.54        13
   macro avg       0.47      0.47      0.46        13
weighted avg       0.50      0.54      0.51        13



### Model: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

train_data = year_10_predictions[year_10_predictions['year'] <= 9].dropna(subset=['playoff'])
X_train = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
y_train = train_data['playoff']

logreg_model = LogisticRegression(random_state=42)

logreg_model.fit(X_train, y_train)

X_test = year_10_predictions[year_10_predictions['year'] == 10].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
real_values = teams[teams['year'] == 10]['playoff']

X_test = X_test[X_train.columns]  

logreg_predictions = logreg_model.predict(X_test)

logreg_accuracy = accuracy_score(real_values, logreg_predictions)
logreg_precision = precision_score(real_values, logreg_predictions, pos_label='Y')
logreg_recall = recall_score(real_values, logreg_predictions, pos_label='Y')
logreg_f1 = f1_score(real_values, logreg_predictions, pos_label='Y')

print("Model: Logistic Regression")
print(f"Accuracy: {logreg_accuracy}")
print(f"Precision: {logreg_precision}")
print(f"Recall: {logreg_recall}")
print(f"F1 Score: {logreg_f1}")
print("\nClassification Report:")
print(classification_report(real_values, logreg_predictions, target_names=['N', 'Y']))




Model: Logistic Regression
Accuracy: 0.6153846153846154
Precision: 0.6666666666666666
Recall: 0.75
F1 Score: 0.7058823529411765

Classification Report:
              precision    recall  f1-score   support

           N       0.50      0.40      0.44         5
           Y       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



### Model: SVM (Support Vector Machine)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

train_data = year_10_predictions[year_10_predictions['year'] <= 9].dropna(subset=['playoff'])
X_train = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
y_train = train_data['playoff']

svm_model = SVC(random_state=42)

svm_model.fit(X_train, y_train)

X_test = year_10_predictions[year_10_predictions['year'] == 10].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
real_values = teams[teams['year'] == 10]['playoff']

X_test = X_test[X_train.columns]  

svm_predictions = svm_model.predict(X_test)

svm_accuracy = accuracy_score(real_values, svm_predictions)
svm_precision = precision_score(real_values, svm_predictions, pos_label='Y')
svm_recall = recall_score(real_values, svm_predictions, pos_label='Y')
svm_f1 = f1_score(real_values, svm_predictions, pos_label='Y')

print("\nModel: Support Vector Machine")
print(f"Accuracy: {svm_accuracy}")
print(f"Precision: {svm_precision}")
print(f"Recall: {svm_recall}")
print(f"F1 Score: {svm_f1}")
print("\nClassification Report:")
print(classification_report(real_values, svm_predictions, target_names=['N', 'Y']))



Model: Support Vector Machine
Accuracy: 0.6153846153846154
Precision: 0.6666666666666666
Recall: 0.75
F1 Score: 0.7058823529411765

Classification Report:
              precision    recall  f1-score   support

           N       0.50      0.40      0.44         5
           Y       0.67      0.75      0.71         8

    accuracy                           0.62        13
   macro avg       0.58      0.57      0.58        13
weighted avg       0.60      0.62      0.61        13



### Model: KNN (K-Nearest Neighbors)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

train_data = year_10_predictions[year_10_predictions['year'] <= 9].dropna(subset=['playoff'])
X_train = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
y_train = train_data['playoff']

knn_model = KNeighborsClassifier()

knn_model.fit(X_train, y_train)

X_test = year_10_predictions[year_10_predictions['year'] == 10].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
real_values = teams[teams['year'] == 10]['playoff']

X_test = X_test[X_train.columns] 

knn_predictions = knn_model.predict(X_test)

knn_accuracy = accuracy_score(real_values, knn_predictions)
knn_precision = precision_score(real_values, knn_predictions, pos_label='Y')
knn_recall = recall_score(real_values, knn_predictions, pos_label='Y')
knn_f1 = f1_score(real_values, knn_predictions, pos_label='Y')

print("\nModel: K-Nearest Neighbors")
print(f"Accuracy: {knn_accuracy}")
print(f"Precision: {knn_precision}")
print(f"Recall: {knn_recall}")
print(f"F1 Score: {knn_f1}")
print("\nClassification Report:")
print(classification_report(real_values, knn_predictions, target_names=['N', 'Y']))



Model: K-Nearest Neighbors
Accuracy: 0.5384615384615384
Precision: 0.625
Recall: 0.625
F1 Score: 0.625

Classification Report:
              precision    recall  f1-score   support

           N       0.40      0.40      0.40         5
           Y       0.62      0.62      0.62         8

    accuracy                           0.54        13
   macro avg       0.51      0.51      0.51        13
weighted avg       0.54      0.54      0.54        13



### Model: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

train_data = year_10_predictions[year_10_predictions['year'] <= 9].dropna(subset=['playoff'])
X_train = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
y_train = train_data['playoff']

dt_model = DecisionTreeClassifier(random_state=42)

dt_model.fit(X_train, y_train)

X_test = year_10_predictions[year_10_predictions['year'] == 10].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
real_values = teams[teams['year'] == 10]['playoff']

X_test = X_test[X_train.columns] 

dt_predictions = dt_model.predict(X_test)

dt_accuracy = accuracy_score(real_values, dt_predictions)
dt_precision = precision_score(real_values, dt_predictions, pos_label='Y')
dt_recall = recall_score(real_values, dt_predictions, pos_label='Y')
dt_f1 = f1_score(real_values, dt_predictions, pos_label='Y')

print("\nModel: Decision Tree")
print(f"Accuracy: {dt_accuracy}")
print(f"Precision: {dt_precision}")
print(f"Recall: {dt_recall}")
print(f"F1 Score: {dt_f1}")
print("\nClassification Report:")
print(classification_report(real_values, dt_predictions, target_names=['N', 'Y']))



Model: Decision Tree
Accuracy: 0.5384615384615384
Precision: 0.6
Recall: 0.75
F1 Score: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

           N       0.33      0.20      0.25         5
           Y       0.60      0.75      0.67         8

    accuracy                           0.54        13
   macro avg       0.47      0.47      0.46        13
weighted avg       0.50      0.54      0.51        13

