In [32]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

In [33]:
teams = pd.read_csv('data/selection/selected_features_teams.csv')
players = pd.read_csv('data/clean/cleaned_players.csv')
players_teams = pd.read_csv('data/clean/cleaned_players_teams.csv')
coaches = pd.read_csv('data/clean/cleaned_coaches.csv')

In [34]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    # Filtrar os treinadores para a equipe e ano fornecidos
    team_coaches = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    print("TEAM COACHES", team_coaches)
    
    # Calcular o total de jogos para o ano atual
    total_games = team_coaches['won'].sum() + team_coaches['lost'].sum()
    print("TOTAL GAMES", total_games)
    
    total_coach_experience = 0
    
    for _, coach in team_coaches.iterrows():
        # Verificar se a equipe teve um treinador no ano anterior
        previous_year_coach = coaches[(coaches['coachID'] == coach['coachID']) & (coaches['tmID'] == team_id) & (coaches['year'] == year - 1)]
        
        if previous_year_coach.empty:
            # Se não existia treinador no ano anterior, a experiência é 0
            coach_experience = 0
        else:
            # Caso contrário, pegar as estatísticas dos anos anteriores para calcular a experiência do treinador
            coach_history = coaches[(coaches['coachID'] == coach['coachID']) & (coaches['year'] < year)]
            coach_history = coach_history.sort_values(by='year', ascending=False).head(year)

            weights = list(range(year, 0, -1)) 
            weighted_winrate = sum(coach_history['winrate'] * weights[:len(coach_history)])
            total_awards = coach_history['TotalAwards'].sum()
            coach_experience = weighted_winrate + total_awards
        
        # Calcular o peso do treinador na experiência total com base nos jogos
        coach_games = coach['won'] + coach['lost']
        coach_weight = coach_games / total_games if total_games > 0 else 0
        
        total_coach_experience += coach_experience * coach_weight
    
    return total_coach_experience


### Predict the Team Year Stats 

In [35]:
def predict_team_year_stats(team_id, year): 

    # Teams stats, considering the most recent year possible and if not available, using the average rookie team stats
    team_stats = []

    team_previous_stats = teams[(teams['tmID'] == team_id) & (teams['year'] < year)]

    if not team_previous_stats.empty:
        recent_stats = team_previous_stats.sort_values('year', ascending=False).head(1)
        team_stats.append(recent_stats)
    else:
        print(f"Team {team_id} is new in year {year}. Using average rookie team stats.")
        rookie_team_stats = teams[teams['tmID'] == f"rookie_team_avg_{year}"]
        print(rookie_team_stats)
        team_stats.append(rookie_team_stats)

    predicted_stats = pd.concat(team_stats, ignore_index=True)

    predicted_stats['year'] = year
    predicted_stats['tmID'] = team_id

    team_row = teams[teams['tmID'] == team_id]

    predicted_stats['franchID'] = team_row['franchID'].iloc[0]
    predicted_stats['confID'] = team_row['confID'].iloc[0]

    print(predicted_stats)
   

    # Select player ids for the team for that year
    players_ids = players_teams[(players_teams['tmID'] == team_id) & (players_teams['year'] == year)]['playerID']
    team_players_bio = players[(players['bioID'].isin(players_ids))]
    
    # Players stats, considering the most recent year possible and if not available, using the average rookie player stats
    team_players = []
    for player_id in players_ids:
        player_stats = players_teams[players_teams['playerID'] == player_id]
        if not player_stats.empty:
            recent_stats = player_stats[player_stats['year'] < year].sort_values('year', ascending=False).head(1)
            if not recent_stats.empty:
                team_players.append(recent_stats)
            else:
                print(f'No stats for player {player_id}')
                rookie_player_stats = players_teams[players_teams['playerID'] == f"average_rookie_{year}"]  
                team_players.append(rookie_player_stats)    


    
    team_players = pd.concat(team_players, ignore_index=True)
    
    
    # Calculate the player stats
    predicted_stats['player_average_height'] = team_players_bio['height'].mean()
    predicted_stats['player_average_weight'] = team_players_bio['weight'].mean()
    predicted_stats['player_total_GP'] = team_players['GP'].sum()
    predicted_stats['player_total_GS'] = team_players['GS'].sum()
    predicted_stats['player_total_points'] = team_players['points'].sum()
    predicted_stats['player_total_oRebounds'] = team_players['oRebounds'].sum()
    predicted_stats['player_total_dRebounds'] = team_players['dRebounds'].sum()
    predicted_stats['player_total_rebounds'] = team_players['rebounds'].sum()
    predicted_stats['player_total_assists'] = team_players['assists'].sum()
    predicted_stats['player_total_steals'] = team_players['steals'].sum()
    predicted_stats['player_total_blocks'] = team_players['blocks'].sum()
    predicted_stats['player_total_turnovers'] = team_players['turnovers'].sum()
    predicted_stats['player_total_PF'] = team_players['PF'].sum()
    predicted_stats['player_total_fgAttempted'] = team_players['fgAttempted'].sum()
    predicted_stats['player_total_fgMade'] = team_players['fgMade'].sum()
    predicted_stats['player_total_ftAttempted'] = team_players['ftAttempted'].sum()
    predicted_stats['player_total_ftMade'] = team_players['ftMade'].sum()
    predicted_stats['player_total_threeAttempted'] = team_players['threeAttempted'].sum()
    predicted_stats['player_total_threeMade'] = team_players['threeMade'].sum()
    predicted_stats['player_total_dq'] = team_players['dq'].sum()
    predicted_stats['player_total_PostGP'] = team_players['PostGP'].sum()
    predicted_stats['player_total_PostGS'] = team_players['PostGS'].sum()
    predicted_stats['player_total_PostMinutes'] = team_players['PostMinutes'].sum()
    predicted_stats['player_total_PostPoints'] = team_players['PostPoints'].sum()
    predicted_stats['player_total_PostoRebounds'] = team_players['PostoRebounds'].sum()
    predicted_stats['player_total_PostdRebounds'] = team_players['PostdRebounds'].sum()
    predicted_stats['player_total_PostRebounds'] = team_players['PostRebounds'].sum()
    predicted_stats['player_total_PostAssists'] = team_players['PostAssists'].sum()
    predicted_stats['player_total_PostSteals'] = team_players['PostSteals'].sum()
    predicted_stats['player_total_PostBlocks'] = team_players['PostBlocks'].sum()
    predicted_stats['player_total_PostTurnovers'] = team_players['PostTurnovers'].sum()
    predicted_stats['player_total_PostPF'] = team_players['PostPF'].sum()
    predicted_stats['player_total_PostfgAttempted'] = team_players['PostfgAttempted'].sum()
    predicted_stats['player_total_PostfgMade'] = team_players['PostfgMade'].sum()
    predicted_stats['player_total_PostftAttempted'] = team_players['PostftAttempted'].sum()
    predicted_stats['player_total_PostftMade'] = team_players['PostftMade'].sum()
    predicted_stats['player_total_PostthreeAttempted'] = team_players['PostthreeAttempted'].sum()
    predicted_stats['player_total_PostthreeMade'] = team_players['PostthreeMade'].sum()
    predicted_stats['player_total_PostDQ'] = team_players['PostDQ'].sum()
    predicted_stats['player_total_awards'] = team_players['TotalAwards'].sum()
    
    coach_experience = calculate_coach_experience_for_team(coaches, team_id, year)
    predicted_stats['coach_experience'] = coach_experience

    
    predicted_stats['playoff'] = ""
    predicted_stats['firstRound'] = ""
    predicted_stats['semis'] = ""
    predicted_stats['finals'] = ""
    
    return predicted_stats


# Function that returns a dataframe with all team stats for every year from 1 to year-1 plus the predicted stats for year
def get_year_predictions(year):
    team_predictions = []   
    for index, row in teams.iterrows():
        if row['year'] < year:
            team_predictions.append(
                teams.loc[(teams['tmID'] == row['tmID']) & (teams['year'] == row['year'])]
            )
        elif row['year'] == year:
            predicted_stats = predict_team_year_stats(row['tmID'], year)
            team_predictions.append(predicted_stats)
            
    print(f"Predicted stats for year {year} calculated.")
    return pd.concat(team_predictions, ignore_index=True)


year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

# Test Models Accuracy

### Model: Random Forest Classifier

In [36]:
def evaluate_RFC_model_with_PCA(year, year_prediction):
    
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    print("Shape of the data before PCA to Year " + str(year) + ": ", train_data.shape)
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff'] 
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    print("Shape of the data after PCA to Year " + str(year) + ": ", X_pca.shape)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_pca, y)

    year_10_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_10_data_scaled = scaler.transform(year_10_data)  
    year_10_data_pca = pca.transform(year_10_data_scaled)

    predictions = model.predict(year_10_data_pca)
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')  
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Random Forest Classifier with PCA (n_components=0.95)")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    print(classification_report(real_values, predictions, target_names=['N', 'Y']))


evaluate_RFC_model_with_PCA(7, year_prediction)


Shape of the data before PCA to Year 7:  (88, 75)
Shape of the data after PCA to Year 7:  (88, 19)

Results for year 7:
Model: Random Forest Classifier with PCA (n_components=0.95)
Accuracy:  0.9285714285714286
Precision:  0.8888888888888888
Recall:  1.0
F1 Score:  0.9411764705882353

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.83      0.91         6
           Y       0.89      1.00      0.94         8

    accuracy                           0.93        14
   macro avg       0.94      0.92      0.93        14
weighted avg       0.94      0.93      0.93        14



### Model: Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

def evaluate_LR_model_with_PCA(year, year_prediction):
    print("Shape of the data: ", year_prediction.shape)
    
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff'] 

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

   # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_pca, y)

    year_10_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_10_data_scaled = scaler.transform(year_10_data) 
    year_10_data_pca = pca.transform(year_10_data_scaled)  

    # Fazer as previsões
    predictions = model.predict(year_10_data_pca)

    # Atualizar as previsões no dataframe
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = predictions

    # Valores reais para o cálculo das métricas
    real_values = teams[teams['year'] == year]['playoff']

    # Calcular as métricas
    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')  
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Logistic Regression with PCA and random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    print(classification_report(real_values, predictions, target_names=['N', 'Y']))


evaluate_LR_model_with_PCA(7, year_prediction)


Shape of the data:  (154, 75)

Results for year 7:
Model: Logistic Regression with PCA and random_state=42
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 Score:  1.0

Classification Report:
              precision    recall  f1-score   support

           N       1.00      1.00      1.00         6
           Y       1.00      1.00      1.00         8

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



### Model: SVM (Support Vector Machine)

In [38]:
from sklearn.svm import SVC

def evaluate_SVM_model_with_PCA(year, year_prediction):
    print("Shape of the data: ", year_prediction.shape)
    
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff'] 

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = SVC(random_state=42)
    model.fit(X_pca, y)

    year_10_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_10_data_scaled = scaler.transform(year_10_data)  
    year_10_data_pca = pca.transform(year_10_data_scaled) 

    predictions = model.predict(year_10_data_pca)

    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')  
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Support Vector Machine with PCA and random_state=42")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    print(classification_report(real_values, predictions, target_names=['N', 'Y']))


evaluate_SVM_model_with_PCA(7, year_prediction)

Shape of the data:  (154, 75)

Results for year 7:
Model: Support Vector Machine with PCA and random_state=42
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 Score:  1.0

Classification Report:
              precision    recall  f1-score   support

           N       1.00      1.00      1.00         6
           Y       1.00      1.00      1.00         8

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



### Model: KNN (K-Nearest Neighbors)

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_KNN_model_with_PCA(year, year_prediction):
    print("Shape of the data: ", year_prediction.shape)
    
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff'] 

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X_pca, y)

    year_10_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_10_data_scaled = scaler.transform(year_10_data)  
    year_10_data_pca = pca.transform(year_10_data_scaled)  

    predictions = model.predict(year_10_data_pca)

    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')  
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: K-Nearest Neighbors with PCA and n_neighbors=3")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    print(classification_report(real_values, predictions, target_names=['N', 'Y']))

evaluate_KNN_model_with_PCA(7, year_prediction)

Shape of the data:  (154, 75)

Results for year 7:
Model: K-Nearest Neighbors with PCA and n_neighbors=3
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 Score:  1.0

Classification Report:
              precision    recall  f1-score   support

           N       1.00      1.00      1.00         6
           Y       1.00      1.00      1.00         8

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



### Model: Decision Tree

In [40]:
from sklearn.tree import DecisionTreeClassifier

def evaluate_DTC_model_with_PCA(year, year_prediction):
    print("Shape of the data: ", year_prediction.shape)
    
    train_data = year_prediction[year_prediction['year'] <= (year-1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff'] 

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_pca, y)

    year_10_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_10_data_scaled = scaler.transform(year_10_data) 
    year_10_data_pca = pca.transform(year_10_data_scaled)  

    predictions = model.predict(year_10_data_pca)

    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = predictions

    real_values = teams[teams['year'] == year]['playoff']

    accuracy = accuracy_score(real_values, predictions)
    precision = precision_score(real_values, predictions, pos_label='Y')  
    recall = recall_score(real_values, predictions, pos_label='Y')
    f1 = f1_score(real_values, predictions, pos_label='Y')

    print("\nResults for year " + str(year) + ":")
    print("Model: Decision Tree Classifier with PCA")
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    print("\nClassification Report:")
    print(classification_report(real_values, predictions, target_names=['N', 'Y']))

evaluate_DTC_model_with_PCA(7, year_prediction)


Shape of the data:  (154, 75)

Results for year 7:
Model: Decision Tree Classifier with PCA
Accuracy:  0.8571428571428571
Precision:  0.8
Recall:  1.0
F1 Score:  0.8888888888888888

Classification Report:
              precision    recall  f1-score   support

           N       1.00      0.67      0.80         6
           Y       0.80      1.00      0.89         8

    accuracy                           0.86        14
   macro avg       0.90      0.83      0.84        14
weighted avg       0.89      0.86      0.85        14



# Predict Season 11 Playoffs

### Model: Random Forest Classifier

In [41]:
year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]

def predict_RFC_S11(year, year_prediction):

    train_data = year_prediction[year_prediction['year'] <= (year - 1)].dropna(subset=['playoff'])
    
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_pca, y)
    
    year_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_data_scaled = scaler.transform(year_data)  
    year_data_pca = pca.transform(year_data_scaled)
    
    probabilities = model.predict_proba(year_data_pca)
    probabilities_playoff = probabilities[:, 1] 

    predictions = (probabilities_playoff >= 0.5).astype(int)  
    playoff_labels = ['Y' if pred == 1 else 'N' for pred in predictions]
    
    year_prediction.loc[year_prediction['year'] == year, 'playoff_prob'] = probabilities_playoff
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = playoff_labels

    year_prediction = year_prediction[year_prediction['year'] == 11]
    year_prediction.to_csv('data/results/season11_RFC_predictions.csv', index=False)
    

predict_RFC_S11(11, year_prediction)


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

### Model: Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]

def predict_LR_S11(year, year_prediction):
    train_data = year_prediction[year_prediction['year'] <= (year - 1)].dropna(subset=['playoff'])
    
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_pca, y)
    
    year_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_data_scaled = scaler.transform(year_data)
    year_data_pca = pca.transform(year_data_scaled)
    
    probabilities = model.predict_proba(year_data_pca)
    probabilities_playoff = probabilities[:, 1] 

    predictions = (probabilities_playoff >= 0.5).astype(int)
    playoff_labels = ['Y' if pred == 1 else 'N' for pred in predictions]
    
    year_prediction.loc[year_prediction['year'] == year, 'playoff_prob'] = probabilities_playoff
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = playoff_labels
    
    year_prediction = year_prediction[year_prediction['year'] == 11]
    year_prediction.to_csv('data/results/season' + str(year) + '_LR_predictions.csv', index=False)
    
predict_LR_S11(11, year_prediction)


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

### Model: SVM (Support Vector Machine)

In [43]:
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]

def predict_SVM_S11(year, year_prediction):

    train_data = year_prediction[year_prediction['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    base_svm = SVC(random_state=42)
    model = CalibratedClassifierCV(base_svm)
    model.fit(X_pca, y)
    
    year_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_data_scaled = scaler.transform(year_data)
    year_data_pca = pca.transform(year_data_scaled)
    
    probabilities = model.predict_proba(year_data_pca)
    probabilities_playoff = probabilities[:, 1]
    
    predictions = (probabilities_playoff >= 0.5).astype(int)
    playoff_labels = ['Y' if pred == 1 else 'N' for pred in predictions]
    
    year_prediction.loc[year_prediction['year'] == year, 'playoff_prob'] = probabilities_playoff
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = playoff_labels
    
    year_prediction = year_prediction[year_prediction['year'] == 11]
    year_prediction.to_csv('data/results/season' + str(year) + '_SVM_predictions.csv', index=False)
    
predict_SVM_S11(11, year_prediction)


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

### Model: KNN (K-Nearest Neighbors)

In [44]:
from sklearn.neighbors import KNeighborsClassifier

year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]

def predict_KNN_S11(year, year_prediction):
    
    train_data = year_prediction[year_prediction['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X_pca, y)
    
    year_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_data_scaled = scaler.transform(year_data)
    year_data_pca = pca.transform(year_data_scaled)
    
    probabilities = model.predict_proba(year_data_pca)
    probabilities_playoff = probabilities[:, 1] 
    
    predictions = (probabilities_playoff >= 0.5).astype(int)
    playoff_labels = ['Y' if pred == 1 else 'N' for pred in predictions]
    
    year_prediction.loc[year_prediction['year'] == year, 'playoff_prob'] = probabilities_playoff
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = playoff_labels
    
    year_prediction = year_prediction[year_prediction['year'] == 11]
    year_prediction.to_csv(f'data/results/season{year}_KNN_predictions.csv', index=False)

predict_KNN_S11(11, year_prediction)


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

### Model: Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier

year_prediction = get_year_predictions(11)

year_prediction = year_prediction[~year_prediction['tmID'].str.contains("rookie_team_avg")]

def predict_DTC_S11(year, year_prediction):
    
    train_data = year_prediction[year_prediction['year'] <= (year - 1)].dropna(subset=['playoff'])
    X = train_data.drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    y = train_data['playoff']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA - With n_components=0.95, keep 95% of the variance
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)
    
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_pca, y)
    
    year_data = year_prediction[year_prediction['year'] == year].drop(columns=['playoff', 'year', 'tmID', 'franchID', 'confID', 'firstRound', 'semis', 'finals', 'rank'])
    year_data_scaled = scaler.transform(year_data)
    year_data_pca = pca.transform(year_data_scaled)
    
    probabilities = model.predict_proba(year_data_pca)
    probabilities_playoff = probabilities[:, 1]  
    
    predictions = (probabilities_playoff >= 0.5).astype(int)
    playoff_labels = ['Y' if pred == 1 else 'N' for pred in predictions]
    
    year_prediction.loc[year_prediction['year'] == year, 'playoff_prob'] = probabilities_playoff
    year_prediction.loc[year_prediction['year'] == year, 'playoff'] = playoff_labels
    
    year_prediction = year_prediction[year_prediction['year'] == 11]
    year_prediction.to_csv(f'data/results/season{year}_DTC_predictions.csv', index=False)
    
predict_DTC_S11(11, year_prediction)


   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  ATL      ATL     EA   2.0       Y          L   NaN    NaN  18.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.753642  0.304813     0.32089     0.67911  0.421498  0.773234  0.341509   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.300681    0.699319      64.0  

[1 rows x 75 columns]
TEAM COACHES         coachID  year tmID  stint  won  lost  winrate  TotalAwards
169  meadoma99w    11  ATL    NaN  NaN   NaN      NaN          NaN
TOTAL GAMES 0.0
   year tmID franchID confID  rank playoff firstRound semis finals   won  ...  \
0    11  CHI      CHI     EA   5.0       N        NaN   NaN    NaN  16.0  ...   

   o_ft_pct  o_3p_pct  o_oreb_pct  o_dreb_pct  d_fg_pct  d_ft_pct  d_3p_pct  \
0  0.760462  0.394904    0.283472    0.716528  0.442486  0.781403  0.347386   

   d_oreb_pct  d_dreb_pct  pts_diff  
0    0.310554    0.689446    -120.0  

[1 rows x 75 colu

### Clean the Season 11 Data Output

In [46]:
dtc_dataset = pd.read_csv('data/results/season11_DTC_predictions.csv')
knn_dataset = pd.read_csv('data/results/season11_KNN_predictions.csv')
lr_dataset = pd.read_csv('data/results/season11_LR_predictions.csv')
rfc_dataset = pd.read_csv('data/results/season11_RFC_predictions.csv')
svm_dataset = pd.read_csv('data/results/season11_SVM_predictions.csv')

dtc_dataset = dtc_dataset[['year','tmID', 'franchID', 'confID', 'playoff','playoff_prob']]
knn_dataset = knn_dataset[['year','tmID', 'franchID', 'confID', 'playoff','playoff_prob']]
lr_dataset = lr_dataset[['year','tmID', 'franchID', 'confID', 'playoff','playoff_prob']]
rfc_dataset = rfc_dataset[['year','tmID', 'franchID', 'confID', 'playoff','playoff_prob']]
svm_dataset = svm_dataset[['year','tmID', 'franchID', 'confID', 'playoff','playoff_prob']]

dtc_dataset.to_csv('data/results/DTC_cleaned.csv', index=False)
knn_dataset.to_csv('data/results/KNN_cleaned.csv', index=False)
lr_dataset.to_csv('data/results/LR_cleaned.csv', index=False)
rfc_dataset.to_csv('data/results/RFC_cleaned.csv', index=False)
svm_dataset.to_csv('data/results/SVM_cleaned.csv', index=False)


### Submission File - Season 11

In [47]:
dtc_dataset = pd.read_csv('data/results/DTC_cleaned.csv')
data_sorted_dtc = dtc_dataset.sort_values(by='tmID')
data_sorted_dtc["Playoff"] = data_sorted_dtc["playoff_prob"].apply(lambda x: f"{x:.2f}")
submission_dtc = data_sorted_dtc[["tmID", "Playoff"]]
submission_dtc.to_csv("data/results/submission/DTC_submission.csv", index=False)

knn_dataset = pd.read_csv('data/results/KNN_cleaned.csv')
data_sorted_knn = knn_dataset.sort_values(by='tmID')
data_sorted_knn["Playoff"] = data_sorted_knn["playoff_prob"].apply(lambda x: f"{x:.2f}")
submission_knn = data_sorted_knn[["tmID", "Playoff"]]
submission_knn.to_csv("data/results/submission/KNN_submission.csv", index=False)

lr_dataset = pd.read_csv('data/results/LR_cleaned.csv')
data_sorted_lr = lr_dataset.sort_values(by='tmID')
data_sorted_lr["Playoff"] = data_sorted_lr["playoff_prob"].apply(lambda x: f"{x:.2f}")
submission_lr = data_sorted_lr[["tmID", "Playoff"]]
submission_lr.to_csv("data/results/submission/LR_submission.csv", index=False)

rfc_dataset = pd.read_csv('data/results/RFC_cleaned.csv')
data_sorted_rfc = rfc_dataset.sort_values(by='tmID')
data_sorted_rfc["Playoff"] = data_sorted_rfc["playoff_prob"].apply(lambda x: f"{x:.2f}")
submission_rfc = data_sorted_rfc[["tmID", "Playoff"]]
submission_rfc.to_csv("data/results/submission/RFC_submission.csv", index=False)

svm_dataset = pd.read_csv('data/results/SVM_cleaned.csv')
data_sorted_svm = svm_dataset.sort_values(by='tmID')
data_sorted_svm["Playoff"] = data_sorted_svm["playoff_prob"].apply(lambda x: f"{x:.2f}")
submission_svm = data_sorted_svm[["tmID", "Playoff"]]
submission_svm.to_csv("data/results/submission/SVM_submission.csv", index=False)


