# Prepare predictions for the 11th year

## Cleaning data on 11th year files

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

coaches_11th = pd.read_csv('../data/Season_11/coaches.csv')
players_teams_11th = pd.read_csv('../data/Season_11/players_teams.csv')
teams_11th = pd.read_csv('../data/Season_11/teams.csv')

In [16]:
# coaches_11th = coaches_11th.drop(columns=['stint', 'lgID', 'year'])
# players_teams_11th = players_teams_11th.drop(columns=['stint', 'lgID', 'year'])
# teams_11th = teams_11th.drop(columns=['lgID', 'franchID', 'year', 'name', 'arena'])

# coaches_11th.to_csv('../data/Season_11/coaches.csv', index=False)
# players_teams_11th.to_csv('../data/Season_11/players_teams.csv', index=False)
# teams_11th.to_csv('../data/Season_11/teams.csv', index=False)

## Maping categorical values

In [17]:
# map_teams = {
#     'ATL': 0, 'CHI': 2, 'CON': 4, 'IND': 7, 
#     'LAS': 8, 'MIN': 10, 'NYL': 11, 'PHO': 13, 
#     'SAS': 16, 'SEA': 17, 'WAS': 19, 'TUL': 20
# }

# map_conf = {
#   'EA': 0, 'WE': 1
# }

# coaches_11th['tmID'] = coaches_11th['tmID'].map(map_teams)
# players_teams_11th['tmID'] = players_teams_11th['tmID'].map(map_teams)
# teams_11th['tmID'] = teams_11th['tmID'].map(map_teams)
# teams_11th['confID'] = teams_11th['confID'].map(map_conf)

# coaches_11th.to_csv('../data/Season_11/coaches.csv', index=False)
# players_teams_11th.to_csv('../data/Season_11/players_teams.csv', index=False)
# teams_11th.to_csv('../data/Season_11/teams.csv', index=False)

## Import data from the past 10 years

In [18]:
awards_players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/awards_players_cleaned.csv')
coaches_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/coaches_cleaned.csv')
players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_cleaned.csv')
players_teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv')
series_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/series_post_cleaned.csv')
teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_cleaned.csv')
teams_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_post_cleaned.csv')

In [19]:
awards_players_10_years = awards_players_cleaned[awards_players_cleaned['year'] != 11]
coaches__10_years = coaches_cleaned[coaches_cleaned['year'] != 11]
players_teams_10_years = players_teams_cleaned[players_teams_cleaned['year'] != 11]
series_post_10_years = series_post_cleaned[series_post_cleaned['year'] != 11]
teams_10_years = teams_cleaned[teams_cleaned['year'] != 11]
teams_post_10_years = teams_post_cleaned[teams_post_cleaned['year'] != 11]

for df, name in [(awards_players_10_years, 'awards'), 
                 (coaches__10_years, 'coaches'),
                 (players_teams_10_years, 'players'),
                 (series_post_10_years, 'series'),
                 (teams_10_years, 'teams'),
                 (teams_post_10_years, 'teams_post')]:
    if df['year'].max() != 10:
        print(f"Warning: {name} contains data beyond year 10")

awards_players_10_years.to_csv('../data/Season_11/awards_players_10_years.csv', index=False)
coaches__10_years.to_csv('../data/Season_11/coaches_10_years.csv', index=False)
players_cleaned.to_csv('../data/Season_11/players_10_years.csv', index=False)
players_teams_10_years.to_csv('../data/Season_11/players_teams_10_years.csv', index=False)
series_post_10_years.to_csv('../data/Season_11/series_post_10_years.csv', index=False)
teams_10_years.to_csv('../data/Season_11/teams_10_years.csv', index=False)
teams_post_10_years.to_csv('../data/Season_11/teams_post_10_years.csv', index=False)

## Overalls from 10 years

In [20]:
players_overall_avg = players_teams_10_years.groupby('playerID')['OVERALL'].mean().reset_index()

players_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

players_overall_avg['OVERALL_ALL_TIME'] = players_overall_avg['OVERALL_ALL_TIME'].round(1)

players_overall_avg.to_csv('../data/Season_11/players_overall_all_time_10_years.csv', index=False)

In [21]:
rookie_players = players_teams_10_years[players_teams_10_years['is_rookie'] == 1]

rookie_overall_avg = rookie_players['OVERALL'].mean().round(1)

rookie_overall_avg_df = pd.DataFrame({'rookie_overall_avg': [rookie_overall_avg]})

rookie_overall_avg_df.to_csv('../data/Season_11/rookie_overall_avg_10_years.csv', index=False)

In [22]:
rookie_coaches = coaches__10_years[coaches__10_years['is_rookie'] == 1]

rookie_overall_coaches_avg = rookie_coaches['OVERALL'].mean().round(1)

rookie_overall_coaches_avg_df = pd.DataFrame({'rookie_overall_coaches_avg': [rookie_overall_coaches_avg]})

rookie_overall_coaches_avg_df.to_csv('../data/Season_11/rookie_overall_coaches_avg_10_years.csv', index=False)

In [23]:
coaches_overall_avg = coaches__10_years.groupby('coachID')['OVERALL'].mean().reset_index()

coaches_overall_avg.rename(columns={'OVERALL': 'OVERALL_ALL_TIME'}, inplace=True)

coaches_overall_avg['OVERALL_ALL_TIME'] = coaches_overall_avg['OVERALL_ALL_TIME'].round(1)

coaches_overall_avg.to_csv('../data/Season_11/coaches_overall_all_time_10_years.csv', index=False)

# Add overalls calculated from 10 years to the 11th year players and coach

In [24]:
# players_teams = pd.read_csv('../data/Season_11/players_teams.csv')
# players_overall_all_time_10_years = pd.read_csv('../data/Season_11/players_overall_all_time_10_years.csv')

# rookie_avg = 5.6 

# players_teams = players_teams.merge(
#     players_overall_all_time_10_years[['playerID', 'OVERALL_ALL_TIME']],
#     on='playerID',
#     how='left'
# ).fillna({'OVERALL_ALL_TIME': rookie_avg})

# players_teams.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

# players_teams.to_csv('../data/Season_11/players_teams.csv', index=False)

In [25]:
# coaches = pd.read_csv('../data/Season_11/coaches.csv')
# coaches_overall_all_time_10_years = pd.read_csv('../data/Season_11/coaches_overall_all_time_10_years.csv')

# rookie_avg_coach = 9.0 

# coaches = coaches.merge(
#     coaches_overall_all_time_10_years[['coachID', 'OVERALL_ALL_TIME']],
#     on='coachID',
#     how='left'
# ).fillna({'OVERALL_ALL_TIME': rookie_avg_coach})

# coaches.rename(columns={'OVERALL_ALL_TIME': 'OVERALL'}, inplace=True)

# coaches.to_csv('../data/Season_11/coaches.csv', index=False)

## Given the teams players composition and their respective overalls calculate the team overall (mean of all players)

In [26]:
players = pd.read_csv('../data/Season_11/players_teams.csv')

team_overall = players.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall['overall_team'] = team_overall['OVERALL'].round(1)

team_overall = team_overall.drop(columns=['OVERALL'])

team_overall.to_csv('../data/Season_11/team_overall.csv', index=False)

## Given the team coaches and their respective overalls calculate the coach overall (mean of all coaches(1 or more than 1))

In [27]:
coaches = pd.read_csv('../data/Season_11/coaches.csv')

team_overall_coaches = coaches.groupby('tmID')['OVERALL'].mean().reset_index()

team_overall_coaches['overall_team_coach'] = team_overall_coaches['OVERALL'].round(1)

team_overall_coaches = team_overall_coaches.drop(columns=['OVERALL'])

team_overall_coaches.to_csv('../data/Season_11/team_overall_coaches.csv', index=False)

## Models

In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Load data
teams_10_years = pd.read_csv('../data/Season_11/teams_10_years.csv')
players_teams_10_years = pd.read_csv('../data/Season_11/players_teams_10_years.csv')
coaches_10_years = pd.read_csv('../data/Season_11/coaches_10_years.csv')
teams_year_11 = pd.read_csv('../data/Season_11/teams.csv')
players_teams_11 = pd.read_csv('../data/Season_11/players_teams.csv')
coaches_11 = pd.read_csv('../data/Season_11/coaches.csv')

def create_features_for_team(team_data, teams_history, players_teams_data, coaches_data, year):
    features = {
        'won': team_data['won'] if 'won' in team_data else 0,
        'lost': team_data['lost'] if 'lost' in team_data else 0,
        'o_pts': team_data['o_pts'] if 'o_pts' in team_data else 0,
        'd_pts': team_data['d_pts'] if 'd_pts' in team_data else 0,
        'o_reb': team_data['o_reb'] if 'o_reb' in team_data else 0,
        'd_reb': team_data['d_reb'] if 'd_reb' in team_data else 0,
        'confID': team_data['confID']
    }
    
    # Verifique se há histórico para a equipe
    team_history_data = teams_history[
        (teams_history['tmID'] == team_data['tmID']) & 
        (teams_history['year'] <= year)
    ].sort_values('year')
    
    if len(team_history_data) > 0:
        # Se a equipe tem dados históricos, calcule as médias de 3 anos
        features['win_rate_3yr'] = team_history_data['won'].tail(3).mean()
        features['playoff_rate_3yr'] = team_history_data['playoff'].tail(3).mean()
        features['points_diff_3yr'] = (team_history_data['o_pts'] - team_history_data['d_pts']).tail(3).mean()
    else:
        # Se a equipe não tem dados históricos (nova equipe), atribua valores padrão baixos
        features['win_rate_3yr'] = 0
        features['playoff_rate_3yr'] = 0
        features['points_diff_3yr'] = 0
    
    # Dados do jogador
    team_players = players_teams_data[
        players_teams_data['tmID'] == team_data['tmID']
    ]
    features['player_overall_avg'] = team_players['OVERALL'].mean() if len(team_players) > 0 else 0
    
    # Dados do treinador
    team_coach = coaches_data[
        coaches_data['tmID'] == team_data['tmID']
    ]
    features['coach_overall'] = team_coach['OVERALL'].mean() if len(team_coach) > 0 else 0
    
    return list(features.values())

# Prepare training data (years 1-10)
X_train = []
y_train = []

for year in range(1, 10):
    current_year_teams = teams_10_years[teams_10_years['year'] == year]
    next_year_teams = teams_10_years[teams_10_years['year'] == year + 1]
    
    for _, team in current_year_teams.iterrows():
        if team['tmID'] in next_year_teams['tmID'].values:
            features = create_features_for_team(
                team, 
                teams_10_years, 
                players_teams_10_years[players_teams_10_years['year'] == year],
                coaches_10_years[coaches_10_years['year'] == year],
                year
            )
            X_train.append(features)
            next_year_playoff = next_year_teams[next_year_teams['tmID'] == team['tmID']]['playoff'].iloc[0]
            y_train.append(next_year_playoff)

# Add year 10 data for training
year_10_teams = teams_10_years[teams_10_years['year'] == 10]
for _, team in year_10_teams.iterrows():
    features = create_features_for_team(
        team,
        teams_10_years,
        players_teams_10_years[players_teams_10_years['year'] == 10],
        coaches_10_years[coaches_10_years['year'] == 10],
        10
    )
    X_train.append(features)
    y_train.append(team['playoff'])

X_train = np.array(X_train)
y_train = np.array(y_train)

# Prepare prediction data for year 11
X_predict = []
for _, team in teams_year_11.iterrows():
    features = create_features_for_team(
        team,
        teams_10_years,
        players_teams_11,
        coaches_11,
        11
    )
    X_predict.append(features)
X_predict = np.array(X_predict)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_predict_scaled = scaler.transform(X_predict)

# Initialize models with same parameters as year 10
models = {
    'SVM': SVC(kernel='rbf', probability=True, C=0.5, random_state=42),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=3,
        min_samples_leaf=4,
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        min_samples_split=5,
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        C=0.8,
        max_iter=1000, 
        random_state=42
    ),
    'KNN': KNeighborsClassifier(n_neighbors=7)
}

# Train models and get predictions
model_predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Get training accuracy
    train_accuracy = accuracy_score(y_train, model.predict(X_train_scaled)) * 100
    
    # Get probabilities for year 11
    probs = model.predict_proba(X_predict_scaled)[:, 1]
    
    probs_adjusted = 8 * (1 - probs) / np.sum(1 - probs)

    # Store results
    model_predictions[name] = {
        'training_accuracy': train_accuracy,
        'probabilities': dict(zip(teams_year_11['tmID'], probs_adjusted))
    }
    
    print(f"Training Accuracy: {train_accuracy:.2f}%")
    print(f"Sum of probabilities: {np.sum(probs_adjusted):.2f}")

# Find best model based on training accuracy
best_model = max(model_predictions.items(), key=lambda x: x[1]['training_accuracy'])
print(f"\nBest Model: {best_model[0]}")
print(f"Training Accuracy: {best_model[1]['training_accuracy']:.2f}%")

# Create final predictions CSV with the best model's predictions
final_predictions = pd.DataFrame({
    'tmID': teams_year_11['tmID'],
    'Playoff': [best_model[1]['probabilities'][tmID] for tmID in teams_year_11['tmID']]
})

# Sort by tmID and format probabilities
final_predictions = final_predictions.sort_values('tmID')
final_predictions['Playoff'] = final_predictions['Playoff'].apply(lambda x: "{:.2f}".format(x))

# Save predictions
final_predictions.to_csv('../data/Season_11/playoff_predictions.csv', index=False)

print("\nFinal predictions (sorted by tmID):")
print(final_predictions.to_string(index=False))

# Verify sum of probabilities
prob_sum = sum(float(x) for x in final_predictions['Playoff'])
print(f"\nSum of probabilities: {prob_sum:.2f}")


Training SVM...
Training Accuracy: 76.30%
Sum of probabilities: 8.00

Training Decision Tree...
Training Accuracy: 78.52%
Sum of probabilities: 8.00

Training Random Forest...
Training Accuracy: 93.33%
Sum of probabilities: 8.00

Training Logistic Regression...
Training Accuracy: 71.11%
Sum of probabilities: 8.00

Training KNN...
Training Accuracy: 73.33%
Sum of probabilities: 8.00

Best Model: Random Forest
Training Accuracy: 93.33%

Final predictions (sorted by tmID):
 tmID Playoff
    0    0.57
    2    0.57
    4    0.76
    7    0.76
    8    0.63
   10    0.52
   11    0.70
   13    0.77
   16    0.73
   17    0.75
   19    0.63
   20    0.62

Sum of probabilities: 8.01


In [41]:
map_teams = {
    'ATL': 0, 'CHI': 2, 'CON': 4, 'IND': 7, 
    'LAS': 8, 'MIN': 10, 'NYL': 11, 'PHO': 13, 
    'SAS': 16, 'SEA': 17, 'WAS': 19, 'TUL': 20
}

reverse_map_teams = {v: k for k, v in map_teams.items()}

final_predictions = pd.read_csv('../data/Season_11/playoff_predictions.csv')

final_predictions['tmID'] = final_predictions['tmID'].map(reverse_map_teams)
final_predictions = final_predictions.sort_values(by='tmID')

final_predictions.to_csv('../data/Season_11/playoff_predictions.csv', index=False)

# Predicitons using only overalls

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the data
data_team = pd.read_csv('../data/Season_11/team_overall.csv')  # Player averages
data_coach = pd.read_csv('../data/Season_11/team_overall_coaches.csv')  # Coach averages
data_conference = pd.read_csv('../data/Season_11/teams.csv')  # Conferences

# Combine the data
data = data_team.merge(data_coach, on="tmID").merge(data_conference, on="tmID")

# Give more weight to the overall_team (2x)
data["weighted_overall"] = 2 * data["overall_team"] + data["overall_team_coach"]

# Create the target variable (1 for the top 4 teams in each conference)
data["playoff"] = 0
for conf in data["confID"].unique():
    conf_teams = data[data["confID"] == conf]
    top_teams = conf_teams.nlargest(4, "weighted_overall")
    data.loc[top_teams.index, "playoff"] = 1

# Normalize "weighted_overall" by conference
data["conf_weighted_overall"] = data.groupby("confID")["weighted_overall"].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)

# Separate independent and dependent variables
X = data[["overall_team", "overall_team_coach", "confID"]]
y = data["playoff"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Create a pipeline with standardization and SVM
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(probability=True, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)

# Display the validation, now with adjusted probabilities
validation = data[["tmID", "confID", "weighted_overall", "playoff"]].sort_values(
    by=["confID"], ascending=[True]
)

# Display the validation
print("\nConference classification:")
print(validation)


Conference classification:
    tmID  confID  weighted_overall  playoff
0      0       0              21.8        0
1      2       0              23.2        0
2      4       0              26.0        1
3      7       0              27.7        1
6     11       0              26.7        1
10    19       0              23.5        1
5     10       1              29.0        1
4      8       1              27.4        1
7     13       1              29.6        1
8     16       1              26.3        1
9     17       1              25.8        0
11    20       1              22.6        0
