# Test Data Prediction

In this notebook we generate the predictions for the three tasks (individual awards, coach turnover, and team rankings) on the test-year data (year 11),

using the best model found for each prediction task.



## Import Libraries and Setup

In [27]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge

import warnings
warnings.filterwarnings("ignore")

## 1. Awards Predictions for Year 11

Based on the analysis, we use the best model for each award:
- **MVP**: Logistic Regression
- **DPOTY**: Logistic Regression
- **ROTY**: CatBoostClassifier
- **MIP**: XGBClassifier
- **KPSA**: CatBoostClassifier
- **SWOTY**: CatBoostClassifier
- **FMVP**: Logistic Regression
- **ASGMVP**: Logistic Regression
- **COTY**: XGBClassifier

In [28]:
def predict_award_winner(year, award_name, features, target, df, model_type='logistic'):
    """
    Rank the candidates of one season by predicted probability of winning an award.

    A binary classifier is fitted on the rows whose ``year`` lies in
    ``year-4 .. year-1`` and then scored on the rows of ``year``.

    Parameters
    ----------
    year : int
        Season to predict; the four preceding seasons form the training set.
    award_name : str
        Label used only to name the intermediate ``<award_name>_prob`` column.
    features : list of str
        Feature column names in ``df``.
    target : str
        Name of the 0/1 label column in ``df``.
    df : pandas.DataFrame
        Must contain ``year``, the feature and target columns, and an
        identifier column (``playerID`` if present, otherwise ``coachID``).
    model_type : {'logistic', 'xgb', 'catboost'}
        Which classifier to fit.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Probability``, ``Rank`` holding at most ten rows,
        ordered by dense rank of the predicted probability (1 = most likely).
        The frame is empty when there are no usable training rows or no rows
        for ``year`` (previously the no-training case returned a 3-tuple of
        ``None``, which no caller in this notebook could consume).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    supported_models = ('logistic', 'xgb', 'catboost')
    if model_type not in supported_models:
        # Fail early with a clear message instead of a NameError on `model` later.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )

    features = list(features)
    result_columns = ['ID', 'Probability', 'Rank']

    test_df = df[df['year'] == year].copy()
    train_df = df[df['year'].isin(list(range(year - 4, year)))]

    # Nothing to learn from, or nothing to score: return an empty table of the
    # usual shape so callers can treat every outcome uniformly.
    train_clean = train_df.dropna(subset=features + [target])
    if len(train_clean) == 0 or len(test_df) == 0:
        return pd.DataFrame(columns=result_columns)

    # Build the estimator. Logistic regression is wrapped in a scaling pipeline;
    # the tree-based models are used on the raw features.
    if model_type == 'logistic':
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(
                C=1.0, penalty='l2', solver='lbfgs',
                max_iter=1000, class_weight='balanced'
            )),
        ])
    elif model_type == 'xgb':
        estimator = XGBClassifier(
            n_estimators=500, learning_rate=0.05, max_depth=5,
            subsample=0.8, colsample_bytree=0.7, eval_metric='logloss'
        )
    else:  # 'catboost'
        estimator = CatBoostClassifier(
            depth=4, learning_rate=0.05, iterations=500,
            loss_function="Logloss", verbose=False
        )

    estimator.fit(train_clean[features], train_clean[target])
    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = estimator.predict_proba(test_df[features])[:, 1]

    prob_col = award_name + '_prob'
    test_df[prob_col] = y_prob
    # Dense ranking: tied probabilities share a rank and no rank is skipped.
    test_df['rank'] = test_df[prob_col].rank(ascending=False, method='dense')

    # Player datasets carry `playerID`; the coach dataset carries `coachID`.
    id_col = 'playerID' if 'playerID' in test_df.columns else 'coachID'

    top10 = test_df.nsmallest(10, 'rank')[[id_col, prob_col, 'rank']].copy()
    top10.columns = result_columns
    return top10

### 1.1 MVP Prediction (Logistic Regression)

In [29]:
# Inputs for the MVP model: feature columns, label column, and the dataset
# (missing values replaced by 0 on load).
mvp_params = dict(
    features=['overall_score_prev_1yr', 'overall_score_prev_2yr', 'overall_score_prev_3yr'],
    target='mvp',
    df=pd.read_csv('../predict_datasets/mvp.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
mvp_top3 = predict_award_winner(
    11, 'mvp',
    mvp_params['features'], mvp_params['target'], mvp_params['df'],
    model_type='logistic',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'MVP PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(mvp_top3.to_string(index=False), end='\n\n')

MVP PREDICTION FOR YEAR 11
        ID  Probability  Rank
jacksla01w     0.907363   1.0
catchta01w     0.905549   2.0
swoopsh01w     0.863375   3.0
pierspl01w     0.795749   4.0
thompti01w     0.775144   5.0
milleke01w     0.770884   6.0
taylope01w     0.754751   7.0
pricear01w     0.749766   8.0
augusse01w     0.749310   9.0
whaleli01w     0.746111  10.0



### 1.2 DPOTY Prediction (Logistic Regression)

In [30]:
# Inputs for the Defensive Player of the Year model: feature columns, label
# column, and the dataset (missing values replaced by 0 on load).
dpoty_params = dict(
    features=['defense_score_prev_1yr', 'defense_score_prev_2yr', 'defense_score_prev_3yr'],
    target='defensive',
    df=pd.read_csv('../predict_datasets/defensive.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
dpoty_top3 = predict_award_winner(
    11, 'defensive',
    dpoty_params['features'], dpoty_params['target'], dpoty_params['df'],
    model_type='logistic',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'DPOTY PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(dpoty_top3.to_string(index=False), end='\n\n')

DPOTY PREDICTION FOR YEAR 11
        ID  Probability  Rank
parkeca01w     0.900995   1.0
catchta01w     0.878657   2.0
jacksla01w     0.835882   3.0
dupreca01w     0.766746   4.0
anosini01w     0.389660   5.0
brunsre01w     0.381818   6.0
fowlesy01w     0.381558   7.0
lyttlsa01w     0.337236   8.0
mcwilta01w     0.325594   9.0
swoopsh01w     0.267842  10.0



### 1.3 ROTY Prediction (CatBoostClassifier)

In [31]:
# Inputs for the Rookie of the Year model: feature columns, label column, and
# the dataset (missing values replaced by 0 on load).
roty_params = dict(
    features=['tmID', 'college', 'team_prev_rank', 'college_count_before'],
    target='rookie',
    df=pd.read_csv('../predict_datasets/rookies.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
roty_top3 = predict_award_winner(
    11, 'rookie',
    roty_params['features'], roty_params['target'], roty_params['df'],
    model_type='catboost',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'ROTY PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(roty_top3.to_string(index=False), end='\n\n')

ROTY PREDICTION FOR YEAR 11
        ID  Probability  Rank
bjorkan01w     0.517443   1.0
charlti01w     0.030449   2.0
greenka01w     0.030449   2.0
hightal01w     0.010768   3.0
moorema01w     0.009487   4.0
brelaje01w     0.003721   5.0
thomaja01w     0.002505   6.0
chriska02w     0.002417   7.0
cheekjo01w     0.002300   8.0
chriska02w     0.002040   9.0



### 1.4 MIP Prediction (XGBClassifier)

In [32]:
# Inputs for the Most Improved Player model: feature columns, label column,
# and the dataset (missing values replaced by 0 on load).
mip_params = dict(
    features=[
        'overall_score_prev_1yr',
        'overall_score_prev_2yr',
        'overall_score_prev_3yr',
        'minutes_category',
    ],
    target='improved',
    df=pd.read_csv('../predict_datasets/mip.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
mip_top3 = predict_award_winner(
    11, 'mip',
    mip_params['features'], mip_params['target'], mip_params['df'],
    model_type='xgb',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'MIP PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(mip_top3.to_string(index=False), end='\n\n')

MIP PREDICTION FOR YEAR 11
        ID  Probability  Rank
parisco01w     0.029776   1.0
januabr01w     0.029305   2.0
 holtam01w     0.023582   3.0
ajavoma01w     0.021779   4.0
snellbe01w     0.014201   5.0
sanfona01w     0.009654   6.0
bonnede01w     0.008975   7.0
mccouan01w     0.008975   7.0
montgre01w     0.008975   7.0
zellosh01w     0.008975   7.0



### 1.5 KPSA Prediction (CatBoostClassifier)

In [33]:
# Inputs for the KPSA (sportsmanship award) model: feature columns, label
# column, and the dataset (missing values replaced by 0 on load).
kpsa_params = dict(
    features=['prev_score_1', 'prev_score_2', 'prev_score_3', 'prev_attend'],
    target='sportsmanship',
    df=pd.read_csv('../predict_datasets/kpsa.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
kpsa_top3 = predict_award_winner(
    11, 'kpsa',
    kpsa_params['features'], kpsa_params['target'], kpsa_params['df'],
    model_type='catboost',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'KPSA PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(kpsa_top3.to_string(index=False), end='\n\n')

KPSA PREDICTION FOR YEAR 11
        ID  Probability  Rank
hammobe01w     0.062858   1.0
bobbish01w     0.026365   2.0
thompti01w     0.010938   3.0
thorner01w     0.005622   4.0
dupreca01w     0.005574   5.0
quinnno01w     0.005574   6.0
mazzake01w     0.004771   7.0
 cashsw01w     0.004769   8.0
hodgero01w     0.004161   9.0
 birdsu01w     0.003603  10.0



### 1.6 SWOTY Prediction (CatBoostClassifier)

In [None]:
# Inputs for the Sixth Woman of the Year model: feature columns, label column,
# and the dataset (missing values replaced by 0 on load).
swoty_params = dict(
    features=[
        'overall_score_prev_1yr',
        'overall_score_prev_2yr',
        'overall_score_prev_3yr',
        'GS_category',
    ],
    target='sixth',
    df=pd.read_csv('../predict_datasets/swoty.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
swoty_top3 = predict_award_winner(
    11, 'sixth',
    swoty_params['features'], swoty_params['target'], swoty_params['df'],
    model_type='catboost',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'SWOTY PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(swoty_top3.to_string(index=False), end='\n\n')

SWOTY PREDICTION FOR YEAR 11
        ID  Probability  Rank
kraayca01w     0.123037   1.0
 cashsw01w     0.028527   2.0
willile01w     0.022942   3.0
beviltu01w     0.018786   4.0
currimo01w     0.012036   5.0
cantydo01w     0.007754   6.0
penicti01w     0.006629   7.0
swoopsh01w     0.002853   8.0
milleke01w     0.002219   9.0
carsoes01w     0.001944  10.0



### 1.7 FMVP Prediction (Logistic Regression)

In [35]:
# Inputs for the Finals MVP model: player- and team-level feature columns,
# label column, and the dataset (missing values replaced by 0 on load).
fmvp_params = dict(
    features=[
        'PrevPerformance',
        'Performance_weighted_2yr',
        'Performance_weighted_3yr',
        'Performance_weighted_4yr',
        'team_PrevPerformance',
        'team_Performance_weighted_2yr',
        'team_Performance_weighted_3yr',
        'team_Performance_weighted_4yr',
    ],
    target='finals_mvp',
    df=pd.read_csv('../predict_datasets/finals_mvp.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
fmvp_top3 = predict_award_winner(
    11, 'finals_mvp',
    fmvp_params['features'], fmvp_params['target'], fmvp_params['df'],
    model_type='logistic',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'FMVP PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(fmvp_top3.to_string(index=False), end='\n\n')

FMVP PREDICTION FOR YEAR 11
        ID  Probability  Rank
mccouan01w     0.617974   1.0
anosini01w     0.491309   2.0
catchta01w     0.423408   3.0
jacksla01w     0.421527   4.0
tauradi01w     0.414092   5.0
augusse01w     0.378879   6.0
bonnede01w     0.373992   7.0
parkeca01w     0.268274   8.0
pondeca01w     0.256420   9.0
hammobe01w     0.231951  10.0



### 1.8 ASGMVP Prediction (Logistic Regression)

In [36]:
# Inputs for the All-Star Game MVP model: feature columns, label column, and
# the dataset (missing values replaced by 0 on load).
asgmvp_params = dict(
    features=['overall_score_prev_1yr', 'overall_score_prev_2yr', 'overall_score_prev_3yr'],
    target='allstar_mvp',
    df=pd.read_csv('../predict_datasets/all-star_game_mvp.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
asgmvp_top3 = predict_award_winner(
    11, 'allstar_mvp',
    asgmvp_params['features'], asgmvp_params['target'], asgmvp_params['df'],
    model_type='logistic',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'ASGMVP PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(asgmvp_top3.to_string(index=False), end='\n\n')

ASGMVP PREDICTION FOR YEAR 11
        ID  Probability  Rank
jacksla01w     0.993183   1.0
catchta01w     0.988842   2.0
tauradi01w     0.981688   3.0
augusse01w     0.973069   4.0
youngso01w     0.950470   5.0
dupreca01w     0.946052   6.0
pondeca01w     0.943036   7.0
douglka01w     0.929293   8.0
whaleli01w     0.923183   9.0
hammobe01w     0.918077  10.0



### 1.9 COTY Prediction (XGBClassifier)

In [37]:
# Inputs for the Coach of the Year model: feature columns, label column, and
# the dataset (missing values replaced by 0 on load).
coty_params = dict(
    features=[
        'coach_tenure',
        'win_rate_prev_team_1yr',
        'win_rate_prev_team_2yr',
        'win_rate_prev_coach_1yr',
        'win_rate_prev_coach_2yr',
        'change_rate_prev',
    ],
    target='coach_of_the_year',
    df=pd.read_csv('../predict_datasets/coty.csv').fillna(0),
)

# Despite the `_top3` name, this is the full ranking table returned by
# predict_award_winner.
coty_top3 = predict_award_winner(
    11, 'coach',
    coty_params['features'], coty_params['target'], coty_params['df'],
    model_type='xgb',
)

# Banner, then the table followed by one blank line.
print('=' * 50, 'COTY PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
print(coty_top3.to_string(index=False), end='\n\n')

COTY PREDICTION FOR YEAR 11
        ID  Probability  Rank
meadoma99w     0.770409   1.0
thibami99w     0.284675   2.0
richano99w     0.228029   3.0
aglerbr99w     0.112508   4.0
hugheda99w     0.088486   5.0
dunnli99wc     0.045420   6.0
gaineco01w     0.041364   7.0
chatmda99w     0.023973   8.0
whisejo99w     0.008486   9.0
gilloje01w     0.004885  10.0



## 2. Coach Turnover Prediction for Year 11

Predicting which teams will change coaches in year 11.

In [38]:
def predict_coach_turnover(year, threshold=0.25):
    """
    Predict which teams change coach in ``year``.

    An XGBoost classifier is fitted on the rows of
    ``../predict_datasets/coaches_turnover.csv`` whose ``year`` lies in
    ``year-4 .. year-1`` and scored on the rows of ``year``. A team is flagged
    when its predicted probability of the ``change`` label is strictly greater
    than ``threshold``.

    Parameters
    ----------
    year : int
        Season to predict.
    threshold : float, default 0.25
        Probability cut-off above which a coaching change is predicted.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Team`` and ``Probability`` for the flagged teams, sorted by
        descending probability (empty when no team is flagged or when the
        dataset has no rows for ``year``). ``None`` when there are no usable
        training rows.
    """
    df = pd.read_csv('../predict_datasets/coaches_turnover.csv').fillna(0)
    test_df = df[df['year'] == year].copy()
    train_df = df[df['year'].isin(list(range(year - 4, year)))]

    features = ['coach_tenure', 'win_rate_prev_team_1yr', 'win_rate_prev_team_2yr',
                'win_rate_prev_coach_1yr', 'win_rate_prev_coach_2yr', 'change_rate_prev']
    target = 'change'

    train_clean = train_df.dropna(subset=features + [target])
    if len(train_clean) == 0:
        return None

    # No rows to score: report "no turnovers" instead of handing an empty
    # matrix to the classifier.
    if len(test_df) == 0:
        return pd.DataFrame(columns=['Team', 'Probability'])

    model = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=5,
                          subsample=0.8, colsample_bytree=0.7, eval_metric='logloss')
    model.fit(train_clean[features], train_clean[target])
    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = model.predict_proba(test_df[features])[:, 1]

    test_df['turnover_prob'] = y_prob
    test_df['predicted_change'] = (y_prob > threshold).astype(int)

    flagged = test_df['predicted_change'] == 1
    turnover_teams = test_df.loc[flagged, ['tmID', 'turnover_prob']].copy()
    turnover_teams = turnover_teams.sort_values('turnover_prob', ascending=False)
    turnover_teams.columns = ['Team', 'Probability']
    return turnover_teams

# Run the turnover model for the test season and report the flagged teams.
turnover_pred = predict_coach_turnover(11)

print('=' * 50, 'COACH TURNOVER PREDICTION FOR YEAR 11', '=' * 50, sep='\n')
# A None result or an empty table both mean there is nothing to list.
if turnover_pred is None or len(turnover_pred) == 0:
    print('No coach turnovers predicted for year 11')
else:
    print(turnover_pred.to_string(index=False))
print()

COACH TURNOVER PREDICTION FOR YEAR 11
No coach turnovers predicted for year 11



## 3. Team Ranking Prediction for Year 11

Predicting conference rankings for all teams. The code below fits a RandomForestRegressor on weighted historical performance features, chosen because it showed strong performance across multiple years.

In [39]:
def add_weighted_history(df, test_year=None):
    """
    Append a ``<feature>_weighted`` column for each tracked team feature.

    Rows are ordered by ``tmID`` and ``year`` and, when ``test_year`` is given,
    restricted to ``year <= test_year``. Each new column blends the feature's
    values from the team's previous four rows with decay weights
    0.7 / 0.15 / 0.1 / 0.05, each multiplied by the feature's importance.
    A row with fewer than four earlier rows for its team gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tmID``, ``year`` and every feature listed below.
    test_year : int or None
        Last season to keep; ``None`` keeps all rows.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of the input with the extra columns; the input frame is
        left untouched.
    """
    # Importance multiplier applied to each feature's blended history.
    feature_weights = {
        'made_playoffs': 0.33, 'prev_win_pct': 0.8, 'prev_coach_win_pct': 0.5,
        'Performance_weighted_2yr': 0.85, 'OffPerformance_weighted_2yr': 0.6,
        'DefPerformance_weighted_2yr': 1.0, 'Performance_weighted_3yr': 0.35,
        'OffPerformance_weighted_3yr': 0.25, 'DefPerformance_weighted_3yr': 0.45,
        'Performance_weighted_4yr': 0.3, 'OffPerformance_weighted_4yr': 0.2,
        'DefPerformance_weighted_4yr': 0.35
    }
    # (rows to look back within the team, decay weight for that lag)
    lag_decay = ((1, 0.7), (2, 0.15), (3, 0.1), (4, 0.05))

    history = df.sort_values(['tmID', 'year']).copy()
    if test_year is not None:
        history = history[history['year'] <= test_year].copy()

    for feat, importance in feature_weights.items():
        blended = None
        for lag, decay in lag_decay:
            # shift(lag) takes the value `lag` rows earlier for the same team.
            term = importance * decay * history.groupby('tmID')[feat].shift(lag)
            blended = term if blended is None else blended + term
        history[f'{feat}_weighted'] = blended

    return history

def predict_team_rankings(test_year):
    """
    Predict each team's rank within its conference for ``test_year``.

    Reads ``../predict_datasets/teams.csv``, derives the weighted-history
    features with ``add_weighted_history``, fits a ``RandomForestRegressor`` on
    the ``rank`` column of seasons ``test_year-4 .. test_year-1`` and converts
    the predicted scores for ``test_year`` into ranks per ``(year, confID)``
    (lowest score = rank 1; ties are broken by row order).

    Parameters
    ----------
    test_year : int
        Season to predict.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``confID``, ``tmID``, ``Predicted_Rank``.

    Raises
    ------
    ValueError
        If no training row has a complete set of weighted features.
    """
    df = pd.read_csv('../predict_datasets/teams.csv').fillna(0)
    df = add_weighted_history(df, test_year=test_year)
    feature_cols = ['made_playoffs_weighted', 'prev_win_pct_weighted', 'prev_coach_win_pct_weighted',
                    'Performance_weighted_2yr_weighted', 'OffPerformance_weighted_2yr_weighted',
                    'DefPerformance_weighted_2yr_weighted', 'Performance_weighted_3yr_weighted',
                    'OffPerformance_weighted_3yr_weighted', 'DefPerformance_weighted_3yr_weighted',
                    'Performance_weighted_4yr_weighted', 'OffPerformance_weighted_4yr_weighted',
                    'DefPerformance_weighted_4yr_weighted']

    train_df = df[df['year'].isin(list(range(test_year - 4, test_year)))]
    test_df = df[df['year'] == test_year].copy()

    # The weighted features are NaN for rows without four earlier rows for the
    # team, so those rows are excluded from training.
    train_clean = train_df.dropna(subset=feature_cols + ['rank']).sort_values(['year', 'confID'])
    if len(train_clean) == 0:
        raise ValueError(
            f"No complete training rows available for test_year={test_year}"
        )

    # Fill missing test features with the training mean. Assign the result back
    # explicitly: `test_df[col].fillna(..., inplace=True)` acts on a temporary
    # Series and does not update `test_df` under pandas Copy-on-Write.
    for col in feature_cols:
        test_df[col] = test_df[col].fillna(train_clean[col].mean())

    model = RandomForestRegressor(n_estimators=200, max_depth=4, min_samples_split=2,
                                  max_features='sqrt', random_state=42)
    model.fit(train_clean[feature_cols], train_clean['rank'])

    test_df['score'] = model.predict(test_df[feature_cols])
    test_df['Predicted_Rank'] = (
        test_df.groupby(['year', 'confID'])['score']
        .rank(method='first', ascending=True)
        .astype(int)
    )
    return test_df[['year', 'confID', 'tmID', 'Predicted_Rank']]

# Predict the test-season standings and print one table per conference.
rankings_pred = predict_team_rankings(11)

print('=' * 50, 'TEAM RANKING PREDICTIONS FOR YEAR 11', '=' * 50, sep='\n')
for conf_id in sorted(rankings_pred['confID'].unique()):
    # Teams of this conference, best predicted rank first.
    conf_results = rankings_pred.loc[rankings_pred['confID'] == conf_id].sort_values(by='Predicted_Rank')
    print(
        f'\nConference {conf_id}:',
        conf_results[['tmID', 'Predicted_Rank']].to_string(index=False),
        sep='\n',
    )
print()

TEAM RANKING PREDICTIONS FOR YEAR 11

Conference 0:
tmID  Predicted_Rank
 IND               1
 WAS               2
 CHI               3
 ATL               4
 NYL               5
 CON               6

Conference 1:
tmID  Predicted_Rank
 SAS               1
 SEA               2
 LAS               3
 PHO               4
 TUL               5
 MIN               6



## Summary of Year 11 Predictions

All predictions for year 11 have been generated using the best-performing models from the analysis:

### Awards (model used for each award; the top-ranked candidates are listed in Section 1):
- **MVP**: Logistic Regression
- **DPOTY**: Logistic Regression
- **ROTY**: CatBoostClassifier
- **MIP**: XGBClassifier
- **KPSA**: CatBoostClassifier
- **SWOTY**: CatBoostClassifier
- **FMVP**: Logistic Regression
- **ASGMVP**: Logistic Regression
- **COTY**: XGBClassifier

### Coach Turnover:
- Teams predicted to change coaches based on tenure, win rates, and historical patterns

### Team Rankings:
- Conference rankings predicted using RandomForestRegressor with weighted historical performance

These predictions are based on historical patterns from years 7-10 and represent the most likely outcomes according to the trained models.