In [None]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', None)



### Reading and drop na columns
# One pickled dataframe per season, 2017 through 2023.
file_names = [f'dataframes_list_season_-{year}.pkl' for year in range(2017, 2024)]

all_cleaned_dataframes = []

for file_name in file_names:
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only read
        # season files that come from a trusted source.
        with open("Data/"+file_name, 'rb') as file:
            dataframe = pickle.load(file)

        # Discard the 'Unnamed:' columns, then any column that holds no data at all.
        unnamed_mask = dataframe.columns.str.startswith('Unnamed:')
        dataframe = dataframe.loc[:, ~unnamed_mask].dropna(axis=1, how='all')

        all_cleaned_dataframes.append(dataframe)

    except Exception as e:
        # Best effort: report the season that failed and carry on with the others.
        print(f"Error processing {file_name}: {e}")

# Stack every season that loaded into one frame with a fresh 0..n-1 index.
df = pd.concat(all_cleaned_dataframes, ignore_index=True)



### delete some more columns
columns_to_delete = [
    'OT', 'OT_opp', '2OT', '3OT', '2OT_opp', '3OT_opp', '4OT', '4OT_opp',
    'mp_total_opp', 'bpm_max', 'bpm_max_opp',
]

# Drop the listed columns and shorten 'mp_total' to 'mp' in one chained pass.
df = df.drop(columns=columns_to_delete).rename(columns={'mp_total': 'mp'})

print("data shape:", df.shape)

# Remember the column order so it can be restored after later merges.
columns_format = list(df.columns)

df.head(2)

In [None]:
df['+/-_max'].isnull().sum()

In [None]:
df['+/-_max_opp'].isnull().sum()

In [None]:
##### Abbreviate the team names
team_df = pd.read_csv('https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/Team_full-forms.csv')
for name_col in ('team', 'team1'):
    team_df[name_col] = team_df[name_col].str.strip()


##### Merge and delete the columns
# Own side: join on 'team', then let the lookup's 'team1' value take over the 'team' column.
df = (
    team_df.merge(df, on=['team'], how='inner')
    .drop(columns='team')
    .rename(columns={'team1': 'team'})
)

# Opponent side: same lookup table, keyed on 'team_opp' this time.
team_df = team_df.rename(columns={'team': 'team_opp'})
df = (
    team_df.merge(df, on=['team_opp'], how='inner')
    .drop(columns='team_opp')
    .rename(columns={'team1': 'team_opp'})
)

print("data shape:", df.shape)

# Restore the column order captured before the merges.
df = df[columns_format]

## ordering with date
df['date'] = pd.to_datetime(df['date']).dt.date
df = df.sort_values(by=['date'], ascending=True).reset_index(drop=True)

df

1. We create a `target` column, computed separately for each team, that holds the outcome of that team's next game. (The `won` column describes the current match; the `target` column describes the next match.)

2. Encode the target as integers: False (loss) = 0, True (win) = 1, and null (no next game available) = 2.


In [None]:
def add_target(team):
    """Return one team's games with a ``target`` column holding the next game's result.

    ``target`` is the ``won`` value of the following row (``shift(-1)``), taken
    in the frame's existing row order, so the last row gets a missing value.

    The work is done on a copy: pandas does not support mutating the group
    object that ``groupby(...).apply`` passes in, so the input is left untouched.
    """
    team = team.copy()
    team['target'] = team['won'].shift(-1)
    return team

# Build the next-game label separately for every team.
df = df.groupby("team", group_keys=False).apply(add_target)

## Preprocessing Target column (Null = 2, True = 1, False = 0)

# Use a single .loc write: the chained form df['target'][mask] = 2 may write to a
# temporary copy (SettingWithCopyWarning) and has no effect under Copy-on-Write.
df.loc[df['target'].isnull(), 'target'] = 2
df['target'] = df['target'].astype(int, errors='ignore')

In [None]:
### checking the data is balance / Imbalanced

df['won'].value_counts()

In [None]:
df['target'].value_counts()

Checking Null values and dropping columns and rows

In [None]:
### Checking null values

null_columns = df.isnull().sum()
null_columns[null_columns > 0]

In [None]:
### delete some more columns
more_columns_to_delete = ['index_opp']

# Remove the listed columns, then every row that still contains a missing value.
## as we have only 1 null row (match) we will drop it
df = df.drop(columns=more_columns_to_delete).dropna()

# Re-check: list the columns that still have missing values.
null_columns = df.isnull().sum()
null_columns[null_columns > 0]

In [None]:
## re-ordering on date

# Normalise 'date' to plain calendar dates, then sort the games chronologically
# and renumber the index from 0.
df['date'] = pd.to_datetime(df['date']).dt.date
df = df.sort_values('date').reset_index(drop=True)

print("data shape:", df.shape)

df.head()

1. We use TimeSeriesSplit because the data must be split in date order.
2. Reason: we do not want old and new games mixed across a split; only historical data should be used to predict later games.

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit


# Ridge classifier (alpha=1); the same instance is handed to the feature selector
# below and later to backtest().
rr = RidgeClassifier(alpha=1)

# Three-fold time-ordered cross-validation. TimeSeriesSplit splits by row position,
# so the frame must already be sorted chronologically when fit() is called.
split = TimeSeriesSplit(n_splits=3)

# Backward selection: start from every candidate column and remove features until
# 40 remain, scoring candidate sets with the folds defined above (single process).
sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=40, 
                                direction="backward",
                                cv=split,
                                n_jobs=1
                               )

In [None]:
# Identifier and label columns that are never used as model inputs.
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

# Feature columns still stored with object dtype.
object_columns = df[selected_columns].select_dtypes(include='object').columns

# Convert these object type columns to integers
## 1	2	3	4	total	1_opp	2_opp	3_opp	4_opp	total_opp columns
df = df.astype({column: int for column in object_columns})
    


In [None]:
df[selected_columns].info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a min-max scaler (default range).
# NOTE(review): the scaler is fitted on all rows, including the seasons that
# backtest() later holds out as test data -- confirm this look-ahead is acceptable.
scaler = MinMaxScaler()
scaler.fit(df[selected_columns])
df[selected_columns] = scaler.transform(df[selected_columns])

In [None]:
from datetime import datetime

now = datetime.now()
dt_string_S = now.strftime("%d/%m/%Y %H:%M:%S")

print("Started at = ", dt_string_S)


sfs.fit(df[selected_columns], df["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])
predictors

1. The `backtest` function splits the data by season: each season is predicted using only the seasons before it, and prediction starts once at least two earlier seasons are available for training.

In [None]:
def backtest(data, model, predictors, start=2, step=1):
    """Walk forward through the seasons, predicting each one from all earlier ones.

    Seasons are the sorted unique values of ``data["season"]``. For every season
    at position ``start``, ``start + step``, ... the model is re-fitted on all
    rows from strictly earlier seasons and then predicts the rows of that
    season, so the first ``start`` seasons are only ever used for training.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``season`` column, a ``target`` column and every column
        named in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted on
        every iteration.
    predictors : list
        Names of the feature columns passed to the model.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` (the ``target`` values) and ``prediction``, indexed
        like the predicted rows of ``data``.

    Raises
    ------
    ValueError
        If no season can be predicted because ``data`` holds ``start`` or fewer
        distinct seasons.
    """
    seasons = sorted(data["season"].unique())

    all_predictions = []
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]   # every earlier season
        test = data[data["season"] == season]   # the season being predicted

        model.fit(train[predictors], train["target"])

        # Keep the test rows' index so predictions line up with the actual values.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat([]) would raise the same exception type with an opaque message.
        raise ValueError(
            f"backtest needs more than {start} distinct seasons to make any "
            f"prediction, but data has {len(seasons)}"
        )
    return pd.concat(all_predictions)

In [None]:
predictions = backtest(df, rr, predictors)
predictions

In [None]:
from sklearn.metrics import accuracy_score

## removing unpredicted matches
predictions = predictions[predictions['actual'] != 2]
print("Accuracy Score: ",accuracy_score(predictions["actual"], predictions["prediction"]))

now = datetime.now()
dt_string_E = now.strftime("%d/%m/%Y %H:%M:%S")

print("Ended at = ", dt_string_E)


Checking the home-court advantage: the win rate grouped by the `home` flag gives a baseline to compare our accuracy against.

In [None]:
df.groupby(["home"]).apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

In [None]:
df.groupby(["team", "season"])['date'].count()

1. Instead of predicting from the previous match alone, we now use each team's last 10 matches.
2. `team.rolling(10).mean()` computes the average over the last 10 matches (the current match included), separately for each team and season.

In [None]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_averages(team, window=10):
    """Return the trailing ``window``-row mean of every numeric or boolean column.

    Non-numeric columns (for example the team name) are left out explicitly:
    recent pandas versions raise when ``rolling().mean()`` meets them instead
    of silently skipping them.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of a single group, already in the order the window should follow.
    window : int, default 10
        Number of consecutive rows (current row included) in each average.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``; the first ``window - 1`` rows are NaN because
        the window is not yet full.
    """
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(window).mean()

# Run find_team_averages on each (team, season) group separately, so the
# function never sees rows from another team or another season.
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

df_rolling

In [None]:
# Give every rolling-average column a "_10" suffix so it cannot clash with the raw column.
rolling_cols = [str(col) + "_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Put the rolling columns next to the raw ones (aligned on the index), then drop
# every row that has a missing value.
df = pd.concat([df, df_rolling], axis=1).dropna()

print("df shape: ",df.shape)
df

Give the model the information that is already known before the next match starts: whether the team plays at home, who the opponent is, and the date.

In [None]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up one row, so each row holds the next row's value.

    The last row has no successor and therefore receives a missing value.
    """
    return team[col_name].shift(periods=-1)

def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    Rows are grouped by the ``team`` column and shifted up by one within each
    group, so a team's last row gets a missing value. The built-in grouped
    ``shift`` is used instead of ``apply``: it always returns a Series aligned
    to ``df``'s index, even when ``df`` contains only a single team.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``team`` column and ``col_name``.
    col_name : str
        Column whose next-row value is wanted.

    Returns
    -------
    pandas.Series
        Indexed like ``df``.
    """
    return df.groupby("team")[col_name].shift(-1)

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")
df

In [None]:
# Self-join that pairs each row with the row of its upcoming opponent.
# A left row keyed by (team, date_next) matches the right row whose team_opp_next
# equals that team and whose date_next is the same date, i.e. the other side of
# the same next game. Only the rolling columns plus the key columns are taken from
# the right side; column names present on both sides get the default _x (left) and
# _y (right) suffixes. Rows without a match are dropped (default inner join).
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])
full

In [None]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

In [None]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
removed_columns

In [None]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]


from datetime import datetime

now = datetime.now()
dt_string_S = now.strftime("%d/%m/%Y %H:%M:%S")

print("Started at = ", dt_string_S)

sfs.fit(full[selected_columns], full["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])
predictors

In [None]:
predictions = backtest(full, rr, predictors)

## removing unpredicted matches
predictions = predictions[predictions['actual'] != 2]
predictions

In [None]:


print("Accuracy Score: ",accuracy_score(predictions["actual"], predictions["prediction"]) * 100)


now = datetime.now()
dt_string_E = now.strftime("%d/%m/%Y %H:%M:%S")

print("Ended at = ", dt_string_E)


1. last 2 seasons backtest + 10 rolling (30 Features forward) accuracy: 54 % 
2. last 2 seasons backtest + 15 rolling (30 Features forward) accuracy: 53.7 % 
3. last 2 seasons backtest + 5 rolling (30 Features forward) accuracy: 52.8 %

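Ideas for improving the model:
1. Try a more powerful model.
2. Tune the number of selected features, and try the backward selection method.
3. Try rolling means with different window sizes and combinations.
4. Make sure no matches with a null target remain (filter them out before building the `full` dataframe, or use data only up to the last season and retry).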