# In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import time
import requests
from io import BytesIO

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score


# List of GitHub raw URLs, one CSV per season.
# Entries prefixed with '##' are disabled and are not downloaded.
urls = [
    ## 'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2016.csv',
    ## 'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2017.csv',
    ## 'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2018.csv',
    ## 'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2019.csv',
    'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2020.csv',
    'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2021.csv',
    'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2022.csv',
    'https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/season_-2023.csv'
]

all_cleaned_dataframes = []

for url in urls:
    try:
        # Download and parse the season CSV directly from its URL
        dataframe = pd.read_csv(url)

        # Filter out columns that start with 'Unnamed:'
        dataframe = dataframe.loc[:, ~dataframe.columns.str.startswith('Unnamed:')]

        # Drop all columns that are entirely NA
        dataframe = dataframe.dropna(axis=1, how='all')

        # Add the cleaned dataframe to the list
        all_cleaned_dataframes.append(dataframe)

        print(f"Processed data from {url}")

    except Exception as e:
        # Best effort: a season that fails to load is reported and skipped
        print(f"Error processing {url}: {e}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail with a message that points back to the errors printed above.
if not all_cleaned_dataframes:
    raise ValueError(
        "No season data could be loaded from any of the configured URLs; "
        "see the errors printed above."
    )

# Concatenate all dataframes into one
df = pd.concat(all_cleaned_dataframes, ignore_index=True)


### Columns to remove from the combined frame
columns_to_delete = [
    'OT', 'OT_opp', '2OT', '3OT', '2OT_opp', '3OT_opp',
    ## '4OT', '4OT_opp',
    'mp_total_opp', 'bpm_max', 'bpm_max_opp',
]

# Drop the listed columns (a KeyError is raised if any of them is absent)
df = df.drop(columns=columns_to_delete)


### Report, remove, and re-check fully duplicated rows
print("No of duplicate rows: ", df.duplicated().sum())

df = df.drop_duplicates()
df = df.reset_index(drop=True)

print("No of duplicate rows after dropping duplicates: ", df.duplicated().sum())

#### Rename 'mp_total' to 'mp'
df = df.rename(columns={'mp_total': 'mp'})

#### Creating Season column
# Parse 'date' into datetimes so year/month can be read from it
df['date'] = pd.to_datetime(df['date'])

# Function to determine the season year based on the month
def get_season_year(row):
    """Return the season year for one row, based on its ``'date'`` entry.

    Dates in October or later belong to the season labelled with their own
    calendar year; dates from January through September belong to the
    season labelled with the previous calendar year.
    """
    game_date = row['date']
    starts_new_season = game_date.month >= 10
    return game_date.year if starts_new_season else game_date.year - 1

# Derive a 'season' column by applying get_season_year to every row
df['season'] = df.apply(get_season_year, axis=1)


print("data shape:", df.shape)

# Remember the current column order so it can be restored after the merges below
columns_format = list(df.columns)


##### Abbreviate the team names
# Lookup table with a 'team' column and a 'team1' column; both are stripped of
# surrounding whitespace before being used as merge keys / replacement values.
# NOTE(review): presumably 'team' is the full name and 'team1' the abbreviation
# -- confirm against Team_full-forms.csv.
team_df = pd.read_csv('https://raw.githubusercontent.com/JetendraMulinti/DAV-6150---DataScience/main/FinalProject-NBA_Prediction/Data/Team_full-forms.csv')
team_df['team'] = team_df['team'].str.strip()
team_df['team1'] = team_df['team1'].str.strip()


##### Merge and delete the columns
# Step 1: replace df['team'] with the matching 'team1' value.
# NOTE(review): how='inner' keeps only rows whose 'team' value exists in the
# lookup table; compare the two printed shapes to spot dropped rows.
df = pd.merge(team_df, df, on = ['team'], how='inner')
del df['team']
df.rename(columns = {'team1':'team'}, inplace=True)

# Step 2: same replacement for the opponent column. The lookup key is renamed
# (in place on team_df) so the merge can be done on 'team_opp'.
team_df.rename(columns = {'team':'team_opp'}, inplace=True)
df = pd.merge(team_df, df, on = ['team_opp'], how='inner')
del df['team_opp']
df.rename(columns = {'team1':'team_opp'}, inplace=True)

print("data shape:", df.shape)

# Restore the column order captured before the merges
df = df[columns_format]

## ordering with date
# Reduce 'date' to plain date objects, then sort chronologically with a fresh index
df['date'] = pd.to_datetime(df['date']).dt.date
df = df.sort_values(by = ['date'], ascending=True).reset_index(drop=True)

def add_target(team):
    """Return a copy of ``team`` with a ``target`` column added.

    ``target`` holds the ``won`` value of the following row (``shift(-1)``),
    so the rows are expected to already be in the desired order. The last
    row has no following row and gets a missing value.

    The function works on a copy instead of writing into the frame it
    receives: pandas does not support user functions that mutate the group
    handed to ``groupby(...).apply``.
    """
    team = team.copy()
    team['target'] = team['won'].shift(-1)
    return team

# Add the 'target' column team by team (see add_target)
df = df.groupby("team", group_keys=False).apply(add_target)

## Preprocessing Target column (Null = 2, True = 1, False = 0)

# Assign through a single .loc call. The chained form
# df['target'][mask] = 2 may write to a temporary copy (and never updates df
# under pandas Copy-on-Write), which would leave the missing values in place.
df.loc[df['target'].isnull(), 'target'] = 2
# NOTE(review): errors='ignore' leaves the column unchanged if the cast fails
df['target'] = df['target'].astype(int, errors='ignore')


### Checking null values

# Per-column count of missing values; only columns with at least one are shown
null_columns = df.isna().sum()
null_columns[null_columns > 0]


### delete some more columns
more_columns_to_delete = ['index_opp']

# Remove the listed columns from the dataframe
df = df.drop(columns=more_columns_to_delete)

## as we have only 1 null row (match) we will drop it
df = df.dropna()

# Re-check: this selection shows the columns that still contain missing values
null_columns = df.isna().sum()
null_columns[null_columns > 0]

## re-ordering on date

# Keep 'date' as plain date objects and sort chronologically with a fresh index
df['date'] = pd.to_datetime(df['date']).dt.date
df = df.sort_values(by=['date'], ascending=True)
df = df.reset_index(drop=True)

print("data shape:", df.shape)

# Columns summarised below
key_metrics = [
    'fg_total', 'fga_total', 'fg%_total',
    '3p_total', '3pa_total', '3p%_total',
    'ft_total', 'fta_total', 'ft%_total',
    'total_opp',
]

# describe() output for the selected columns
key_stats_summary = df.loc[:, key_metrics].describe()

# Bare expression: rendered as cell output when run in a notebook
key_stats_summary




fig, axes = plt.subplots(3, 3, figsize=(10, 10))

# Rows 1-2: (grid position, column, title) for each 30-bin histogram with KDE
_hist_panels = [
    ((0, 0), 'fg_total', 'Field Goals Made'),
    ((0, 1), '3p_total', 'Three-Points Made'),
    ((0, 2), 'ft_total', 'Free Throws Made'),
    ((1, 0), 'fg%_total', 'Field Goal Percentage'),
    ((1, 1), '3p%_total', 'Three-Point Percentage'),
    ((1, 2), 'ft%_total', 'Free Throw Percentage'),
]
for _panel_pos, _panel_col, _panel_title in _hist_panels:
    _panel_ax = sns.histplot(df[_panel_col], bins=30, kde=True, ax=axes[_panel_pos])
    _panel_ax.set_title(_panel_title)

# Row 3: rows per season (one bin per distinct season), then counts of
# 'won' and 'target' values
_season_bins = len(df['season'].unique())
sns.histplot(df['season'], bins=_season_bins, kde=False, ax=axes[2, 0]).set_title('Games per Season')
sns.countplot(x='won', data=df, ax=axes[2, 1]).set_title('Win Distribution')
sns.countplot(x='target', data=df, ax=axes[2, 2]).set_title('Next Game Outcome Distribution')

plt.tight_layout()
plt.show()



def season_trend(column):
    """Plot the per-season mean of ``column`` from the module-level ``df``.

    When the column's maximum is at most 1, the values are treated as
    fractions: the seasonal means are multiplied by 100 and the y-axis label
    gets a ``(%)`` suffix. Otherwise the means are plotted as they are.
    """
    treat_as_fraction = df[column].max() <= 1

    seasonal_averages = df.groupby('season')[column].mean()
    if treat_as_fraction:
        seasonal_averages = seasonal_averages * 100

    ylabel = f'Average {column} (%)' if treat_as_fraction else f'Average {column}'

    # One line chart with a tick for every season present in the data
    plt.figure(figsize=(14, 7))
    seasonal_averages.plot(kind='line', marker='o')
    plt.title(f'Average {column} by NBA Season')
    plt.xlabel('NBA Season')
    plt.ylabel(ylabel)
    plt.grid(True)
    seasons = seasonal_averages.index
    plt.xticks(ticks=seasons, labels=seasons)
    plt.tight_layout()
    plt.show()



# Plot the season-by-season average of the 'fg%_total' column
season_trend('fg%_total')



def plot_team_performance(metric):
    """Plot per-season averages of ``metric`` for the 5 highest and 5 lowest teams.

    Reads the module-level ``df``. Averages are computed per (season, team);
    teams are then ranked by the mean of their seasonal averages. When the
    metric's maximum is at most 1, the averages are multiplied by 100 and the
    y-axis label gets a ``(%)`` suffix. Two stacked panels sharing the x-axis
    are drawn: the upper one for the top 5 teams, the lower one for the bottom 5.
    """
    # Values with a maximum of at most 1 are treated as fractions
    percentage_scale = df[metric].max() <= 1

    # Rows: seasons, columns: teams
    per_team = df.groupby(['season', 'team'])[metric].mean().unstack()
    if percentage_scale:
        per_team = per_team * 100
    ylabel = f'Average {metric} (%)' if percentage_scale else f'Average {metric}'

    # Rank teams by the mean of their seasonal averages
    team_means = per_team.mean(axis=0)
    panels = (
        (f'Top 5 Performing Teams by {metric}',
         team_means.sort_values(ascending=False).head(5).index),
        (f'Bottom 5 Performing Teams by {metric}',
         team_means.sort_values(ascending=True).head(5).index),
    )

    fig, axs = plt.subplots(2, 1, figsize=(15, 10), sharex=True)

    # Upper panel: top teams; lower panel: bottom teams
    for panel_ax, (panel_title, teams) in zip(axs, panels):
        for team in teams:
            panel_ax.plot(per_team.index, per_team.loc[:, team], marker='o', label=team)
        panel_ax.set_title(panel_title)
        panel_ax.set_ylabel(ylabel)
        panel_ax.grid(True)
        panel_ax.legend()

    # Set common X label
    plt.xlabel('NBA Season')
    plt.tight_layout()
    plt.show()


# Compare the top and bottom 5 teams by average 'fg%_total' per season
plot_team_performance('fg%_total') 

### Correlation matrix
# Pairwise correlations between the selected columns, 'won' and 'target'
_corr_columns = [
    'fg_total', 'fga_total', 'fg%_total',
    '3p_total', '3pa_total', '3p%_total',
    'ft_total', 'fta_total', 'ft%_total',
    'total_opp', 'won', 'target',
]
fig, ax = plt.subplots(figsize=(10, 8))
_corr_matrix = df[_corr_columns].corr()
sns.heatmap(_corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Selected Metrics with Target')
plt.show()