In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from dateutil.relativedelta import relativedelta


In [2]:
class MathFunctions:
    """
    Calculated the variance of the target variable in a pandas dataframe
    df: dataframe that we will evaluvate
    return: float that represents the variance"""
    def calc_variance(df: pd.DataFrame) -> float:
        #var of target variables
        return df['target'].var()
        
    """
    Evaluvate how much info we have gained from a split
    left_child_df: the left child of a node
    right_child_df: the right child of a node
    return: the variance difference between the current variance the the avg of the two variances"""
    @staticmethod
    def calc_variance_reduction(left_child_df: pd.DataFrame, right_child_df: pd.DataFrame):
        #the weight just represents the portion that each child takes up relative to the size of the original node
        weight = float( len(left_child_df) / (len(left_child_df) + len(right_child_df)))
        #the split_variance is the sum of both variances, the lower this number is, the better
        split_variance = weight * MathFunctions.calc_variance(left_child_df) + (1-weight) * MathFunctions.calc_variance(right_child_df)
        return split_variance
        
        

In [3]:
#A class to hold the questions used to evaluate a decision tree
class Question:
    #initialize the variables
    def __init__(self, column: str, value: float):
        """
        Column stores what is the feature we are analyzing
        Value is the numeric value that will be compared
        ie. if column is 'PTS' and value is 12.4 the question is asking, is "PTS <= 12.4?"? """
        self.column = column 
        self.value = value
        
    """
    A function that will give us the condition we are checking - useful for pandas when querrying a data frame
    return a string that is the condition we are comparing
    """
    def __str__(self) -> str:
        return f"{self.column} <= {self.value}"
        
    def checkTrueCondition(self, row) -> str:
        if row[column] <= value:
            return True
        else:
            return False

In [4]:
"""
The Leaf class is what holds the possible predictions of an input"""
class Leaf:
    #initialize the variables
    def __init__(self, targets: pd.DataFrame):
        #targets are all of the predictions given going down a tree
        #target is a list of predictions
        self.targets = targets 
    """
    Find the average of all targets
    return: average of the list of predictions, self.targets
    """
    def calcPrediction(self, min, max) -> float:
        return self.targets['target'].mean() * (max - min) + min
        
    def calcPrediction_2(self) -> float:
        return self.targets['target'].mean()


In [5]:
"""
The Node class is what represents a decision in our decision tree
It keeps track of the question and the paritions following the question
"""
class Node:
    #initialize variables
    """
    question: the question that will cause the paritions (left child, right child)
    true: represents the right child, which is the true partition
    false: represents the left child, which is the false partition
    """
    def __init__(self, question: Question, true: pd.DataFrame, false: pd.DataFrame):
        self.question = question
        self.true = true
        self.false = false

    """
    A function that will take in a newVal to compare with self.question
    newVal: a value that will be used to compare against self.question
    return: True is if newVal is less than or equal to the question value, 
    False is if newVal is greater than the question value
    """
    def makeDecision(self, newVal: float) -> bool:
        #checkTrue compares a newVal with the question
        return self.question.checkTrue(newVal)
    

In [6]:
"""
The Partitions class is what splits the data and analyzes the best split for two split dataframes
"""
class Partitions:
    """
    A function that will return a divided version of the curDF based on the question
    curDF: original Pandas DF that will be split
    question: a question that will be used to split the curDF
    return: a dataframe where the question is evaluvated to true and another where the question is evaluvated to false
    """
    @staticmethod
    def partition(question: Question, curDF: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
        #split the data based on the value and the column analyzed
        true_df = curDF[curDF[question.column] <= question.value]
        false_df = curDF[curDF[question.column] > question.value]  # > is the negation of <=
        #return the two data frames
        return true_df, false_df

    """
    A function that will determine the question that will give us the best partition
    df: the dataframe that will be analyzed to determine the best partition
    return: the best question question and the variance of the best question
    """
    @staticmethod
    def bestPartition(df: pd.DataFrame) -> (float, Question):
        #set the smallest variance to a very high number intitially - since data is scaled it is impossible to get over 1000000 variance
        smallest_variance = 10000000
        #set the cur smallest variance to this
        Partitions.findSmallestVariance.smallest_variance = smallest_variance
        #loop through every column
        for col in df.columns:
            if col == 'target':
                continue
            #apply the method that finds the smallest variance in a column
            df[col].apply(lambda value: Partitions.findSmallestVariance(col, value, df))
        #return the smallest variances and the best question
        return Partitions.findSmallestVariance.smallest_variance, Partitions.findSmallestVariance.best_question

    
    """
    A function that compares the variance of a split with the current smallest variance of a split
    column: the column of the value being analyzed
    value: the numerical value 
    df: the dataframe that will be analyzed to determine the best partition
    """
    @staticmethod
    def findSmallestVariance(column: str, value: float, df: pd.DataFrame):
        question = Question(column, value)
        true_rows, false_rows = Partitions.partition(question, df)
        if len(false_rows) == 0 or len(true_rows) == 0:
            current_weighted_variance = MathFunctions.calc_variance(df)
        else:
            current_weighted_variance = MathFunctions.calc_variance_reduction(true_rows, false_rows)

        if current_weighted_variance < Partitions.findSmallestVariance.smallest_variance:
            # Update the minimum weighted variance if necessary
            Partitions.findSmallestVariance.smallest_variance = current_weighted_variance
            Partitions.findSmallestVariance.best_question = question
            
            

In [57]:
"""
The Tree class is what actually builds the tree and hold the instance of a deicison tree
"""
class Tree:
    """
    originalData: the data that is intially given to the tree
    root: the first decision node"""
    def __init__(self, originalData: pd.DataFrame, max, min, root = None):
        self.max = max
        self.min = min
        self.root = root
        self.data = originalData
    """
    A function that will build the tree
    curDepth: the depth that the tree is currently at -> int
    data: the dataset to be analyzed at the current moment -> 
    maxDepth: to prevent overfitting we have a maxDepth of 4
    return: recursively return the Node or Leaf that is the result of a branch in the decision tree
    """
    def build_tree(self, curDepth = None, data=None, maxDepth = 4) -> Node or Leaf:
        #check to see if data is an instance of the dataframe to ensure it is not none
        if not isinstance(data, pd.DataFrame):
            #if it is none, make the data equal the original data
            data = self.data
        #set curDepth to 0 on the first loop
        if curDepth == None:
            curDepth = 0
        #get the best variance and best paritition
        variance, question = Partitions.bestPartition(data)
        if variance == 0 or curDepth == maxDepth:
            #we are at the last recursion so we return a Leaf that is the result of a branch
            return Leaf(data) #data represents the possible options that arrise from a decision tree
            
        #get the true and false data based on the best question and current data we have
        true, false = Partitions.partition(question, data)
        
        #now recusrively build the true branch based on the true partition
        true_branch = self.build_tree(data=true, curDepth = curDepth+1) 
        #now recursively build the false branch based on the false partition
        false_branch = self.build_tree(data=false, curDepth = curDepth + 1)
        
        #if we get to here and curDepth is 0, that means we are done building the tree
        if curDepth == 0:
            #assign the last node to be the root
            self.root = Node(question, true_branch, false_branch)
            #return the root
            return self.root
        
        #return a decision node that contains the question it asked and the true and false branch
        return Node(question, true_branch, false_branch)
    
    def print_tree(self, node = None, spacing=""): 
        if node == None:
            node = self.root
        # Base case: we've reached a leaf
        if isinstance(node, Leaf):
            prediction = node.calcPrediction(self.min, self.max)
            print (spacing + "Predict", prediction)
            return
    
        # Print the question at this node
        print (spacing + str(node.question))
    
        # Call this function recursively on the true branch
        print (spacing + '--> True:')
        self.print_tree(node.true, spacing + "  ")
    
        # Call this function recursively on the false branch
        print (spacing + '--> False:')
        self.print_tree(node.false, spacing + "  ")
    def classify(self, row):
        predicting_row = pd.DataFrame()
        curNode = self.root
        while isinstance(curNode, Node):
            if curNode.checkTrueCondition(row):
                curNode = curNode.true
            else:
                curNode = curNode.false
                
        return curNode.calcPrediction_2()        

In [49]:
"""
Forest class contains all the trees
"""
class Forest:
    #intialize a forest, with no trees at first
    def __init__(self, dataSet: pd.DataFrame, min, max):
        self.max_value = max
        self.min_value = min
        
        self.dataSet = dataSet
        self.trees = []

    #bootstrapping is a technique in ML that randomly takes rows from the training data to train the dataset
    def bootStrapping(self, n):
        random_indicies = np.random.randint(low=0, high=len(self.dataSet), size = n)
        return random_indicies

    def randomSubspace(self, n):
        target = 'target'
        random_columns = self.dataSet.columns[self.dataSet.columns != target].to_series().sample(n=9, random_state=42)
        selectedColumns = random_columns.tolist() + [target]
        return selectedColumns

    def random_dataset(self, numRows, numCol):
        random_indicies = self.bootStrapping(numRows)
        random_columns = self.randomSubspace(numCol)
        randomized_data = self.dataSet.iloc[random_indicies][random_columns]
        return randomized_data

    def createTree(self, numRows, numCol):
        random_dataset = self.random_dataset(numRows, numCol)
        tree = Tree(random_dataset, self.max_value, self.min_value)
        tree.build_tree()
        tree.print_tree()
        self.trees.append(tree)

    def createForest(self, n, numRows=500, numCol=12):
        for i in range(n):
            self.createTree(numRows, numCol)

    def predict(self, test_df):
        predictions = test_df.apply(lambda row: make_prediction(row), axis=1)
        return predictions 
        
    def make_predictions(self, row):
        prediction_sum = 0
        for tree in self.trees:
            prediction_sum += tree.classify(row)
        return prediction_sum/len(self.trees)
            

In [9]:
# Define a function to shift values for a specific player
def shift_player_points(group, target):
    group['target'] = group[target].shift(-1)
    return group

In [10]:
def get_max_min(df: pd.DataFrame, targetVar):
    return df[targetVar].min(), df[targetVar].max()

In [54]:
def setTarget(data: pd.DataFrame):
    #get rid of unecessary column
    data.drop("Unnamed: 0", axis="columns")
    # Apply the shifting function within each player group
    df_shifted = data.groupby('ID').apply(lambda x: shift_player_points(x, 'PTS')).copy()

    #make everything sorted by the season
    df_shifted[['StartYear', 'EndYear']] = df_shifted['Season'].str.split('-', expand=True).copy()
    df_shifted['StartYear'] = pd.to_datetime(df_shifted['StartYear'], format='%Y').copy()
    df_final  = df_shifted.sort_values(by='StartYear')
    
    return df_final

In [56]:
def scaleData(df):
    #drop all the columns except for the numeric columns
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns


    #get the max and min
    min, max = get_max_min(df, 'PTS')
    
    #intialize the scaler
    scaler = MinMaxScaler()
    
    #scale the data for ML
    df_scaled[numeric_columns] = scaler.fit_transform(df[numeric_columns]).copy()
    
    #get rid of any NA values
    df_scaled = df_scaled.dropna()
    
    #only take numeric columns
    df_final = df_scaled[numeric_columns]
    df_final = df_final.drop("Unnamed: 0", axis="columns").copy()
    
    return df_final
    

In [46]:
def backtesting(df):
    # Initial threshold datetime set to 2006, so that we can have atleast three years of data before we start predicting
    threshold_date = datetime(2006, 1, 1)
    
    #last season where we can train our model
    end_date = datetime(2022, 1, 1)

    df_prep = setTarget(df)
    all_predictions = []
    while threshold_date != end_date:
        #train the data using every stat before the year
        train = df_prep[df_prep['StartYear'] <= threshold_date]
        
        #make predictions on the current year using previous data
        test = df[df['StartYear'] == threshold_date]

        #scale the data so that we can train it adequately with the model
        
        #very important for random forest since we use variation regretion to make decisions
        ready_training_df, min, max = scaleData(train)
        
        forest = Forest(ready_training_df, min, max)
        
        forest.createForest(100)

        predictions = forest.predict(test)
        all_prediciton.append(predictions)
        
        return train, test
    return all_predictions
    

In [47]:
def testing_main():
    # Create a sample DataFrame
    data = nba_player_stats = pd.read_csv("all_players_stats.csv")
    backtesting(data)
    return forest

In [25]:
def test_backtesting():
    df = pd.read_csv("all_players_stats.csv")
    train, test = backtesting(df)
    return train

In [52]:
test_backtesting()

Unnamed: 0_level_0,Unnamed: 1_level_0,AGE,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,BPM,VORP,teamPPG,oppPPG,SRS,pace,teamOFRtg,teamDFRtg,target,StartYear
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
owarjo01,728,0.208333,0.804878,0.349398,0.546988,0.290598,0.322449,0.430,0.056604,0.075758,0.303,...,0.556976,0.202899,0.471429,0.393795,0.743580,0.370370,0.610108,0.734317,0.349030,2003-01-01
diawbo01,1316,0.125000,0.914634,0.445783,0.585542,0.153846,0.167347,0.447,0.018868,0.022727,0.231,...,0.533546,0.130435,0.176190,0.315036,0.354010,0.243386,0.209386,0.372694,0.132964,2003-01-01
barnema02,4517,0.208333,0.451220,0.108434,0.436145,0.145299,0.146939,0.457,0.018868,0.022727,0.154,...,0.534611,0.144928,0.223810,0.360382,0.403793,0.253968,0.299639,0.376384,0.105263,2003-01-01
barbole01,1026,0.125000,0.841463,0.554217,0.491566,0.256410,0.273469,0.447,0.226415,0.227273,0.395,...,0.549521,0.181159,0.209524,0.324582,0.435401,0.338624,0.223827,0.413284,0.193906,2003-01-01
udokaim01,2729,0.333333,0.036585,0.000000,0.144578,0.068376,0.093878,0.333,0.000000,0.022727,0.000,...,0.504792,0.144928,0.304762,0.238663,0.723429,0.322751,0.371841,0.715867,0.077562,2003-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
varejan01,4385,0.250000,0.975610,0.072289,0.551807,0.205128,0.204082,0.476,0.000000,0.007576,0.000,...,0.558040,0.217391,0.271429,0.205251,0.683129,0.243386,0.371841,0.715867,0.185596,2006-01-01
smithcr01,857,0.208333,0.987805,0.060241,0.426506,0.256410,0.228571,0.531,0.000000,0.007576,0.000,...,0.538871,0.152174,0.254762,0.367542,0.426709,0.253968,0.314079,0.413284,0.260388,2006-01-01
brewero02,1619,0.125000,0.670732,0.168675,0.267470,0.153846,0.138776,0.528,0.000000,0.015152,0.000,...,0.570820,0.188406,0.383333,0.341289,0.672461,0.285714,0.537906,0.675277,0.332410,2006-01-01
arizatr01,1369,0.125000,0.682927,0.084337,0.515663,0.299145,0.265306,0.539,0.000000,0.007576,0.000,...,0.575080,0.231884,0.223810,0.231504,0.565389,0.195767,0.350181,0.594096,0.152355,2006-01-01


In [51]:
forest = testing_main()

2P <= 0.45132743362831856
--> True:
  2P <= 0.1946902654867257
  --> True:
    2P <= 0.09734513274336284
    --> True:
      2P <= 0.02654867256637168
      --> True:
        Predict 0.925
      --> False:
        Predict 5.020754716981132
    --> False:
      TOV% <= 0.14400000000000002
      --> True:
        Predict 7.830232558139536
      --> False:
        Predict 5.437931034482758
  --> False:
    OWS <= 0.24861878453038672
    --> True:
      STL_100 <= 0.2990654205607477
      --> True:
        Predict 9.864444444444446
      --> False:
        Predict 0.7
    --> False:
      TRB_100 <= 0.17680608365019013
      --> True:
        Predict 14.22325581395349
      --> False:
        Predict 10.114285714285714
--> False:
  2P <= 0.6106194690265487
  --> True:
    TRB_100 <= 0.35551330798479086
    --> True:
      OWS <= 0.34806629834254144
      --> True:
        Predict 15.51
      --> False:
        Predict 20.65833333333333
    --> False:
      Predict 8.1
  --> False:
    STL_