<H1>Linear Tree Algorithm experiment</H1>

1.) Collect, clean exemplary dataset for regression problem suited for linear tree model.<br>
2.) Develop single tree algorithm with variables for depth, omitted features as input (others to be added later) and feature importance and classification solution as output<br>
3.) Define simple Gradient Boosting algorithm<br>
4.) Bring full algorithm together<br>
5.) Run tests with exemplary dataset<br>

In [9]:
import pandas as pd
import numpy as np
import plotly as pl
import plotly.express as px

pd.set_option('display.max_rows', 500)

In [10]:
#data cleaning and pre-preparation completed in EDA module
df=pd.read_csv('data/ames_housing_price_data_prepared_v01.csv')
df['SalePriceLog']=np.log10(df['SalePrice'])
df=df.drop(['Unnamed: 0','SalePrice'],axis=1)

#price=df['SalePrice']
#price_log = np.log10(price)
#df.head(10).T

<H1>Tree model

'''
keep: feature importance dictionary (key: feature, val: total RSS delta through feature split)
keep: dataframe split into subtrees (through extra column identifying the subtree_id
Input: (dictionary of used subtree_ids(increasing), including a tuple of splitting variable, splitting position, current maxdepth, subtree_id it was split from), same dict, but for unused subtrees, current RSS
for each subtree
    if not yet at maxdepth
    check if split was searched before in unused subtree dict
    for each feature
        for each gap between two observations
            divide tree into two subtrees
            run linar regression on each subtree
            save tuple: (feature, gapposition, RSS-delta)
    find tuple with max RSS-delta, discard remaining tuples
find remaining tuple with max RSS-delta
split dataframe, update dictionaries, update feature importance dictionary with RSS delta
if all subtrees at maxdepth: return RSS, (how to return splitting logic??)
else: feed dataframes, RSS current-delta, list of maxdepth per subtree into next tree
'''

In [11]:
class TreeBooster():
    
    def __init__(self, n_iter=20, n_iterations=10, learning_rate=0.1):
        self.n_iter=n_iter
        self.n_iterations=n_iterations
        self.learning_rate=learning_rate
        self.split_conditions={}
        self.tree_means={}
        
    
    def check_error(self,df):
        mean=df['SalePriceLog'].mean()
        error=sum(df['SalePriceLog'].apply(lambda x: (x-mean)**2))/df.shape[0]
        return error

    def treebuilder(self, df, tree_num=0):

        df['subtree_id']=0
        #feature_importance_dict={}
        max_subtree_id=0

        columns=list(df.columns)
        columns.remove('subtree_id')
        columns.pop()

        tmp_dict={}
        self.split_conditions[tree_num]={}
        self.tree_means[tree_num]={}

        k=0
        while k<self.n_iter:

            subtree_ids=df['subtree_id'].unique()

            for subtree_id in subtree_ids: #each current subtree

                for column in columns:#each feature

                    if (subtree_id,column) not in tmp_dict:

                        currentError=self.check_error(df[df['subtree_id']==subtree_id])

                        split_values=list(df[df['subtree_id']==subtree_id][column].unique())
                        split_values.sort()
                        split_val=[]
                        for i in range(1, len(split_values)):
                            split_val.append(split_values[i-1]+(split_values[i]-split_values[i-1])/2)

                        maxError=0
                        for i in split_val:#each split

                            newError=(self.check_error(df[(df['subtree_id']==subtree_id) & (df[column]<i)])*df[(df['subtree_id']==subtree_id) & (df[column]<i)].shape[0] + self.check_error(df[(df['subtree_id']==subtree_id) & (df[column]>=i)])*df[(df['subtree_id']==subtree_id) & (df[column]>=i)].shape[0])/df[(df['subtree_id']==subtree_id)].shape[0]

                            deltaError=currentError-newError

                            if deltaError>maxError:
                                tmp_dict[(subtree_id,column)]=(i,deltaError)
                                maxError=deltaError

            #print(tmp_dict)
            best_subtree_id, best_column=max(tmp_dict, key=lambda x: tmp_dict.get(x)[1])
            best_split_pos=tmp_dict[(best_subtree_id, best_column)][0]
            max_subtree_id+=1
            df.loc[(df['subtree_id']==best_subtree_id) & (df[best_column]<best_split_pos),'subtree_id']=max_subtree_id
            max_subtree_id+=1
            df.loc[(df['subtree_id']==best_subtree_id)&(df[best_column]>=best_split_pos),'subtree_id']=max_subtree_id

            for i,j in list(tmp_dict.keys()):
                if i==best_subtree_id:
                    del tmp_dict[(i,j)]   

            k+=1
            self.split_conditions[tree_num][best_subtree_id]=(best_column,best_split_pos,max_subtree_id-1,max_subtree_id)

            #print(df['subtree_id'].value_counts())

        
        subtree_ids=df['subtree_id'].unique()
        for subtree_id in subtree_ids: #each subtree
            mean=df[df['subtree_id']==subtree_id]['SalePriceLog'].mean()
            self.tree_means[tree_num][subtree_id]=mean

        return df
    
    
    def tree_predictor(self, df, tree_num=0):
        
        df['Predict']=0
        for i in range(df.shape[0]):
            current_subtree_id=0
            while current_subtree_id not in self.tree_means[tree_num]:
                column,split_pos,id_left,id_right = self.split_conditions[tree_num][current_subtree_id]
                if df[column][i]<split_pos:
                    current_subtree_id=id_left
                else:
                    current_subtree_id=id_right
            df.loc[i,'Predict']=self.tree_means[tree_num][current_subtree_id]

        return df

    def tree_scorer(self,df):
        mean1=sum(df.iloc[:,0])/df.shape[0]
        TSS=sum(df.iloc[:,0].map(lambda x: (x-mean1)**2))

        RSS=sum((df.iloc[:,1]-df.iloc[:,0])**2)

        return 1-RSS/TSS


    def iteration_builder(self, df):
        df2=df.copy()
        for x in range(self.n_iterations):
            df2=self.treebuilder(df2,tree_num=x)
            df2=self.tree_predictor(df2,tree_num=x)
            df2['SalePriceLog']=df2['SalePriceLog']+(df2['Predict']-df2['SalePriceLog'])*self.learning_rate
            df2=df2.drop(['Predict'],axis=1)
        return df2
    
    def iteration_predictor(self, df):

        df['Predict_fin']=0
        for x in range(n_iterations):
            df=self.tree_predictor(df,tree_num=x)
            df['Predict_fin']=df['Predict_fin']+self.learning_rate*df['Predict']
            
        df['Predict']=df['Predict_fin']
        df=df.drop('Predict_fin',axis=1)

        return df
        

In [None]:
treemodel=TreeBooster(n_iter=20, n_iterations=10, learning_rate=0.3)
df= treemodel.iteration_builder(df)

In [None]:
df2=df.copy()
df2=treemodel.iteration_predictor(df2)
treemodel.tree_scorer(df2[['SalePriceLog','Predict']])

In [6]:
#0.8163734050358602 at 20 splits

In [8]:
df2['subtree_id'].value_counts()

32    266
35    255
29    250
39    193
40    175
17    171
33    128
28    112
34    103
36     96
26     96
31     93
25     88
20     81
21     80
22     51
30     50
27     48
24     45
37     41
38     33
Name: subtree_id, dtype: int64