<H1>Linear Tree Algorithm experiment</H1>

1.) Collect, clean exemplary dataset for regression problem suited for linear tree model.<br>
2.) Develop single tree algorithm with variables for depth, omitted features as input (others to be added later) and feature importance and classification solution as output<br>
3.) Define simple Gradient Boosting algorithm<br>
4.) Bring full algorithm together<br>
5.) Run tests with exemplary dataset<br>

In [4]:
import pandas as pd
import numpy as np
import plotly as pl
import plotly.express as px

# Allow up to 500 rows to be rendered when a frame or series is displayed.
pd.options.display.max_rows = 500

In [5]:
# Data cleaning and preparation were completed in the EDA module; this cell
# only loads the prepared file and swaps the raw price for its log10 value.
# SalePriceLog is appended last, so it ends up as the final column of df.
df = (
    pd.read_csv('data/ames_housing_price_data_prepared_v01.csv')
    .assign(SalePriceLog=lambda frame: np.log10(frame['SalePrice']))
    .drop(columns=['Unnamed: 0', 'SalePrice'])
)

<H1>Tree model</H1>

'''
keep: feature importance dictionary (key: feature, val: total RSS delta through feature split)
keep: dataframe split into subtrees (through an extra column identifying the subtree_id)
Input: (dictionary of used subtree_ids(increasing), including a tuple of splitting variable, splitting position, current maxdepth, subtree_id it was split from), same dict, but for unused subtrees, current RSS
for each subtree
    if not yet at maxdepth
    check if split was searched before in unused subtree dict
    for each feature
        for each gap between two observations
            divide tree into two subtrees
            run linear regression on each subtree
            save tuple: (feature, gapposition, RSS-delta)
    find tuple with max RSS-delta, discard remaining tuples
find remaining tuple with max RSS-delta
split dataframe, update dictionaries, update feature importance dictionary with RSS delta
if all subtrees at maxdepth: return RSS, (how to return splitting logic??)
else: feed dataframes, RSS current-delta, list of maxdepth per subtree into next tree
'''

In [8]:
class TreeBooster():
    """Gradient boosting over regression trees whose splits are scored with ridge fits.

    Each tree is grown greedily: a candidate split is scored by how much it
    reduces the mean squared error of a ridge regression fitted separately on
    each side of the split.  Leaves nevertheless predict the *mean residual*
    of their rows, not the ridge model.

    The target column is hard-coded as ``'SalePriceLog'`` and the features are
    all columns of the training frame except the last one.

    Parameters
    ----------
    n_iter : int
        Number of split rounds per tree.  At most one split is applied per
        round, so a tree has at most ``n_iter + 1`` leaves.
    n_iterations : int
        Number of boosting rounds, i.e. number of trees.
    learning_rate : float
        Factor applied to every tree's prediction before it is added to the
        running prediction.
    max_depth : int
        A subtree whose depth has reached this value is not split further.
    min_elements : int
        Minimum number of rows required on each side of a split.
    alpha : float
        Ridge penalty used when scoring splits (the intercept is not penalised).
    """

    def __init__(self, n_iter=16, n_iterations=10, learning_rate=0.1, max_depth=6, min_elements=1,alpha=1):
        self.n_iter=n_iter
        self.n_iterations=n_iterations
        self.learning_rate=learning_rate
        self.max_depth=max_depth
        self.min_elements=min_elements
        self.alpha=alpha
        # Per-tree state, keyed by tree number and then by subtree id.
        self.split_conditions={}  # parent id -> (column, split position, left id, right id)
        self.tree_means={}        # leaf id -> mean residual of the leaf
        self.tree_depth={}        # subtree id -> depth (root is 0)
        self.columns=[]           # feature columns, set by iteration_builder

        self.initial_mean=0       # mean of the training target, set by iteration_builder

    def check_error(self,df):
        """Return the mean squared error of a ridge fit of 'Residuals' on the features.

        Solves the closed form beta = (X^T X + alpha * I0)^(-1) X^T y, where X
        has a leading column of ones and I0 is the identity with its first
        diagonal entry zeroed so that the intercept is not penalised.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns in ``self.columns`` and a 'Residuals' column.

        Returns
        -------
        float
            Mean of the squared differences between 'Residuals' and the ridge fit.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the penalised normal equations are singular.
        """
        n_rows=df.shape[0]
        # Use the instance's feature list; relying on a notebook-level
        # `columns` variable made this method depend on cell execution order.
        design=np.column_stack([np.ones(n_rows), df[self.columns].to_numpy(dtype=float)])
        target=df['Residuals'].to_numpy(dtype=float)

        penalty=self.alpha*np.eye(design.shape[1])
        penalty[0,0]=0  # leave the intercept unpenalised

        coefs=np.linalg.solve(design.T@design+penalty, design.T@target)
        residuals=target-design@coefs
        return float(residuals@residuals)/n_rows

    def _best_split(self, subtree, column, current_error):
        """Return (split position, error reduction) of the best split of `subtree` on `column`, or None."""
        values=sorted(subtree[column].unique())
        total_rows=subtree.shape[0]

        best=None
        best_gain=0
        # Candidate positions are the midpoints between consecutive distinct values.
        for lower,upper in zip(values, values[1:]):
            split_pos=lower+(upper-lower)/2
            below=subtree[subtree[column]<split_pos]
            above=subtree[subtree[column]>=split_pos]
            n_below=below.shape[0]
            n_above=above.shape[0]
            if n_below<self.min_elements or n_above<self.min_elements:
                continue

            # Row-weighted average of the two child errors.
            new_error=(self.check_error(below)*n_below+self.check_error(above)*n_above)/total_rows
            gain=current_error-new_error
            if gain>best_gain:
                best=(split_pos,gain)
                best_gain=gain
        return best

    def treebuilder(self, df, tree_num=0):
        """Grow tree `tree_num` on the 'Residuals' column of `df`.

        Adds (or resets) a 'subtree_id' column on `df` in place, records the
        splits in ``self.split_conditions[tree_num]``, the depths in
        ``self.tree_depth[tree_num]`` and the leaf means in
        ``self.tree_means[tree_num]``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the feature columns and a 'Residuals' column; modified in place.
        tree_num : int
            Key under which this tree's state is stored.

        Returns
        -------
        pandas.DataFrame
            The same `df`, with 'subtree_id' holding each row's leaf id.
        """
        df['subtree_id']=0
        max_subtree_id=0

        best_splits={}   # (subtree_id, column) -> (split position, error reduction)
        searched=set()   # (subtree_id, column) pairs that were already evaluated
        self.split_conditions[tree_num]={}
        self.tree_means[tree_num]={}
        self.tree_depth[tree_num]={0: 0}

        for _ in range(self.n_iter):

            for subtree_id in df['subtree_id'].unique():
                subtree_id=int(subtree_id)
                if self.tree_depth[tree_num][subtree_id]>=self.max_depth:
                    continue

                # The rows of a subtree only change when it is split, so each
                # (subtree, column) pair needs to be searched only once.
                pending=[column for column in self.columns if (subtree_id,column) not in searched]
                if not pending:
                    continue

                subtree=df[df['subtree_id']==subtree_id]
                current_error=self.check_error(subtree)
                for column in pending:
                    searched.add((subtree_id,column))
                    candidate=self._best_split(subtree,column,current_error)
                    if candidate is not None:
                        best_splits[(subtree_id,column)]=candidate

            if not best_splits:
                # Nothing can be split any more; further rounds would not change anything.
                break

            best_subtree_id,best_column=max(best_splits, key=lambda key: best_splits[key][1])
            best_split_pos=best_splits[(best_subtree_id,best_column)][0]

            left_id=max_subtree_id+1
            right_id=max_subtree_id+2
            max_subtree_id=right_id
            child_depth=self.tree_depth[tree_num][best_subtree_id]+1

            in_parent=df['subtree_id']==best_subtree_id
            df.loc[in_parent&(df[best_column]<best_split_pos),'subtree_id']=left_id
            df.loc[in_parent&(df[best_column]>=best_split_pos),'subtree_id']=right_id
            self.tree_depth[tree_num][left_id]=child_depth
            self.tree_depth[tree_num][right_id]=child_depth

            # Forget everything cached for the subtree that was just split.
            for key in [key for key in best_splits if key[0]==best_subtree_id]:
                del best_splits[key]
            searched={key for key in searched if key[0]!=best_subtree_id}

            self.split_conditions[tree_num][best_subtree_id]=(best_column,best_split_pos,left_id,right_id)

        # Every remaining subtree id is a leaf; it predicts its mean residual.
        for subtree_id in df['subtree_id'].unique():
            leaf_mean=df[df['subtree_id']==subtree_id]['Residuals'].mean()
            self.tree_means[tree_num][int(subtree_id)]=leaf_mean

        return df

    def tree_predictor(self, df, tree_num=0):
        """Write the prediction of tree `tree_num` into a 'Predict' column of `df`.

        Each row is routed from the root (id 0) through the recorded split
        conditions until it reaches a leaf, whose mean residual becomes the
        prediction.  `df` is modified in place and also returned.
        """
        splits=self.split_conditions[tree_num]
        leaves=self.tree_means[tree_num]

        # Float column from the start, since leaf means are floats.
        df['Predict']=0.0
        for i in df.index:
            node=0
            while node not in leaves:
                column,split_pos,id_left,id_right=splits[node]
                node=id_left if df.at[i,column]<split_pos else id_right
            df.at[i,'Predict']=leaves[node]

        return df

    def tree_scorer(self,df):
        """Return R^2 = 1 - RSS/TSS, taking column 0 of `df` as actual and column 1 as predicted values."""
        actual=df.iloc[:,0].to_numpy(dtype=float)
        predicted=df.iloc[:,1].to_numpy(dtype=float)

        TSS=np.sum((actual-actual.mean())**2)
        RSS=np.sum((predicted-actual)**2)

        return 1-RSS/TSS

    def iteration_builder(self, df):
        """Fit the boosted model on `df` and return a copy carrying the fit columns.

        The features are all columns of `df` except the last one, which is
        expected to be the target 'SalePriceLog'.  The input frame is not
        modified.

        Returns
        -------
        pandas.DataFrame
            Copy of `df` with 'Base_prediction' (final training prediction),
            'Residuals' (target minus prediction) and 'subtree_id' (leaf id in
            the last tree) added.
        """
        df2=df.copy()

        # All columns but the last are features (the target is expected last).
        self.columns=list(df2.columns)[:-1]

        self.initial_mean=df2['SalePriceLog'].mean()
        df2['Base_prediction']=self.initial_mean
        df2['Residuals']=df2['SalePriceLog']-df2['Base_prediction']

        res_mean=df2['Residuals'].mean()
        print(f'Starting; residual mean at {res_mean}')

        for x in range(self.n_iterations):
            df2=self.treebuilder(df2,tree_num=x)
            df2=self.tree_predictor(df2,tree_num=x)
            df2['Base_prediction']=df2['Base_prediction']+df2['Predict']*self.learning_rate
            df2['Residuals']=df2['SalePriceLog']-df2['Base_prediction']
            df2=df2.drop(['Predict'],axis=1)

            res_mean=df2['Residuals'].mean()
            print(f'Iteration {x} complete, residual mean at {res_mean}')
        return df2

    def iteration_predictor(self, df):
        """Return a copy of `df` with the boosted prediction in a 'Predict' column.

        The prediction is the training mean plus ``learning_rate`` times the sum
        of all tree predictions.  The input frame is not modified and the
        returned copy keeps the input's index.
        """
        df2=df.copy()
        # Predict on a positional index and restore the original afterwards;
        # this avoids depending on a column literally named 'index'.
        original_index=df2.index
        df2=df2.reset_index(drop=True)

        df2['Predict_fin']=self.initial_mean
        for x in range(self.n_iterations):
            df2=self.tree_predictor(df2,tree_num=x)
            df2['Predict_fin']=df2['Predict_fin']+self.learning_rate*df2['Predict']

        df2['Predict']=df2['Predict_fin']
        df2=df2.drop('Predict_fin',axis=1)

        df2.index=original_index

        return df2
        

In [93]:
# Reproducible 70/30 train/test split on the row index.
# A seeded generator makes the split (and every score derived from it)
# repeatable across kernel restarts.
SPLIT_SEED=42
TRAIN_FRACTION=0.7
split_rng=np.random.default_rng(SPLIT_SEED)

train_indices=list(split_rng.choice(df.index, size=int(df.shape[0]*TRAIN_FRACTION),replace=False))
# One vectorised membership test; the test set is simply its complement.
in_train=df.index.isin(train_indices)
test_indices=list(df.index[~in_train])
df_train=df.loc[in_train]
df_test=df.loc[~in_train]

In [94]:
# Hyperparameters of the boosted tree model, collected in one place.
booster_params=dict(
    n_iter=12,
    n_iterations=16,
    learning_rate=0.2,
    max_depth=4,
    min_elements=5,
    alpha=1,
)
treemodel=TreeBooster(**booster_params)
# Note: df_train is replaced by the fitted frame returned by iteration_builder,
# so re-running this cell feeds that fitted frame back in.
df_train=treemodel.iteration_builder(df_train)

Starting; residual mean at 1.3717652030455834e-14
Iteration 0 complete, residual mean at 1.1125494523833933e-14
Iteration 1 complete, residual mean at 8.997071617020535e-15
Iteration 2 complete, residual mean at 7.163329559583781e-15
Iteration 3 complete, residual mean at 5.64908358094486e-15
Iteration 4 complete, residual mean at 4.347318004224885e-15
Iteration 5 complete, residual mean at 3.4875737015698747e-15
Iteration 6 complete, residual mean at 2.698656199554513e-15
Iteration 7 complete, residual mean at 2.059147058012572e-15
Iteration 8 complete, residual mean at 1.5933445224189674e-15
Iteration 9 complete, residual mean at 1.2728144757285847e-15
Iteration 10 complete, residual mean at 1.0742926403590572e-15
Iteration 11 complete, residual mean at 8.773217568283542e-16
Iteration 12 complete, residual mean at 5.08712203134414e-16
Iteration 13 complete, residual mean at 3.288017898307798e-16
Iteration 14 complete, residual mean at 3.6033781055354326e-16
Iteration 15 complete, res

In [103]:
# Move the current index into a column, predict on the hold-out rows and
# report R^2 of the predicted against the actual log price.
df_test=df_test.reset_index()
df_test2=treemodel.iteration_predictor(df_test)
treemodel.tree_scorer(df_test2.loc[:,['SalePriceLog','Predict']])

0.9024810993000432

In [113]:
# Closed-form ridge fit on the full dataset.
# Features: every column of df except the last one.
columns=list(df.columns)[:-1]

# Ridge solution: beta = (X^T X + lambda * I)^(-1) X^T y

alpha=5

# Design matrix with a leading column of ones for the intercept.
onerow=np.ones((df.shape[0],1))
X_vals=np.c_[onerow,np.matrix(df[columns])]
Y_vals=np.matrix(df['SalePriceLog'])
# Identity with a zero in the first diagonal slot: the intercept is not penalised.
Identity=np.diag([0.0]+[1.0]*(X_vals.shape[1]-1))

coefs=(X_vals.T@X_vals+alpha*Identity).I@X_vals.T@Y_vals.T


In [138]:
# Ridge prediction for the first row: feature values times slopes, plus intercept.
first_row=np.matrix(df[columns].iloc[0])
first_row@coefs[1:]+coefs[0]

matrix([[5.06988387]])

In [149]:
# Mean squared error of the ridge fit over all rows, computed as one matrix
# expression instead of building a matrix per row with a row-wise apply.
# The result is still a 1x1 matrix.
ridge_residuals=np.matrix(df['SalePriceLog']).T-(np.matrix(df[columns])*coefs[1:]+coefs[0])
ridge_residuals.T*ridge_residuals/df.shape[0]

matrix([[0.00167333]])

In [117]:
# Mean squared deviation of df['Residuals'] from `mean`.
# NOTE(review): no cell visible here adds a 'Residuals' column to `df` or assigns
# a module-level `mean`, so this presumably only ran against leftover kernel
# state -- confirm or remove. The output recorded below this cell looks like a
# coefficient vector rather than a scalar, so it is presumably stale as well.
error=sum(df['Residuals'].apply(lambda x: (x-mean)**2))/df.shape[0]
error

matrix([[ 4.84707792e+00],
        [ 1.25709774e-04],
        [ 9.52299652e-07],
        [ 2.63177651e-01],
        [ 5.38010471e-05],
        [-1.33832592e-03],
        [ 2.08744203e-02],
        [ 2.61102361e-03],
        [ 5.23383141e-03],
        [ 7.25253877e-03],
        [ 1.77698455e-02],
        [ 6.30631294e-05],
        [ 9.40928090e-05],
        [ 1.30058882e-02],
        [-1.48881599e-03],
        [-2.26297306e-03],
        [ 2.98398898e-04],
        [-8.87426585e-03],
        [ 5.26773929e-03],
        [ 3.35403279e-03],
        [ 1.25040847e-02],
        [ 4.02121256e-04],
        [-1.15841079e-02],
        [-2.24948118e-03],
        [-4.66425057e-03],
        [ 5.42355744e-04],
        [-1.01228680e-03],
        [-1.11186005e-02],
        [-1.87518680e-03],
        [ 7.45129054e-03],
        [ 3.06570978e-02],
        [-1.76349982e-03],
        [-1.59760567e-03],
        [ 1.10901976e-01],
        [ 2.37664799e-02],
        [ 2.69660120e-03],
        [-1.99343940e-02],
 

In [91]:
from sklearn.linear_model import Ridge

# Cross-check of the closed-form ridge solution against scikit-learn.
# X_vals / Y_vals are np.matrix objects; current scikit-learn versions reject
# np.matrix input, so plain ndarrays with the same shapes are passed instead.
ridge=Ridge(alpha=5)
ridge.fit(np.asarray(X_vals),np.asarray(Y_vals.T))

Ridge(alpha=5)

In [115]:
# Intercept fitted by scikit-learn; the recorded value (4.84707792) equals the
# first entry of the coefficient vector shown in the output further up.
ridge.intercept_

array([4.84707792])