In [16]:
from DecisionTree import DecisionTreeRegressor
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

In [17]:
## convert text file into matrix
def file2matrix(filename , header = True, delimeter="\t", index_y = None):
    """
    Load a delimited numeric text file into a feature matrix.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    header : bool, default True
        When truthy, the first line is taken as the column names and is not
        parsed as data.
    delimeter : str, default tab character
        Field separator (the parameter keeps its original spelling so that
        existing keyword calls keep working).
    index_y : int or None, default None
        Position of the target column inside each row; negative indices are
        allowed. When given, that field is removed from the features and
        collected separately.

    Returns
    -------
    colName : list of str
        The header fields, or an empty list when ``header`` is falsy.
    returnMatX : numpy.ndarray
        Float matrix with one row per data line. Empty or whitespace-only
        fields are stored as ``np.nan``.
    classLabelVector : list of float
        The target values, or an empty list when ``index_y`` is None.

    Raises
    ------
    ValueError
        If a non-empty feature field or any target field cannot be converted
        to float, or if a row does not fit the width of the matrix.
    IndexError
        If ``index_y`` is out of range for a row.
    """
    # Read the file once inside a context manager so the handle is always
    # closed (the data is needed in full anyway to size the matrix).
    with open(filename) as fr:
        lines = fr.readlines()

    colName = []
    dataLines = lines
    if header:
        headerLine = lines[0] if lines else ""
        colName = list(headerLine.strip().split(delimeter))
        dataLines = lines[1:]

    # Size the matrix from the first data row, falling back to the header
    # line when the file holds no data rows.
    if dataLines:
        widthSource = dataLines[0]
    else:
        widthSource = lines[0] if lines else ""
    numberOfX = len(widthSource.rstrip("\r\n").split(delimeter))
    if index_y is not None:
        numberOfX -= 1  # the target column is not a feature

    returnMatX = np.zeros((len(dataLines), numberOfX))
    classLabelVector = []
    for index, line in enumerate(dataLines):
        # Strip only the line terminator: a blanket strip() would also eat
        # leading/trailing delimiters and silently drop empty edge fields.
        listFromLine = line.rstrip("\r\n").split(delimeter)
        if index_y is not None:
            classLabelVector.append(float(listFromLine.pop(index_y)))
        returnMatX[index, :] = [
            float(field) if field.strip() else np.nan for field in listFromLine
        ]
    return colName, returnMatX, classLabelVector

In [18]:
## Accuracy measures 
def compute_error(trues,predicted):
    """
    Compute regression accuracy measures for a set of predictions.

    Parameters
    ----------
    trues : array-like
        Observed target values.
    predicted : array-like
        Predicted values, one per observation.

    Both inputs are converted to flat float arrays, so plain lists, column
    vectors and integer (including unsigned) arrays are accepted.

    Returns
    -------
    tuple
        ``(corr, mae, rae, rmse, r2)``: Pearson correlation, mean absolute
        error, relative absolute error (absolute error divided by the
        absolute deviation of ``trues`` from its mean), root mean squared
        error, and the coefficient of determination clipped below at 0.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    # Float conversion avoids unsigned wrap-around on subtraction; ravel()
    # prevents an (n, 1) vs (n,) pair from broadcasting to an (n, n) matrix.
    trues = np.asarray(trues, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    if trues.shape != predicted.shape:
        raise ValueError(
            "trues and predicted must have the same number of values "
            "(got %d and %d)" % (trues.size, predicted.size)
        )

    residuals = predicted - trues
    deviations = trues - np.mean(trues)

    corr = np.corrcoef(predicted, trues)[0, 1]
    mae = np.mean(np.abs(residuals))
    rae = np.sum(np.abs(residuals)) / np.sum(np.abs(deviations))
    rmse = np.sqrt(np.mean(residuals ** 2))
    # Negative R2 (worse than predicting the mean) is reported as 0.
    r2 = max(0, 1 - np.sum(residuals ** 2) / np.sum(deviations ** 2))
    return corr, mae, rae, rmse, r2

In [19]:
# Load the data file: no header row, tab-separated, target in the last column.
filename = 'abalone.txt'
colName, returnMatX, target = file2matrix(filename , header = False, delimeter="\t", index_y = -1)
X = returnMatX
y = np.array(target)
# Hold out 33% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition and the same metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
# Fit the regression tree on the training split, then dump the learned tree.
tree_params = {"min_size": 3, "tol_error": 10, "criterion": "mae"}
model = DecisionTreeRegressor(**tree_params)
model.fit(X_train, y_train)
pprint(model.trees_)

{'col': 7,
 'cut_off': 0.145,
 'index_col': 7,
 'left': {'col': 7,
          'cut_off': 0.0675,
          'index_col': 7,
          'left': {'col': 6,
                   'cut_off': 0.0275,
                   'index_col': 6,
                   'left': {'col': 4,
                            'cut_off': 0.0525,
                            'index_col': 4,
                            'left': 4.066666666666666,
                            'right': 5.5,
                            'samples': 131},
                   'right': {'col': 7,
                             'cut_off': 0.048,
                             'index_col': 7,
                             'left': 6.321428571428571,
                             'right': 6.969072164948454,
                             'samples': 153},
                   'samples': 284},
          'right': {'col': 0,
                    'cut_off': 0.0,
                    'index_col': 0,
                    'left': 9.25925925925926,
                    'right': {'

In [34]:
# Score the fitted tree on the held-out split.
# RAE is returned by compute_error but is not part of the printed report.
pred = model.prediction(X_test)
corr, MAE, RAE, RMSE, R2 = compute_error(y_test, pred)
metrics_report = "\nCorrCoef: %.3f\nMAE: %.3f\nRMSE: %.3f\nR2: %.3f" % (corr, MAE, RMSE, R2)
print(metrics_report)


CorrCoef: 0.710
MAE: 1.569
RMSE: 2.217
R2: 0.496


In [35]:
## post_pruning
# Build the pruning set as the feature columns with the target appended as
# the last column, then prune the fitted tree against it.
# NOTE(review): X_test / y_test are also the rows used for scoring in the
# evaluation cells, so metrics measured after pruning may be optimistic --
# consider a separate validation split.
tree = model.trees_
testData = np.column_stack((X_test, y_test))
pruned_tree = model.prune(tree, testData)
pprint(pruned_tree)

merging
merging
merging
merging
merging
{'col': 7,
 'cut_off': 0.145,
 'index_col': 7,
 'left': {'col': 7,
          'cut_off': 0.0675,
          'index_col': 7,
          'left': {'col': 6,
                   'cut_off': 0.0275,
                   'index_col': 6,
                   'left': {'col': 4,
                            'cut_off': 0.0525,
                            'index_col': 4,
                            'left': 4.066666666666666,
                            'right': 5.5,
                            'samples': 131},
                   'right': {'col': 7,
                             'cut_off': 0.048,
                             'index_col': 7,
                             'left': 6.321428571428571,
                             'right': 6.969072164948454,
                             'samples': 153},
                   'samples': 284},
          'right': {'col': 0,
                    'cut_off': 0.0,
                    'index_col': 0,
                    'left': 9.2592592

In [36]:
# Re-score on the held-out split after the pruning step.
# NOTE(review): presumably prune() updated the model's tree in place, since
# the recorded metrics differ from the pre-pruning run -- confirm in DecisionTree.
pred = model.prediction(X_test)
corr, MAE, RAE, RMSE, R2 = compute_error(y_test, pred)
report_values = (corr, MAE, RMSE, R2)
print("\nCorrCoef: %.3f\nMAE: %.3f\nRMSE: %.3f\nR2: %.3f" % report_values)


CorrCoef: 0.711
MAE: 1.564
RMSE: 2.209
R2: 0.499
