In [52]:
import pandas as pd
import random
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

In [53]:
class DenisTree:
    
    def __init__(self):
        self.class_count = 2
        self.model = {}
    
    def h(self, p):
            h = 0 
            for k in range(2):
                h += p[k] * (1-p[k])
            return h


    def p(self, y):
            p = []
            for i in range(self.class_count):
                p.append(0)
                for Y in y:
                    if Y == i:
                        p[i] += 1 

                p[i] = p[i] / len(y)       

            return p


    def gini(self, slice, df):
        """
        Gini impurity
        """

        # for i in range(class_count):       
        q_left = df[df[df.columns[0]] < slice]['y']     
        q_right = df[df[df.columns[0]] >= slice]['y']

        if len(q_left) == 0 or len(q_right) == 0:
            # not a slice
            return [-1, 1, 1]

        p_left = self.p(q_left)
        h_left = self.h(p_left)

        p_right = self.p(q_right)
        h_right = self.h(p_right)

        q_left = len(q_left) / len(df)
        q_right = len(q_right) / len(df)       

        IG = self.h(self.p(df['y'])) - q_left * h_left - q_right * h_right

        #print(f'GINI || Q_LEFT: {q_left} | Q_RIGHT: {q_right} | P_LEFT: {p_left}')       

        return [IG, h_left, h_right]


    def best_gini(self, feature, y):
        best = -1
        best_slice = -1
        best_left = 1
        best_right = 1

        # can be no devide opportunity
        if feature.min() != feature.max():
            df = feature.to_frame()
            df['y'] = y

            # 100 попыток найти лучший gini
            for i in range(10):
                slice = random.uniform(feature.min(), feature.max())
                g = self.gini(slice, df)

                if g[0] > best:
                    best = g[0]
                    best_slice = slice
                    best_left = g[1]
                    best_right = g[2]

        result = {feature.name: {'gini': best, 'slice': best_slice, 'left': best_left, 'right': best_right}}
        #print(result)
        return result


    def node(self, X, y):  
        # apply to every feature
        features_gini = X.apply(lambda x: self.best_gini(x, y))

        best_feature = {'gini': 0}

        df = X
        df['y'] = y

        for f in features_gini:
            for key, value in f.items():
                if value['gini'] > best_feature['gini']:
                    best_feature['name'] = key
                    best_feature['slice'] = value['slice']
                    best_feature['gini'] = value['gini']
                    best_feature['h_left'] = value['left']
                    best_feature['h_right'] = value['right']

                # we have the final node!
                if value['gini'] == 0:
                    left = df[df[key] < value['slice']]
                    dleft = left['y'].iloc[0]
                    right = df[df[key] >= value['slice']]
                    dright = right['y'].iloc[0]
                    print('STOP!!!')
                    print(f'Left way is desicion: Class #{dleft}')
                    print(f'Right way is desicion: Class #{dright}')

                    return {'feature': key, 'slice': value['slice'], 'left': dleft, 'right': dright} 


        # still there is 
        #if best_feature['gini'] > 0 :
        print('Best step feature: ' + str(best_feature['name']) + ' | slice: ' + str(best_feature['slice']) + ' | GINI: '  + str(best_feature['gini']))

        #save model
        model = {'feature': best_feature['name'], 'slice': best_feature['slice']} 

        # left ветка
        left = df[df[best_feature['name']] < best_feature['slice']]
        if best_feature['h_left'] == 0:
            # decision
            decision = left['y']
            try:
                decision = decision.iloc[0]
            except:
                print(df[best_feature['name']])
                print('Trying to slice ', best_feature['slice'])
                print('WRONG ', decision)
                input()
            print(f'Left way is desicion: Class #{decision}')
            model['left'] = decision
        else:
            model['left'] = self.node(left.drop('y', axis=1), left['y'])

        # right ветка
        right = df[df[best_feature['name']] >= best_feature['slice']]
        if best_feature['h_right'] == 0:
            # decision
            decision = right['y']
            try:
                decision = decision.iloc[0]
            except:
                print(decision)
            print(f'Right way is desicion: Class #{decision}')
            model['right'] = decision
        else:
            model['right'] = self.node(right.drop('y', axis=1), right['y'])

        return model


    def fit(self, X, y):
        self.class_count = len(y.value_counts())
        print('Class count ', self.class_count)
        self.model = self.node(X,y)
        print('==== Model ready!')
        
    def __predict(self, model, X):
        # left 
        if X[model['feature']] < model['slice']:
            # next node
            if type(model['left']) == dict:
                return self.__predict(model['left'], X)
            else:
                return model['left']

        # Right
        else:
            if type(model['right']) == dict:
                return self.__predict(model['right'], X)
            else:
                return model['right']

    def predict(self, X):
        """
        And know we will predict class
        """   
        return self.__predict(self.model, X)
    

# Ирисы

In [87]:
# pental data

from sklearn import datasets
iris = datasets.load_iris()

X = pd.DataFrame(iris.data)
y = pd.Series(iris.target)

# Данные от такси

Load data from my qualification work. See github Midriaz/python

In [81]:
# simple prediciton. taxi driver response base on service class
dset = pd.read_csv('taxi_tree.csv')
X = dset.drop(['driver_response', 'close_to_driver', 'close_to_client'], axis = 1)
y = dset['driver_response']

#min_max_scaler = preprocessing.MinMaxScaler()
#X = pd.DataFrame(min_max_scaler.fit_transform(X))

# Учим нашу модель

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DenisTree()
model.fit(X_train, y_train)

Class count  3
Best step feature: 2 | slice: 1.9932106102239096 | GINI: 0.2805902777777778
Left way is desicion: Class #0
Best step feature: 2 | slice: 4.748384207151906 | GINI: 0.18244902969830298
Best step feature: 3 | slice: 1.6118637965068263 | GINI: 0.02629656683710733


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Left way is desicion: Class #1
Right way is desicion: Class #2
Best step feature: 3 | slice: 1.774215808380236 | GINI: 0.033655257668237654
Best step feature: 2 | slice: 5.1486793998163165 | GINI: 0.08333333333333331
Best step feature: 1 | slice: 2.418637802927986 | GINI: 0.0888888888888889
Left way is desicion: Class #2
Best step feature: 2 | slice: 5.041636874585465 | GINI: 0.05999999999999997
Left way is desicion: Class #1
Best step feature: 0 | slice: 6.203667925467657 | GINI: 0.25
Left way is desicion: Class #1
Right way is desicion: Class #2
Right way is desicion: Class #2
Best step feature: 0 | slice: 6.040673028486875 | GINI: 0.0027551020408163283
Best step feature: 1 | slice: 3.035416677242385 | GINI: 0.109375
Left way is desicion: Class #2
Right way is desicion: Class #1
Right way is desicion: Class #2
==== Model ready!


# OUR PREDICTION

In [89]:
def convert_x(_x, X):
    """
    We need dictionary to pass in predict function
    """
    result = {}
    
    col = [x for x in X.columns]
    
    for i in range(len(col)):
        result[col[i]] = _x[i]
    
    return result
        
    
pred = X_test.copy()
pred['y'] = pred.apply(lambda x: model.predict(convert_x(x, X_test)), axis=1)

# Скорим нашу модель

In [90]:
print('Metrics')
print('F1: ', f1_score(y_test, pred['y'], average="macro"))
print('Accuracy score', accuracy_score(y_test, pred['y']))
print('Precision score', precision_score(y_test, pred['y'], average="macro"))
print('Recall score', recall_score(y_test, pred['y'], average="macro"))

Metrics
F1:  1.0
Accuracy score 1.0
Precision score 1.0
Recall score 1.0


## Неплохая точность с учетом того, что мы брали только 1% выборки для обучения

Точно не скажешь, что модель переучена :)

# Вопросы к преподавателю

Правильно ли я ставлю условия выхода? По Gini = 0, Hleft = 0, hright = 0?