In [1]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# рандомный дата сет и его разделение на обуч/тест
X, y = make_blobs(n_samples=100, centers=5, n_features=5, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)
print('Model is fitted')

Model is fitted


In [2]:
X_train[:5]

array([[  0.02893715,   5.03230134, -11.60907632,  -3.89325081,
         -6.60246139],
       [ -1.03699711,   4.83405194,  -9.9237029 ,  -4.40651715,
         -6.46072854],
       [  5.28698894,   8.65679949,  -4.38161   ,   4.07961662,
          7.98826864],
       [  3.104177  ,   0.17186983,   0.81160271,  -7.3938379 ,
         -6.98062579],
       [ -1.87147283,   2.8597658 ,  -8.95251188,  -2.94297307,
         -6.99404554]])

In [3]:
import pandas as pd

df = pd.DataFrame(data=X_train, columns=['x1', 'x2', 'x3', 'x4', 'x5'])
df.head(5)

Unnamed: 0,x1,x2,x3,x4,x5
0,0.028937,5.032301,-11.609076,-3.893251,-6.602461
1,-1.036997,4.834052,-9.923703,-4.406517,-6.460729
2,5.286989,8.656799,-4.38161,4.079617,7.988269
3,3.104177,0.17187,0.811603,-7.393838,-6.980626
4,-1.871473,2.859766,-8.952512,-2.942973,-6.994046


In [4]:
df['x1'].unique()

array([ 0.02893715, -1.03699711,  5.28698894,  3.104177  , -1.87147283,
       -9.85996586, -1.00327583, -2.40751365, -1.32054642,  3.23725023,
        4.13010907, -8.14826765,  5.69588913, -7.93554555,  3.43036148,
       -2.25744388,  4.21565006, -1.12574899, -9.55778871, -6.64637746,
        3.61215626,  2.68355961, -1.45074656, -0.2925272 , -0.05205286,
       -1.20460766,  4.03501569, -8.21370145,  6.45505482,  5.38802673,
       -7.24577694, -2.20251282, -7.37608159, -5.76802229,  4.52051058,
       -1.24022528,  7.06802541, -0.94197008,  2.89260577, -1.38384186,
       -5.94131323, -7.92039581,  5.50409524,  4.36261415, -2.41200275,
        3.34257243,  4.18318307, -8.04001006, -2.78915507, -0.78614167,
        4.93677769, -7.22923511, -8.26407257, -7.81492334,  6.73677914,
       -2.06953728,  3.09556326,  3.067091  , -0.55049232, -7.74053793,
       -0.40950986, -0.97472342,  5.81459149, -2.96484623,  4.77302336,
        7.33348264, -0.74236389])

# что "под капотом"

In [5]:
model.coef_

array([[-0.07859722,  0.22289687, -0.5211671 , -0.33432242, -0.12508447],
       [-0.34726567, -0.33893886,  0.04533889,  0.01851502,  0.24391597],
       [-0.05655281,  0.04065792,  0.06595581,  0.47792999, -0.30079391],
       [ 0.30455425, -0.12659892,  0.36888877, -0.28606693, -0.15174447],
       [ 0.17786144,  0.20198299,  0.04098362,  0.12394434,  0.33370689]])

In [6]:
list(zip(model.coef_, ['x1', 'x2', 'x3', 'x4', 'x5']))

[(array([-0.07859722,  0.22289687, -0.5211671 , -0.33432242, -0.12508447]),
  'x1'),
 (array([-0.34726567, -0.33893886,  0.04533889,  0.01851502,  0.24391597]),
  'x2'),
 (array([-0.05655281,  0.04065792,  0.06595581,  0.47792999, -0.30079391]),
  'x3'),
 (array([ 0.30455425, -0.12659892,  0.36888877, -0.28606693, -0.15174447]),
  'x4'),
 (array([0.17786144, 0.20198299, 0.04098362, 0.12394434, 0.33370689]), 'x5')]

In [7]:
model.intercept_

array([-2.72207627,  1.44814412, -0.62821412,  1.44892504,  0.45322123])

In [8]:
from skompiler import skompile
expr = skompile(model.predict)
sql = expr.to('sqlalchemy/postgresql')
print(sql)



WITH _tmp1 AS 
(SELECT data.id AS __id__, -0.07859721568507655 * x1 + 0.22289687068059238 * x2 + -0.5211670965050653 * x3 + -0.33432242359685665 * x4 + -0.12508447250035992 * x5 + -2.7220762685789914 AS f1, -0.3472656746645461 * x1 + -0.3389388614703779 * x2 + 0.04533889093339693 * x3 + 0.018515018915193963 * x4 + 0.24391596523719872 * x5 + 1.4481441208638028 AS f2, -0.0565528060318782 * x1 + 0.04065792409477134 * x2 + 0.06595581245037392 * x3 + 0.47792999226088473 * x4 + -0.30079391119014287 * x5 + -0.6282141173677712 AS f3, 0.3045542523776756 * x1 + -0.12659891935192882 * x2 + 0.36888877291846 * x3 + -0.2860669264035789 * x4 + -0.1517444702028635 * x5 + 1.4489250400041591 AS f4, 0.17786144400379647 * x1 + 0.20198298604690426 * x2 + 0.04098362020280504 * x3 + 0.12394433882426911 * x4 + 0.3337068886561315 * x5 + 0.45322122507869983 AS f5 
FROM data), 
_tmp2 AS 
(SELECT _tmp1.__id__ AS __id__, greatest(greatest(greatest(greatest(_tmp1.f1, _tmp1.f2), _tmp1.f3), _tmp1.f4), _tmp1.f5) AS _m

# Делаем SQL скрипт из Дерева

In [None]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("SELECT")

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print(node)
            print ("CASE WHEN {} <= {}".format(name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("CASE WHEN {} > {}".format(name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}THEN {}".format(indent, tree_.value[node]))

    recurse(0, 1)
    
    print('FROM')

In [None]:
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
tree_to_code(dt, ['x1', 'x2', 'x3', 'x4', 'x5'])

In [None]:
from skompiler import skompile
expr = skompile(model.predict)
sql = expr.to('sqlalchemy/postgresql')
print(sql)