# Подготовка данных для обучения

In [2]:
import pandas as pd

# CSV file that holds the full data set used throughout this notebook.
data_path = 'credit_history_all.csv'
# Read the whole file into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Remove the columns that are not passed on as model features.
# `drop` returns a new frame and `errors="ignore"` skips columns that are
# already gone, so this cell can be re-run safely (the previous `del df[...]`
# statements raised KeyError on a second run).
df = df.drop(
    columns=[
        'created_timestamp',
        'event_timestamp',
        'loan_id',
        'zipcode',
        'dob_ssn',
    ],
    errors="ignore",
)

In [4]:
# Preview the first rows of the frame after the column removal above.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,city,state,...,total_wages,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies
0,52,69996,OWN,3.0,PERSONAL,12000,11.11,0,CUMMINGS,KS,...,9617849,7649,779433,28579,3583,0,0,0,0,0
1,61,10636,RENT,3.0,PERSONAL,4000,12.21,1,CRESSON,TX,...,38291019,5101,1877182,32008,20745,6,1,3,0,0
2,36,69000,RENT,0.0,MEDICAL,15600,10.99,0,PLAINVILLE,GA,...,23478174,4223,1142978,24020,11711,3,5,1,1,2
3,36,48686,RENT,21.0,MEDICAL,12000,13.99,0,CLINTON,NC,...,35505191,6993,592528,40529,17508,1,1,0,0,0
4,36,44000,RENT,4.0,DEBTCONSOLIDATION,8000,17.49,1,NASHVILLE,TN,...,571965786,1552,1786677,26664,19749,8,2,0,1,0


Для проверки в пайплайне с энкодером

In [5]:
# Target: the `loan_status` column, kept as a one-column DataFrame.
y = df.loc[:, ["loan_status"]].copy()
# Features: every remaining column, as an independent copy of `df`.
X = df.drop(columns=["loan_status"]).copy()

In [6]:
# Keep the feature names (in frame order) as a plain Python list; they are
# sent along with each sample in the prediction requests further down.
columns = X.columns.tolist()
columns

['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_amnt',
 'loan_int_rate',
 'city',
 'state',
 'location_type',
 'tax_returns_filed',
 'population',
 'total_wages',
 'credit_card_due',
 'mortgage_due',
 'student_loan_due',
 'vehicle_loan_due',
 'hard_pulls',
 'missed_payments_2y',
 'missed_payments_1y',
 'missed_payments_6m',
 'bankruptcies']

In [7]:
# Replace the feature DataFrame with its underlying NumPy array
# (rows are later accessed positionally as X[i]).
X = X.to_numpy()
X

array([[52, 69996, 'OWN', ..., 0, 0, 0],
       [61, 10636, 'RENT', ..., 3, 0, 0],
       [36, 69000, 'RENT', ..., 1, 1, 2],
       ...,
       [24, 200000, 'MORTGAGE', ..., 0, 0, 0],
       [23, 74000, 'RENT', ..., 1, 0, 0],
       [23, 12996, 'OWN', ..., 0, 0, 2]], dtype=object)

In [8]:
# Replace the target DataFrame with its underlying NumPy array
# (one row per sample, read later as y[i][0]).
y = y.to_numpy()
y

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [1]])

In [9]:
# Number of target rows, i.e. the number of samples available.
len(y)

28638

# Инициализируем модели

In [196]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder

class DTCModel(object):
    """Wrapper around a pre-trained decision tree classifier and its encoder.

    Exposes ``predict``, ``send_feedback`` and ``metrics`` and keeps running
    feedback statistics (confusion-matrix counters, tries, successes and the
    success ratio) on the instance.
    NOTE(review): the method set looks like a Seldon Core Python model
    component — confirm against the deployment.
    """

    # Value of the "branch_name" tag attached to every metric of this model.
    BRANCH_NAME = "DTC"

    def __init__(self):
        """Load the encoder and the model from disk and reset the counters."""
        # Imported locally so the class does not depend on some other notebook
        # cell having imported joblib first.
        import joblib

        # Columns that are run through the encoder before prediction.
        self.categorical_features = [
            "person_home_ownership",
            "loan_intent",
            "city",
            "state",
            "location_type",
        ]

        # NOTE: joblib.load unpickles arbitrary objects — only load artifacts
        # that come from a trusted source.
        self.encoder = joblib.load("encoder.pkl")

        print("Encoder loaded")

        self.model = joblib.load("DTC.pkl")

        print("Model loaded")

        # Confusion-matrix counters, updated by send_feedback.
        self.cm = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

        self.tries = 0    # number of feedback calls received
        self.success = 0  # number of feedback calls with a truthy reward
        self.value = 0    # success / tries

    def predict(self, X, features_names):
        """Encode the categorical columns of ``X`` and return model predictions.

        Args:
            X: Row-oriented data accepted by ``pd.DataFrame`` (e.g. a list of rows).
            features_names: Column names for ``X``; must include every entry
                of ``self.categorical_features``.

        Returns:
            Whatever ``self.model.predict`` returns for the prepared frame.
        """
        df = pd.DataFrame(X, columns=features_names)

        df[self.categorical_features] = self.encoder.transform(df[self.categorical_features])
        # Columns are passed to the model in alphabetical order.
        # NOTE(review): presumably the order used at training time — confirm.
        df = df.reindex(sorted(df.columns), axis=1)

        predictions = self.model.predict(df)

        return predictions

    def send_feedback(self, features, feature_names, reward, truth, routing=None):
        """Update the running statistics from one feedback message.

        ``reward`` 1 counts as a correct prediction and 0 as a wrong one;
        combined with ``truth`` it selects the confusion-matrix cell.
        ``features``, ``feature_names`` and ``routing`` are accepted but unused.
        """
        print("DTC model send-feedback entered")
        print(f"Truth: {truth}, Reward: {reward}")

        if reward == 1:
            # Prediction matched the truth.
            if truth == 1:
                self.cm["tp"] += 1
            if truth == 0:
                self.cm["tn"] += 1
        if reward == 0:
            # Prediction differed from the truth, so the truth value tells
            # which kind of error it was.
            if truth == 1:
                self.cm["fn"] += 1
            if truth == 0:
                self.cm["fp"] += 1

        self.tries += 1
        self.success = self.success + 1 if reward else self.success
        self.value = self.success / self.tries

        print(self.cm)
        # print() does not interpolate %-placeholders by itself; format first.
        print(
            "Tries: %s, successes: %s, values: %s" % (self.tries, self.success, self.value)
        )

    def _gauge(self, key, value):
        """Build one GAUGE metric dict tagged with this model's branch name."""
        return {
            "type": "GAUGE",
            "key": key,
            "value": value,
            "tags": {"branch_name": self.BRANCH_NAME},
        }

    def metrics(self):
        """Return the current counters as a list of GAUGE metric dicts.

        Order: true positives, true negatives, false positives, false
        negatives, success ratio, successes, tries.
        """
        return [
            self._gauge("true_pos_total", self.cm["tp"]),
            self._gauge("true_neg_total", self.cm["tn"]),
            self._gauge("false_pos_total", self.cm["fp"]),
            self._gauge("false_neg_total", self.cm["fn"]),
            self._gauge("branch_value", self.value),
            self._gauge("n_success_total", self.success),
            self._gauge("n_tries_total", self.tries),
        ]

In [197]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

class GBCModel(object):
    """Wrapper around a pre-trained gradient boosting classifier and its encoder.

    Exposes ``predict``, ``send_feedback`` and ``metrics`` and keeps running
    feedback statistics (confusion-matrix counters, tries, successes and the
    success ratio) on the instance.
    NOTE(review): the method set looks like a Seldon Core Python model
    component — confirm against the deployment.
    """

    # Value of the "branch_name" tag attached to every metric of this model.
    BRANCH_NAME = "GBC"

    def __init__(self):
        """Load the encoder and the model from disk and reset the counters."""
        # Imported locally so the class does not depend on some other notebook
        # cell having imported joblib first.
        import joblib

        # Columns that are run through the encoder before prediction.
        self.categorical_features = [
            "person_home_ownership",
            "loan_intent",
            "city",
            "state",
            "location_type",
        ]

        # NOTE: joblib.load unpickles arbitrary objects — only load artifacts
        # that come from a trusted source.
        self.encoder = joblib.load("encoder.pkl")

        print("Encoder loaded")

        self.model = joblib.load("GBC.pkl")

        print("Model loaded")

        # Confusion-matrix counters, updated by send_feedback.
        self.cm = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

        self.tries = 0    # number of feedback calls received
        self.success = 0  # number of feedback calls with a truthy reward
        self.value = 0    # success / tries

    def predict(self, X, features_names):
        """Encode the categorical columns of ``X`` and return model predictions.

        The predictions are also printed before being returned.

        Args:
            X: Row-oriented data accepted by ``pd.DataFrame`` (e.g. a list of rows).
            features_names: Column names for ``X``; must include every entry
                of ``self.categorical_features``.

        Returns:
            Whatever ``self.model.predict`` returns for the prepared frame.
        """
        df = pd.DataFrame(X, columns=features_names)

        df[self.categorical_features] = self.encoder.transform(df[self.categorical_features])
        # Columns are passed to the model in alphabetical order.
        # NOTE(review): presumably the order used at training time — confirm.
        df = df.reindex(sorted(df.columns), axis=1)

        predictions = self.model.predict(df)

        print(predictions)

        return predictions

    def send_feedback(self, features, feature_names, reward, truth, routing=None):
        """Update the running statistics from one feedback message.

        ``reward`` 1 counts as a correct prediction and 0 as a wrong one;
        combined with ``truth`` it selects the confusion-matrix cell.
        ``features``, ``feature_names`` and ``routing`` are accepted but unused.
        """
        print("GBC model send-feedback entered")
        print(f"Truth: {truth}, Reward: {reward}")

        if reward == 1:
            # Prediction matched the truth.
            if truth == 1:
                self.cm["tp"] += 1
            if truth == 0:
                self.cm["tn"] += 1
        if reward == 0:
            # Prediction differed from the truth, so the truth value tells
            # which kind of error it was.
            if truth == 1:
                self.cm["fn"] += 1
            if truth == 0:
                self.cm["fp"] += 1

        self.tries += 1
        self.success = self.success + 1 if reward else self.success
        self.value = self.success / self.tries

        print(self.cm)
        # print() does not interpolate %-placeholders by itself; format first.
        print(
            "Tries: %s, successes: %s, values: %s" % (self.tries, self.success, self.value)
        )

    def _gauge(self, key, value):
        """Build one GAUGE metric dict tagged with this model's branch name."""
        return {
            "type": "GAUGE",
            "key": key,
            "value": value,
            "tags": {"branch_name": self.BRANCH_NAME},
        }

    def metrics(self):
        """Return the current counters as a list of GAUGE metric dicts.

        Order: true positives, true negatives, false positives, false
        negatives, success ratio, successes, tries.
        """
        return [
            self._gauge("true_pos_total", self.cm["tp"]),
            self._gauge("true_neg_total", self.cm["tn"]),
            self._gauge("false_pos_total", self.cm["fp"]),
            self._gauge("false_neg_total", self.cm["fn"]),
            self._gauge("branch_value", self.value),
            self._gauge("n_success_total", self.success),
            self._gauge("n_tries_total", self.tries),
        ]

# Цикл тестирования 

In [198]:
import joblib

# Build one instance of each wrapper; every constructor loads its encoder and
# model from disk and prints a confirmation line for each.
dtc, gbc = DTCModel(), GBCModel()

Encoder loaded
Model loaded
Encoder loaded
Model loaded


In [11]:
import numpy as np
import requests

# Number of samples to replay; capped so X[i] / y[i] never run past the data.
n = min(100, len(X))

# Loop-invariant settings, defined once instead of on every iteration.
# NOTE(review): `port` is not used by the URLs below (they target port 8000) —
# confirm whether it is still needed.
port = "30636"
url = "http://eg-experiment-eg-2.seldon-mesh.svc.cluster.local:8000/api/v1.0/predictions"  # TODO: fill in for the target deployment
url_feedback = "http://eg-experiment-eg-2.seldon-mesh.svc.cluster.local:8000/api/v1.0/feedback"
# Seconds to wait for each HTTP call; without a timeout requests.post can
# block indefinitely if the service does not answer.
request_timeout = 30

for i in range(n):
    print(f"Processed {i+1}/{n} samples", flush=True)

    # Local test of the model classes (no server), kept for reference:
    #
    #     print([X[i].tolist()])
    #     pred = dtc.predict([X[i].tolist()], columns)
    #     print(pred)
    #
    #     truth_val = int(y[i][0])
    #
    #     reward = int(pred == truth_val)
    #     print(reward)
    #
    #     _ = dtc.send_feedback([], [], reward, truth_val)
    #
    #     reward = int(pred == truth_val)

    # Test against the server: send one sample for prediction ...
    req = {"data": {"ndarray": [X[i].tolist()], "names": columns}}

    res_raw = requests.post(url, json=req, timeout=request_timeout)
    # Surface HTTP failures directly instead of failing later while parsing
    # or indexing an error payload.
    res_raw.raise_for_status()
    res = res_raw.json()
    print("Respones: ", res)
    pred = res["data"]["ndarray"][0]

    # ... compare it with the known label ...
    truth_val = int(y[i][0])
    reward = int(pred == truth_val)  # 1 when the prediction was correct, else 0
    truth = [truth_val]

    # ... and report the outcome back through the feedback endpoint.
    feedback = {
        "request": req,
        "response": res,
        "reward": reward,
        "truth": {"data": {"ndarray": truth}},
    }
    res_raw = requests.post(url_feedback, json=feedback, timeout=request_timeout)
    print(res_raw)
    assert res_raw.status_code == 200, f"feedback request failed with status {res_raw.status_code}"


Processed 1/100 samples
Respones:  {'data': {'names': [], 'ndarray': [0]}, 'meta': {'metrics': [{'key': 'true_pos_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 55}, {'key': 'true_neg_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 196}, {'key': 'false_pos_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 0}, {'key': 'false_neg_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 0}, {'key': 'branch_value', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 1}, {'key': 'n_success_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 251}, {'key': 'n_tries_total', 'tags': {'branch_name': 'DTC', 'method': 'predict'}, 'type': 'GAUGE', 'value': 251}], 'requestPath': {'dtc-model': 'blcox/dtc-model:v11.0'}, 'routing': {'dtc-model': -1, 'eg-router': 0}}}
<Response [200]>
Processed 2/10

In [None]:
# Scratch check: evaluates to the built-in `list` type. Not used by the rest of the notebook.
type([])