# Подготовка данных для обучения

In [1]:
import pandas as pd

# Relative path of the CSV that is loaded into the working DataFrame `df`.
data_path = 'data/credit_history_all.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [2]:
# Remove these five columns from `df` in place, one at a time and in this order.
for _dropped in ('created_timestamp', 'event_timestamp', 'loan_id', 'zipcode', 'dob_ssn'):
    del df[_dropped]

In [3]:
# Preview the first five rows (5 is the default row count of DataFrame.head).
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,city,state,...,total_wages,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies
0,52,69996,OWN,3.0,PERSONAL,12000,11.11,0,CUMMINGS,KS,...,9617849,7649,779433,28579,3583,0,0,0,0,0
1,61,10636,RENT,3.0,PERSONAL,4000,12.21,1,CRESSON,TX,...,38291019,5101,1877182,32008,20745,6,1,3,0,0
2,36,69000,RENT,0.0,MEDICAL,15600,10.99,0,PLAINVILLE,GA,...,23478174,4223,1142978,24020,11711,3,5,1,1,2
3,36,48686,RENT,21.0,MEDICAL,12000,13.99,0,CLINTON,NC,...,35505191,6993,592528,40529,17508,1,1,0,0,0
4,36,44000,RENT,4.0,DEBTCONSOLIDATION,8000,17.49,1,NASHVILLE,TN,...,571965786,1552,1786677,26664,19749,8,2,0,1,0


Для проверки в пайплайне с энкодером

In [4]:
# Target: the "loan_status" column kept as a one-column DataFrame.
y = df.loc[:, ["loan_status"]].copy()
# Features: every remaining column, in the original column order.
X = df.drop(columns=["loan_status"]).copy()

In [5]:
# Feature names in column order; later sent alongside each row as "names".
columns = X.columns.tolist()
columns

['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_amnt',
 'loan_int_rate',
 'city',
 'state',
 'location_type',
 'tax_returns_filed',
 'population',
 'total_wages',
 'credit_card_due',
 'mortgage_due',
 'student_loan_due',
 'vehicle_loan_due',
 'hard_pulls',
 'missed_payments_2y',
 'missed_payments_1y',
 'missed_payments_6m',
 'bankruptcies']

In [6]:
# Replace the feature DataFrame with its underlying NumPy array.
X = X.to_numpy()
X

array([[52, 69996, 'OWN', ..., 0, 0, 0],
       [61, 10636, 'RENT', ..., 3, 0, 0],
       [36, 69000, 'RENT', ..., 1, 1, 2],
       ...,
       [24, 200000, 'MORTGAGE', ..., 0, 0, 0],
       [23, 74000, 'RENT', ..., 1, 0, 0],
       [23, 12996, 'OWN', ..., 0, 0, 2]], dtype=object)

In [7]:
# Replace the target DataFrame with its underlying NumPy array (one column).
y = y.to_numpy()
y

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [1]], dtype=int64)

In [8]:
# Number of rows in the target array.
y.shape[0]

28638

# Инициализируем модели

In [9]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder

class DTCModel(object):
    """Wrap a pickled decision-tree classifier together with its encoder.

    Exposes ``predict``, ``send_feedback`` and ``metrics``. The last two
    maintain and report running confusion-matrix counters and a success
    rate for this model branch.
    """

    def __init__(self):
        """Load ``encoder.pkl`` and ``DTC.pkl`` (relative paths) and zero the counters."""
        # Imported locally so the class does not rely on some other cell
        # having imported joblib before the first instantiation.
        import joblib

        # Columns that are passed through the encoder before prediction.
        self.categorical_features = [
            "person_home_ownership",
            "loan_intent",
            "city",
            "state",
            "location_type",
        ]

        self.encoder = joblib.load("encoder.pkl")
        print("Encoder loaded")

        self.model = joblib.load("DTC.pkl")
        print("Model loaded")

        self._reset_feedback_state()

    def _reset_feedback_state(self):
        """Zero the confusion-matrix counters and the running success statistics."""
        self.cm = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}
        self.tries = 0
        self.success = 0
        self.value = 0

    def _gauge(self, key, value):
        """Build one GAUGE metric dict tagged with this branch's name."""
        return {
            "type": "GAUGE",
            "key": key,
            "value": value,
            "tags": {"branch_name": "DTC"},
        }

    def predict(self, X, features_names):
        """Return the model's predictions for the rows in ``X``.

        Args:
            X: Row data accepted by ``pd.DataFrame`` (the notebook passes a
                list of row lists).
            features_names: Column name for each position in a row of ``X``;
                must include every entry of ``self.categorical_features``.

        Returns:
            Whatever ``self.model.predict`` returns for the encoded frame.
        """
        df = pd.DataFrame(X, columns=features_names)

        df[self.categorical_features] = self.encoder.transform(df[self.categorical_features])
        # Columns are put in alphabetical order before prediction —
        # presumably the order the model was fitted with; verify against training.
        df = df.reindex(sorted(df.columns), axis=1)

        return self.model.predict(df)

    def send_feedback(self, features, feature_names, reward, truth, routing=None):
        """Fold one feedback event into the confusion matrix and success rate.

        A ``reward`` of 1 marks the prediction as correct (``truth`` 1 -> tp,
        ``truth`` 0 -> tn); a ``reward`` of 0 marks it as wrong (``truth`` 1
        -> fn, ``truth`` 0 -> fp). Other reward/truth values leave the
        confusion matrix untouched but still count as a try.

        Args:
            features: Unused.
            feature_names: Unused.
            reward: 1 if the prediction matched the truth, 0 otherwise.
            truth: Ground-truth label (0 or 1).
            routing: Unused.
        """
        print("DTC model send-feedback entered")
        print(f"Truth: {truth}, Reward: {reward}")

        if reward == 1:
            if truth == 1:
                self.cm["tp"] += 1
            if truth == 0:
                self.cm["tn"] += 1
        if reward == 0:
            if truth == 1:
                self.cm["fn"] += 1
            if truth == 0:
                self.cm["fp"] += 1

        self.tries += 1
        if reward:
            self.success += 1
        self.value = self.success / self.tries

        print(self.cm)
        # print() does not interpolate %-style templates, so format explicitly.
        print(f"Tries: {self.tries}, successes: {self.success}, values: {self.value}")

    def metrics(self):
        """Return the current counters as a list of GAUGE metric dicts.

        Order: true positives, true negatives, false positives, false
        negatives, success rate, number of successes, number of tries.
        """
        return [
            self._gauge("true_pos_total", self.cm["tp"]),
            self._gauge("true_neg_total", self.cm["tn"]),
            self._gauge("false_pos_total", self.cm["fp"]),
            self._gauge("false_neg_total", self.cm["fn"]),
            self._gauge("branch_value", self.value),
            self._gauge("n_success_total", self.success),
            self._gauge("n_tries_total", self.tries),
        ]

In [10]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

class GBCModel(object):
    """Wrap a pickled gradient-boosting classifier together with its encoder.

    Exposes ``predict``, ``send_feedback`` and ``metrics``. The last two
    maintain and report running confusion-matrix counters and a success
    rate for this model branch.
    """

    def __init__(self):
        """Load ``encoder.pkl`` and ``GBC.pkl`` (relative paths) and zero the counters."""
        # Imported locally so the class does not rely on some other cell
        # having imported joblib before the first instantiation.
        import joblib

        # Columns that are passed through the encoder before prediction.
        self.categorical_features = [
            "person_home_ownership",
            "loan_intent",
            "city",
            "state",
            "location_type",
        ]

        self.encoder = joblib.load("encoder.pkl")
        print("Encoder loaded")

        self.model = joblib.load("GBC.pkl")
        print("Model loaded")

        # Without this, send_feedback() and metrics() fail with
        # AttributeError because the counters they read never exist.
        self._reset_feedback_state()

    def _reset_feedback_state(self):
        """Zero the confusion-matrix counters and the running success statistics."""
        self.cm = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}
        self.tries = 0
        self.success = 0
        self.value = 0

    def _gauge(self, key, value):
        """Build one GAUGE metric dict tagged with this branch's name."""
        return {
            "type": "GAUGE",
            "key": key,
            "value": value,
            "tags": {"branch_name": "GBC"},
        }

    def predict(self, X, features_names):
        """Return the model's predictions for the rows in ``X`` (also printed).

        Args:
            X: Row data accepted by ``pd.DataFrame`` (the notebook passes a
                list of row lists).
            features_names: Column name for each position in a row of ``X``;
                must include every entry of ``self.categorical_features``.

        Returns:
            Whatever ``self.model.predict`` returns for the encoded frame.
        """
        df = pd.DataFrame(X, columns=features_names)

        df[self.categorical_features] = self.encoder.transform(df[self.categorical_features])
        # Columns are put in alphabetical order before prediction —
        # presumably the order the model was fitted with; verify against training.
        df = df.reindex(sorted(df.columns), axis=1)

        predictions = self.model.predict(df)

        print(predictions)

        return predictions

    def send_feedback(self, features, feature_names, reward, truth, routing=None):
        """Fold one feedback event into the confusion matrix and success rate.

        A ``reward`` of 1 marks the prediction as correct (``truth`` 1 -> tp,
        ``truth`` 0 -> tn); a ``reward`` of 0 marks it as wrong (``truth`` 1
        -> fn, ``truth`` 0 -> fp). Other reward/truth values leave the
        confusion matrix untouched but still count as a try.

        Args:
            features: Unused.
            feature_names: Unused.
            reward: 1 if the prediction matched the truth, 0 otherwise.
            truth: Ground-truth label (0 or 1).
            routing: Unused.
        """
        print("GBC model send-feedback entered")
        print(f"Truth: {truth}, Reward: {reward}")

        if reward == 1:
            if truth == 1:
                self.cm["tp"] += 1
            if truth == 0:
                self.cm["tn"] += 1
        if reward == 0:
            if truth == 1:
                self.cm["fn"] += 1
            if truth == 0:
                self.cm["fp"] += 1

        self.tries += 1
        if reward:
            self.success += 1
        self.value = self.success / self.tries

        print(self.cm)
        # print() does not interpolate %-style templates, so format explicitly.
        print(f"Tries: {self.tries}, successes: {self.success}, values: {self.value}")

    def metrics(self):
        """Return the current counters as a list of GAUGE metric dicts.

        Order: true positives, true negatives, false positives, false
        negatives, success rate, number of successes, number of tries.
        """
        return [
            self._gauge("true_pos_total", self.cm["tp"]),
            self._gauge("true_neg_total", self.cm["tn"]),
            self._gauge("false_pos_total", self.cm["fp"]),
            self._gauge("false_neg_total", self.cm["fn"]),
            self._gauge("branch_value", self.value),
            self._gauge("n_success_total", self.success),
            self._gauge("n_tries_total", self.tries),
        ]

# Цикл тестирования 

In [11]:
import joblib

# Build both model wrappers; each constructor loads its pickles and prints progress.
dtc, gbc = DTCModel(), GBCModel()

Encoder loaded
Model loaded
Encoder loaded
Model loaded


In [12]:
import numpy as np
import requests

# Number of leading rows of X / y to replay.
n = 100

# Base URL of the deployed model server — must be filled in before running.
url = ''

for i in range(n):
    print(f"Processed {i+1}/{n} samples", flush=True)

    # --- Local test of the model classes (no server), kept for reference ---
    # print([X[i].tolist()])
    # pred = dtc.predict([X[i].tolist()], columns)
    # print(pred)
    #
    # truth_val = int(y[i][0])
    #
    # reward = int(pred == truth_val)
    # print(reward)
    #
    # _ = dtc.send_feedback([], [], reward, truth_val)

    # --- Test against the server ---
    req = {"data": {"ndarray": [X[i].tolist()], "names": columns}}

    res_raw = requests.post(f"{url}/predictions", json=req)
    # Fail on an HTTP error here rather than on a confusing lookup below.
    res_raw.raise_for_status()
    res = res_raw.json()

    # The parsed body lives in `res`; the previous code read an undefined
    # name `response` and raised NameError on the first iteration.
    pred = res.get("data").get("ndarray")[0]

    truth_val = int(y[i][0])

    # 1 when the served prediction equals the ground truth, else 0.
    reward = int(pred == truth_val)

    truth = [truth_val]

    feedback = {
        "request": req,
        "response": res,
        "reward": reward,
        "truth": {"data": {"ndarray": truth}}
    }

    res_raw = requests.post(f"{url}/feedback", json=feedback)
    assert res_raw.status_code == 200
    

Processed 1/100 samples
[[52, 69996, 'OWN', 3.0, 'PERSONAL', 12000, 11.11, 'CUMMINGS', 'KS', 'PRIMARY', 253, 472, 9617849, 7649, 779433, 28579, 3583, 0, 0, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 0, 'fp': 0, 'tn': 1, 'fn': 0}
Tries: %s, successes: %s, values: %s 1 1 1.0
Processed 2/100 samples
[[61, 10636, 'RENT', 3.0, 'PERSONAL', 4000, 12.21, 'CRESSON', 'TX', 'PRIMARY', 703, 1342, 38291019, 5101, 1877182, 32008, 20745, 6, 1, 3, 0, 0]]
[1]
1
DTC model send-feedback entered
Truth: 1, Reward: 1
{'tp': 1, 'fp': 0, 'tn': 1, 'fn': 0}
Tries: %s, successes: %s, values: %s 2 2 1.0
Processed 3/100 samples
[[36, 69000, 'RENT', 0.0, 'MEDICAL', 15600, 10.99, 'PLAINVILLE', 'GA', 'PRIMARY', 824, 1578, 23478174, 4223, 1142978, 24020, 11711, 3, 5, 1, 1, 2]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 1, 'fp': 0, 'tn': 2, 'fn': 0}
Tries: %s, successes: %s, values: %s 3 3 1.0
Processed 4/100 samples
[[36, 48686, 'RENT', 21.0, 'MEDICAL', 12000, 13.9

[[36, 31000, 'RENT', 2.0, 'PERSONAL', 8000, 14.11, 'DARLINGTON', 'WI', 'PRIMARY', 2196, 3993, 62007687, 2104, 677511, 15566, 12939, 8, 0, 1, 0, 0]]
[1]
1
DTC model send-feedback entered
Truth: 1, Reward: 1
{'tp': 4, 'fp': 0, 'tn': 23, 'fn': 0}
Tries: %s, successes: %s, values: %s 27 27 1.0
Processed 28/100 samples
[[36, 84996, 'MORTGAGE', 11.0, 'PERSONAL', 3500, 12.69, 'WILLIAMS', 'AZ', 'PRIMARY', 2401, 4238, 67604746, 7033, 986411, 23313, 8266, 0, 0, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 4, 'fp': 0, 'tn': 24, 'fn': 0}
Tries: %s, successes: %s, values: %s 28 28 1.0
Processed 29/100 samples
[[55, 75000, 'MORTGAGE', 6.0, 'PERSONAL', 7700, 14.84, 'MENIFEE', 'CA', 'PRIMARY', 14456, 28269, 642746625, 7463, 1624026, 19352, 29641, 0, 0, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 4, 'fp': 0, 'tn': 25, 'fn': 0}
Tries: %s, successes: %s, values: %s 29 29 1.0
Processed 30/100 samples
[[42, 64464, 'MORTGAGE', 13.0, 'MEDICAL', 300

[[46, 60000, 'RENT', 9.0, 'HOMEIMPROVEMENT', 15000, 16.32, 'HICKMAN', 'KY', 'PRIMARY', 1170, 2108, 24786335, 7488, 1680839, 43147, 17194, 0, 1, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 11, 'fp': 0, 'tn': 42, 'fn': 0}
Tries: %s, successes: %s, values: %s 53 53 1.0
Processed 54/100 samples
[[42, 50426, 'RENT', 0.0, 'MEDICAL', 4200, 7.49, 'FOSS', 'OK', 'PRIMARY', 321, 595, 10017823, 2395, 269401, 31834, 18682, 9, 6, 0, 0, 2]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 11, 'fp': 0, 'tn': 43, 'fn': 0}
Tries: %s, successes: %s, values: %s 54 54 1.0
Processed 55/100 samples
[[38, 30000, 'MORTGAGE', 8.0, 'HOMEIMPROVEMENT', 10000, 11.22, 'BRANDON', 'MN', 'PRIMARY', 745, 1355, 23203616, 870, 555643, 3781, 10059, 4, 6, 1, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 11, 'fp': 0, 'tn': 44, 'fn': 0}
Tries: %s, successes: %s, values: %s 55 55 1.0
Processed 56/100 samples
[[41, 90000, 'MORTGAGE', 3.0, 'PERSONAL', 3000,

[[55, 94000, 'MORTGAGE', 11.0, 'PERSONAL', 4000, 6.62, 'FAIRCHANCE', 'PA', 'PRIMARY', 1267, 2180, 33272489, 7044, 370857, 24474, 7277, 1, 1, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 17, 'fp': 0, 'tn': 62, 'fn': 0}
Tries: %s, successes: %s, values: %s 79 79 1.0
Processed 80/100 samples
[[41, 52106, 'RENT', 4.0, 'EDUCATION', 5000, 12.84, 'WEST DOVER', 'VT', 'PRIMARY', 687, 1043, 16848829, 6505, 1717133, 31418, 23388, 2, 2, 2, 0, 0]]
[1]
1
DTC model send-feedback entered
Truth: 1, Reward: 1
{'tp': 18, 'fp': 0, 'tn': 62, 'fn': 0}
Tries: %s, successes: %s, values: %s 80 80 1.0
Processed 81/100 samples
[[84, 94800, 'MORTGAGE', 2.0, 'PERSONAL', 10000, 7.51, 'HAYDEN', 'AL', 'PRIMARY', 3410, 6765, 129851527, 3399, 1984292, 11697, 10166, 0, 1, 0, 0, 0]]
[0]
1
DTC model send-feedback entered
Truth: 0, Reward: 1
{'tp': 18, 'fp': 0, 'tn': 63, 'fn': 0}
Tries: %s, successes: %s, values: %s 81 81 1.0
Processed 82/100 samples
[[53, 105000, 'RENT', 1.0, 'EDUCATION', 120