In [17]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score


#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer



Загрузим данные


In [7]:
!wget 'https://drive.google.com/uc?export=download&id=1x2Dup30bZYEdJBJ8YMYrllTHwCJ2m1NG' -O data.csv

--2022-06-11 12:44:00--  https://drive.google.com/uc?export=download&id=1x2Dup30bZYEdJBJ8YMYrllTHwCJ2m1NG
Resolving drive.google.com (drive.google.com)... 172.217.193.138, 172.217.193.101, 172.217.193.102, ...
Connecting to drive.google.com (drive.google.com)|172.217.193.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0s-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/21acu9as0uorl6dos6l5peut4oa9n3s5/1654951425000/12412789190724481313/*/1x2Dup30bZYEdJBJ8YMYrllTHwCJ2m1NG?e=download [following]
--2022-06-11 12:44:01--  https://doc-0s-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/21acu9as0uorl6dos6l5peut4oa9n3s5/1654951425000/12412789190724481313/*/1x2Dup30bZYEdJBJ8YMYrllTHwCJ2m1NG?e=download
Resolving doc-0s-ak-docs.googleusercontent.com (doc-0s-ak-docs.googleusercontent.com)... 173.194.217.132, 2607:f8b0:400c:c13::84
Connecting to doc-0s-ak-docs.googleusercontent.com (doc-0s-ak

In [8]:
df = pd.read_csv("data.csv")
df.head(3)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0


In [10]:
df.dropna(inplace=True)

Разделим данные на train/test и сохраним тестовую выборку на диск

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Credit Default' ], axis = 1), df['Credit Default'],
                                                    test_size=0.33, random_state=42)
# save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)

# save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

Создадим необходимые пайплайны

In [12]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key, drop_first=True).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key, drop_first=True)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]
    
class CatNaNInputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        
        self.frequent_constant = 'None'

    def fit(self, X, y=None):
        self.frequent_constant = X.mode()[0]
        return self

    def transform(self, X):
        X = X.fillna(self.frequent_constant)
        return X


In [34]:
continuous_columns = X_train.select_dtypes(include='number').columns.to_list()
categorical_columns = X_train.select_dtypes(exclude='number').columns.to_list()

Соберем кусок, ответственный за feature engineering

In [35]:

final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('nan_inputer', CatNaNInputer()),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                
                ('NAN', SimpleImputer(strategy='median')),
                
        
                
            ])
    
    final_transformers.append((cont_col, cont_transformer))
    
feats = FeatureUnion(final_transformers)

Добавим классификатор с подобранными параметрами

In [36]:
%%time
model = RandomForestClassifier(random_state=7, class_weight = 'balanced_subsample', max_depth = 3, min_samples_leaf = 10, min_samples_split = 2, n_estimators = 600  )
pipeline = Pipeline([
    ('features', feats),
    ('classifier', model),
])

pipeline.fit(X_train, y_train)

CPU times: user 1.75 s, sys: 13.2 ms, total: 1.76 s
Wall time: 1.77 s


Сохраним модель (пайплайн)

In [37]:
with open("rf_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)

### Проверка работоспособности и качества пайплайна


In [38]:
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [23]:
X_test.head(3)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Rent,658217.0,10+ years,0.0,9.0,10.9,150260.0,0.0,3.0,0.0,debt consolidation,Short Term,139370.0,53200.0,10970.0,733.0,1
1,Home Mortgage,1028242.0,7 years,0.0,6.0,13.9,123596.0,0.0,43.0,0.0,debt consolidation,Short Term,99999999.0,56791.0,9254.0,734.0,0
2,Rent,1045323.0,10+ years,0.0,4.0,16.3,299090.0,0.0,33.0,0.0,debt consolidation,Long Term,440132.0,181070.0,11847.0,732.0,0


In [39]:
with open('rf_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [25]:
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Home Ownership',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Home '
                                                                                         'Ownership')),
                                                                 ('nan_inputer',
                                                                  CatNaNInputer()),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Home '
                                                                                 'Ownership'))])),
                                                ('Years in current job',
                                                 Pipeline(steps=[('selector',
                                                

In [40]:
preds = pipeline.predict_proba(X_test)[:, 1]

pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [44]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.4955028782641338, F-Score=0.591, Precision=0.547, Recall=0.644


In [45]:
!pip install flask-ngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify
import pandas as pd

In [50]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
!tar -xvf /content/ngrok-stable-linux-amd64.tgz
!./ngrok authtoken 2AQwxtl9lZmWOYwwGv3dptsb2GJ_21ZJCVJ59W9RjFNtN2CJ6
!./ngrok http 80

--2022-06-11 13:53:31--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
Resolving bin.equinox.io (bin.equinox.io)... 54.161.241.46, 52.202.168.65, 18.205.222.128, ...
Connecting to bin.equinox.io (bin.equinox.io)|54.161.241.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13770165 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.tgz.1’


2022-06-11 13:53:31 (59.0 MB/s) - ‘ngrok-stable-linux-amd64.tgz.1’ saved [13770165/13770165]

ngrok
Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
import pandas as pd
import dill

### **Создаем сервис для обработки запросов к модели**

In [53]:
# Загружаем обученные модели
with open('rf_pipeline.dill', 'rb') as in_strm:
    model = dill.load(in_strm)

In [54]:
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

Запустить сервис и не глушить его, пока работаем 

In [None]:
# Обработчики и запуск Flask
app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    return "Welcome to prediction process"

@app.route('/predict', methods=['POST'])
def predict():
    data = {"success": False}

    (home_ownership, annual_income, years_job, tax_liens, accounts, years_history, max_credit, n_problems, last_delinquent, bankruptcies, purpose, 
    term, current_loan_amount, current_credit_balance, monthly_debt, credit_score) = [np.nan]*16

    request_json = request.get_json()
    
    if request_json["Home Ownership"]:
        home_ownership = request_json['Home Ownership']
    
    if request_json["Annual Income"]:
        annual_income = request_json['Annual Income']
                
    if request_json["Years in current job"]:
        years_job = request_json['Years in current job']

    if request_json["Tax Liens"]:
        tax_liens = request_json['Tax Liens']
    
    if request_json["Number of Open Accounts"]:
        accounts = request_json['Number of Open Accounts']
                
    if request_json["Years of Credit History"]:
        years_history = request_json['Years of Credit History']

    if request_json["Maximum Open Credit"]:
        max_credit = request_json['Maximum Open Credit']
    
    if request_json["Number of Credit Problems"]:
        n_problems = request_json['Number of Credit Problems']
                
    if request_json["Months since last delinquent"]:
        last_delinquent = request_json['Months since last delinquent']

    if request_json["Bankruptcies"]:
        bankruptcies = request_json['Bankruptcies']
    
    if request_json["Purpose"]:
        purpose = request_json['Purpose']
                
    if request_json["Term"]:
        term = request_json['Term']

    if request_json["Current Loan Amount"]:
        current_loan_amount = request_json['Current Loan Amount']
    
    if request_json["Current Credit Balance"]:
        current_credit_balance = request_json['Current Credit Balance']
                
    if request_json["Monthly Debt"]:
        monthly_debt = request_json['Monthly Debt']

    if request_json["Credit Score"]:
        credit_score = request_json['Credit Score']

    
     
    preds = model.predict_proba(pd.DataFrame({'Home Ownership' : [home_ownership],
                                              'Annual Income' : [annual_income],
                                              'Years in current job' : [years_job],
                                              'Tax Liens' : [tax_liens],
                                              'Number of Open Accounts' : [accounts],
                                              'Years of Credit History' : [years_history],
                                              'Maximum Open Credit' : [max_credit],
                                              'Number of Credit Problems' : [n_problems],
                                              'Months since last delinquent' : [last_delinquent],
                                              'Bankruptcies' : [bankruptcies],
                                              'Purpose' : [purpose],
                                              'Term' : [term],
                                              'Current Loan Amount' : [current_loan_amount] ,
                                              'Current Credit Balance' : [current_credit_balance],
                                              'Monthly Debt' : [monthly_debt],
                                              'Credit Score' : [credit_score]}))
      
   
    data["predictions"] = preds[:, 1][0]
    data["description"] = request_json

    data["success"] = True
    print('OK')


    return jsonify(data)


if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://4de7-34-74-84-199.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [11/Jun/2022 15:38:13] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:21] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:33] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:33] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:34] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:34] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:34] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:34] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:34] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:35] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:35] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:35] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:35] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:36] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:36] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:36] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:36] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:36] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:37] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:37] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK
OK

127.0.0.1 - - [11/Jun/2022 15:38:37] "[37mPOST /predict HTTP/1.1[0m" 200 -





127.0.0.1 - - [11/Jun/2022 15:38:37] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:37] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:38] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:38] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:38] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:38] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:39] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:39] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:39] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:39] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:39] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:40] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:40] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:40] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:40] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:40] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:41] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:41] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:41] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:41] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:42] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:42] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:42] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:42] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:43] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:43] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:43] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:38:43] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:43] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:38:44] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [11/Jun/2022 15:38:44] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK
OK


127.0.0.1 - - [11/Jun/2022 15:48:04] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


127.0.0.1 - - [11/Jun/2022 15:48:21] "[37mPOST /predict HTTP/1.1[0m" 200 -


OK


Тестовый клиент

In [60]:
# Пример данных
(home_ownership, annual_income, years_job, tax_liens, accounts, years_history, max_credit, n_problems, last_delinquent, bankruptcies, purpose, 
term, current_loan_amount, current_credit_balance, monthly_debt, credit_score) = ('Rent', 616892.0, '10+ years', 0.0, 6.0, 19.9, 415250.0, 0.0, 47.0, 0.0, 'debt consolidation', 'Short Term', 273042.0, 222338.0, 6323.0, 716.0)

body = {'Home Ownership' : home_ownership,
    'Annual Income' : annual_income,
    'Years in current job' : years_job,
    'Tax Liens' : tax_liens,
    'Number of Open Accounts' : accounts,
    'Years of Credit History' : years_history,
    'Maximum Open Credit' : max_credit,
    'Number of Credit Problems' : n_problems,
    'Months since last delinquent' : last_delinquent,
    'Bankruptcies' : bankruptcies,
    'Purpose' : purpose,
    'Term' : term,
    'Current Loan Amount' : current_loan_amount ,
    'Current Credit Balance' : current_credit_balance,
    'Monthly Debt' : monthly_debt,
    'Credit Score' : credit_score}

In [61]:
with app.test_client() as t:
    response = t.post('/predict', json=body)
    json_data = response.get_json()

json_data

OK


{'description': {'Annual Income': 616892.0,
  'Bankruptcies': 0.0,
  'Credit Score': 716.0,
  'Current Credit Balance': 222338.0,
  'Current Loan Amount': 273042.0,
  'Home Ownership': 'Rent',
  'Maximum Open Credit': 415250.0,
  'Monthly Debt': 6323.0,
  'Months since last delinquent': 47.0,
  'Number of Credit Problems': 0.0,
  'Number of Open Accounts': 6.0,
  'Purpose': 'debt consolidation',
  'Tax Liens': 0.0,
  'Term': 'Short Term',
  'Years in current job': '10+ years',
  'Years of Credit History': 19.9},
 'predictions': 0.5290400798468633,
 'success': True}