Загружаем библиотеки, рестартуем Kernel

In [None]:
!pip install psycopg2-binary
!pip install boto3
!pip install mlflow==2.7.0
!pip install -U scikit-learn

# Load data from postgre

#### Задача: определение кредитного рейтинга клиента на основе данных о его кредитной истории.

Загружаем обучающие данные из Postgre

In [2]:
import pandas as pd
import psycopg2 as pg

dataset_name = "train_cs"

engine = pg.connect("host=cassandra-postgresql.feast-db port=5432 dbname=FEAST_OFFLINE_STORE user=postgres password=postgres")
df = pd.read_sql(f'select * from {dataset_name}', con=engine)

Посмотрим на данные

In [3]:
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,2,23.0,4,19114.12,4205.50294,3,4,3,4.0,1,...,1,809.98,31.94496,17.961599,2,49.574949,118.280222,6.0,284.629162,2
1,3,116.302136,4,19114.12,4205.50294,3,4,3,4.0,3,...,1,809.98,28.609352,22.0,2,49.574949,81.699521,3.0,331.209863,2
2,4,23.0,4,19114.12,4205.50294,3,4,3,4.0,5,...,1,809.98,31.377862,22.0,2,49.574949,199.458074,1.0,223.45131,2
3,5,23.0,4,19114.12,1824.843333,3,4,3,4.0,6,...,1,809.98,24.797347,22.0,2,49.574949,41.420153,2.0,341.489231,2
4,7,23.0,4,19114.12,1824.843333,3,4,3,4.0,3,...,1,809.98,22.537593,22.0,2,49.574949,178.344067,1.0,244.565317,2


# Prepare data

Отделим фичи от таргетов сохранив их в разные переменные

In [4]:
train_data = df.drop("Credit_Score",axis=1)
label_data = df["Credit_Score"]

Разделим фичи и таргеты на датасеты для обучения и тестирования

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_data, label_data ,test_size=0.2, random_state=42)

# Train Model

Обучим модель на тренировачных данных и получим ее accuracy (точность) на тестовом датасете.

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0).fit(X_train, y_train)
score = clf.score(X_test, y_test)
score

0.7327347057964666

# Log Model

Залогируем модель в MLFlow, так же залогируем метрику точности работы модели на тестовом датасете.

In [7]:
import mlflow

mlflow.set_experiment("Credit Score Classification")

with mlflow.start_run() as run:
    
    mlflow.log_metrics({"accuracy": score})
    
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="model",
    )

После логирования модели переходим в MLFlow, регистрируем модель, переводим ее в стейдж Production.

Затем запускаем граф Airflow

# Test results

После того как граф отработал, можем проверить результат работы выгрузив датасет с полученным кредитным рейтингом для каждого клиента.

Предсказанный кредитный рейтинг сохраняется в колонке results

In [15]:
import pandas as pd
import psycopg2 as pg

dataset_name = "results_cs"

engine = pg.connect("host=cassandra-postgresql.feast-db port=5432 dbname=FEAST_OFFLINE_STORE user=postgres password=postgres")
df_res = pd.read_sql(f'select * from {dataset_name}', con=engine)
df_res.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,results
0,12,16.0,1,72471.32,6097.276667,8,10,30,3.364257,23,...,3,4439.31,29.975945,3.0,1,394.144658,169.118693,2.0,296.464316,1
1,9,20.0,13,43430.88,3418.24,6,4,12,1.0,20,...,2,1288.94,35.571842,18.0,2,20.440818,61.287485,2.0,510.095697,0
2,10,20.0,13,43430.88,3418.24,6,4,12,1.0,20,...,2,1288.94,25.260261,18.0,2,20.440818,186.594681,1.0,424.788501,0
3,11,20.0,13,43430.88,3418.24,6,4,12,1.0,20,...,2,1288.94,35.908725,18.0,2,20.440818,60.241992,4.0,501.14119,0
4,12,20.0,13,43430.88,3418.24,6,4,12,1.0,17,...,2,1288.94,39.172962,18.0,2,20.440818,58.729698,4.0,502.653484,0


# Test Logged Model

In [8]:
import pandas as pd
import psycopg2 as pg

dataset_name = "test_cs"

engine = pg.connect("host=cassandra-postgresql.feast-db port=5432 dbname=FEAST_OFFLINE_STORE user=postgres password=postgres")
df_test = pd.read_sql(f'select * from {dataset_name}', con=engine)

In [9]:
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,2,23.0,4,19114.12,4205.50294,3,4,3,4.0,1,...,1,809.98,31.94496,17.961599,2,49.574949,118.280222,6.0,284.629162,2
1,3,116.302136,4,19114.12,4205.50294,3,4,3,4.0,3,...,1,809.98,28.609352,22.0,2,49.574949,81.699521,3.0,331.209863,2
2,4,23.0,4,19114.12,4205.50294,3,4,3,4.0,5,...,1,809.98,31.377862,22.0,2,49.574949,199.458074,1.0,223.45131,2
3,5,23.0,4,19114.12,1824.843333,3,4,3,4.0,6,...,1,809.98,24.797347,22.0,2,49.574949,41.420153,2.0,341.489231,2
4,7,23.0,4,19114.12,1824.843333,3,4,3,4.0,3,...,1,809.98,22.537593,22.0,2,49.574949,178.344067,1.0,244.565317,2


In [11]:
import mlflow

model_name = "csgb"
stage = "Production"

model = mlflow.sklearn.load_model(f"models:/{model_name}/{stage}")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [12]:
model.predict(df_test)

array([1, 0, 0, ..., 1, 0, 1])