# Управление и логгирование экспериментов в mlflow

В рамках данного ноутбука мы проведем несколько экспериментов для получения лучшей модели по прогнозу оттока клиентов. Выполним следующие шаги:
* подгрузим данные 
* запустим обучение и поиск гиперпараметров для нескольких классификаторов в рамках эксперимента
* залоггируем модели и метрики в mlflow
* сохраним лучшую модель в s3 

## Загрузка данных из s3
Предварительно я подгрузил .csv файл в свой s3 бакет. Получим его оттуда. Доступ к s3 я предоставляю в dockerfile.


In [1]:
import pandas as pd
import numpy as np
import mlflow
from mlflow.models import infer_signature
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score)


In [2]:
df = pd.read_csv('data/telecom_churn.csv')

In [3]:
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
df.shape

(3333, 20)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

Набор представляет из себя 3333 пользователя компании телефонной связи. Имеется 19 фичей, а колонка Churn представляет из себя булевы значения, где 0 - абонент лоялен, 1 - абонент поменял тариф (ушел). Мы хотим прогнозировать ушел ли клиент (отток).
 

In [6]:
# Обучим модель на численных фичах
cols = []
for i in df.columns:
    if (df[i].dtype == "float64") or (df[i].dtype == 'int64'):
        cols.append(i)

cols, len(cols)

(['Account length',
  'Area code',
  'Number vmail messages',
  'Total day minutes',
  'Total day calls',
  'Total day charge',
  'Total eve minutes',
  'Total eve calls',
  'Total eve charge',
  'Total night minutes',
  'Total night calls',
  'Total night charge',
  'Total intl minutes',
  'Total intl calls',
  'Total intl charge',
  'Customer service calls'],
 16)

In [7]:
# Divide the dataset into the input and target
X, y = df[cols].copy(), np.asarray(df["Churn"],dtype='int8')

# Initialize a stratified split of our dataset for the validation process
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the classifier with the default parameters 
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train it on the training set
results = cross_val_score(rfc, X, y, cv=skf)

# Evaluate the accuracy on the test set
print("CV accuracy score: {:.2f}%".format(results.mean() * 100))

CV accuracy score: 92.50%


Используем эту модель как бейзлайн и попробуем увеличить score с помощью других моделей и GridSearch 

## Обучение и поиск гиперпараметров

Попробуем обучить RandomForest, SVC, LogRegression модели

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [30]:
model_params = {
    'gradien_boosting': {
        'model' : GradientBoostingClassifier(),
        'params' : {
            'n_estimators' : range(10, 50, 10),
            'max_depth': range(5,9,1),
            'max_features':range(7,20,3)         
        }
    },
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : range(10, 50, 10),
            'max_features': range(3, 15, 3),
            'max_depth': [5, 10, 15, 20],
            'min_samples_leaf': [1, 3, 5, 7],   
              
              }
    },
    'log_reg' : {
        'model' : LogisticRegression(),
        'params' : {
            'C' : [1, 2, 5]
        }
    }
}

In [12]:
# Создать новый эксперимент
exp_name = "telecom_churn_classification_test"
experiment_id='344457750143816081'
#experiment_id = mlflow.create_experiment(exp_name)
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='s3://kirillbucket/mlflow/344457750143816081', creation_time=1732301882313, experiment_id='344457750143816081', last_update_time=1732301882313, lifecycle_stage='active', name='telecom_churn_classification_test', tags={}>

In [31]:
X, y = df[cols].copy(), np.asarray(df["Churn"],dtype='int8')
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5,  random_state=42)

scores = []
with mlflow.start_run(run_name="parent_run", experiment_id = experiment_id, description = "parent") as parent_run:
    for model_name, mp in model_params.items():
        with mlflow.start_run(run_name=model_name, experiment_id=experiment_id, nested=True) as child_run:
            grid_search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False) 
            # Train it on the training set# Fit the grid search to the data
            grid_search.fit(X_train, y_train)
            
            # Evaluate the best model on the test set
            y_pred = grid_search.predict(X_test)
            scores.append({
                'model' : model_name,
                'top_score' : grid_search.best_score_,
                'best_params' : grid_search.best_params_})

            # Создадим валидационный датасет.
            eval_df = X_val.copy()
            eval_df["churn"] = y_val
        
            # Сохраним результаты обучения с помощью MLFlow.
            signature = infer_signature(X_test, y_pred)
            model_info = mlflow.sklearn.log_model(grid_search, "classification_model", signature=signature)
            mlflow.evaluate(
                model=model_info.model_uri,
                data=eval_df,
                targets="churn",
                model_type="classifier",
                evaluators=["default"],
            )
        



    



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/11/22 20:21:30 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/11/22 20:21:30 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/11/22 20:21:31 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/11/22 20:21:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run gradien_boosting at: http://mlflow-service:5000/#/experiments/344457750143816081/runs/fd9bc3513c0449ebaf8d2796a471a40d.
2024/11/22 20:21:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/344457750143816081.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/11/22 20:40:28 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/11/22 20:40:28 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/11/22 20:40:28 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/11/22 20:40:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest at: http://mlflow-service:5000/#/experiments/344457750143816081/runs/231962c89d9a48b8933071b9387ea3b1.
2024/11/22 20:40:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/344457750143816081.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/11/22 20:41:31 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/11/22 20:41:31 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2024/11/22 20:41:32 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/11/22 20:41:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run log_reg at: http://mlflow-service:5000/#/experiments/344457750143816081/runs/34433e923d4b45cf94e1b706bb4dfb0f.
2024/11/22 20:41:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/344457750143816081.
2024/11/22 20:41:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run parent_run at: http://mlflow-service:5000/#/experiments/344457750143816081/runs/b0f0235330814638b7927a8dba37b1b1.
2024/11/22 20:41:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-se

In [32]:
df_scores = pd.DataFrame(scores)
df_scores

Unnamed: 0,model,top_score,best_params
0,gradien_boosting,0.924985,"{'max_depth': 7, 'max_features': 7, 'n_estimat..."
1,random_forest,0.924611,"{'max_depth': 15, 'max_features': 12, 'min_sam..."
2,log_reg,0.855965,{'C': 2}


С помощью GridSearchCV удалось увеличить скор модели. Метрики, датасет, модель залогированы и зарегистированы и мы можем использовать нашу модель для инференса