In [98]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [99]:
data = pd.read_pickle('../data/final_dataset.pkl').sample(frac=0.3, random_state = 2)

In [100]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 99 to 71
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       91 non-null     int8    
 1   sex       91 non-null     category
 2   cp        91 non-null     category
 3   trestbps  91 non-null     int16   
 4   fbs       91 non-null     category
 5   restecg   91 non-null     category
 6   thalach   91 non-null     int16   
 7   exang     91 non-null     category
 8   oldpeak   91 non-null     float16 
 9   slope     91 non-null     category
 10  ca        91 non-null     category
 11  thal      91 non-null     category
 12  target    91 non-null     category
dtypes: category(9), float16(1), int16(2), int8(1)
memory usage: 2.3 KB


In [101]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('target', axis=1), data['target'], test_size=0.3, random_state=43)

In [102]:
cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()
cat_features

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [103]:
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features

['age', 'trestbps', 'thalach', 'oldpeak']

In [104]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder

In [84]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', TargetEncoder(), cat_features)
    ]
)

In [85]:
pipeline = Pipeline(steps=[
    ('transform', preprocessor),
    ('model', RandomForestClassifier(random_state=43))
])

In [53]:
pipeline.fit(X_train, y_train)

In [54]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Для ROC AUC нужно использовать вероятности

metrics = {}

# Метрики для классификации
metrics["accuracy"] = accuracy_score(y_test, y_pred)
metrics["precision"] = precision_score(y_test, y_pred)
metrics["recall"] = recall_score(y_test, y_pred)
metrics["f1"] = f1_score(y_test, y_pred)
metrics["roc_auc"] = roc_auc_score(y_test, y_pred_proba)

print(metrics)

{'accuracy': 0.7857142857142857, 'precision': np.float64(0.7222222222222222), 'recall': np.float64(0.9285714285714286), 'f1': np.float64(0.8125), 'roc_auc': np.float64(0.9285714285714286)}


In [55]:
import mlflow
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri) 

In [56]:
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = pipeline.get_params()

experiment_id = mlflow.create_experiment('estate_projectt')

with mlflow.start_run(run_name='baseline model', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('../requirements.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/11/17 19:48:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/3/runs/71d290368e20496eaafeaee7464ebdc1.
2024/11/17 19:48:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.


In [130]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder

X_train_copy = X_train.copy()

In [131]:
pf = PolynomialFeatures(degree=2)
pf.fit_transform(X_train_copy[['trestbps','oldpeak']])

array([[1.0000000e+00, 1.4000000e+02, 1.5996094e+00, 1.9600000e+04,
        2.2394531e+02, 2.5587502e+00],
       [1.0000000e+00, 1.5000000e+02, 8.9990234e-01, 2.2500000e+04,
        1.3498535e+02, 8.0982423e-01],
       [1.0000000e+00, 1.1000000e+02, 2.0000000e+00, 1.2100000e+04,
        2.2000000e+02, 4.0000000e+00],
       [1.0000000e+00, 1.8000000e+02, 1.5996094e+00, 3.2400000e+04,
        2.8792969e+02, 2.5587502e+00],
       [1.0000000e+00, 1.1800000e+02, 7.9980469e-01, 1.3924000e+04,
        9.4376953e+01, 6.3968754e-01],
       [1.0000000e+00, 1.4000000e+02, 6.0009766e-01, 1.9600000e+04,
        8.4013672e+01, 3.6011720e-01],
       [1.0000000e+00, 1.2000000e+02, 3.8007812e+00, 1.4400000e+04,
        4.5609375e+02, 1.4445938e+01],
       [1.0000000e+00, 1.4200000e+02, 1.4003906e+00, 2.0164000e+04,
        1.9885547e+02, 1.9610939e+00],
       [1.0000000e+00, 1.1700000e+02, 1.4003906e+00, 1.3689000e+04,
        1.6384570e+02, 1.9610939e+00],
       [1.0000000e+00, 1.2500000e+02,

In [132]:
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=3000), cat_features),
        ('poly', pf_pipeline, ['trestbps', 'oldpeak'])
    ],
    remainder='drop',
)

In [133]:
X_train_copy[['trestbps', 'oldpeak']] = X_train_copy[['trestbps', 'oldpeak']].astype('float128')

In [134]:
X_train_copy_raw = preprocessor_sklearn.fit_transform(X_train_copy)
X_train_copy = pd.DataFrame(X_train_copy_raw, columns=preprocessor_sklearn.get_feature_names_out())

In [135]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_copy)

Unnamed: 0,num__age,num__trestbps,num__thalach,num__oldpeak,cat__sex,cat__cp,cat__fbs,cat__restecg,cat__exang,cat__slope,cat__ca,cat__thal,poly__1,poly__trestbps,poly__oldpeak,poly__trestbps^2,poly__trestbps oldpeak,poly__oldpeak^2
0,-0.335844,0.585174,0.960252,0.529752,1.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,0.585174,0.529752,0.532129,0.600114,0.103723
1,0.661138,1.234223,0.866677,-0.136053,0.0,3.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.234223,-0.136053,1.231735,-0.023500,-0.390952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,1.325793,0.974603,-0.022280,-0.992354,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.974603,-0.992354,0.946102,-0.969754,-0.620007
62,-1.222051,-1.037448,1.334549,0.149690,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.037448,0.149690,-1.005797,-0.002210,-0.212577


In [136]:
column_names = X_train_copy.columns.tolist()

# Вывести названия столбцов
print(column_names)

['num__age', 'num__trestbps', 'num__thalach', 'num__oldpeak', 'cat__sex', 'cat__cp', 'cat__fbs', 'cat__restecg', 'cat__exang', 'cat__slope', 'cat__ca', 'cat__thal', 'poly__1', 'poly__trestbps', 'poly__oldpeak', 'poly__trestbps^2', 'poly__trestbps oldpeak', 'poly__oldpeak^2']


In [137]:
with open('columns.txt', 'w') as f:
    for column in column_names:
        f.write(column + '\n')

In [141]:
pipeline_sklearn = Pipeline(steps=[
    ('transform', preprocessor_sklearn),
    ('model', RandomForestClassifier(random_state=43))
])

model_sklearn = pipeline_sklearn.fit(X_train, y_train)

y_pred2 = model_sklearn.predict(X_test)
y_pred_proba2 = model_sklearn.predict_proba(X_test)[:, 1] 

metrics2 = {}

# Метрики для классификации
metrics2["accuracy"] = accuracy_score(y_test, y_pred2)
metrics2["precision"] = precision_score(y_test, y_pred2)
metrics2["recall"] = recall_score(y_test, y_pred2)
metrics2["f1"] = f1_score(y_test, y_pred2)
metrics2["roc_auc"] = roc_auc_score(y_test, y_pred_proba2)

print(metrics2)

{'accuracy': 0.8928571428571429, 'precision': np.float64(0.8235294117647058), 'recall': np.float64(1.0), 'f1': np.float64(0.9032258064516129), 'roc_auc': np.float64(0.9846938775510203)}


In [143]:
signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = model_sklearn.get_params()

with mlflow.start_run(run_name='new_signs', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(model_sklearn, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics2)
    mlflow.log_artifact('columns.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/11/17 20:47:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run new_signs at: http://127.0.0.1:5000/#/experiments/3/runs/f6f8a0ff053d4f808dfc2233436ded35.
2024/11/17 20:47:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.


In [146]:
from mlxtend.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(RandomForestClassifier(random_state=43), 
                                k_features=9,
                                forward=True,
                                floating=False, # True to drop selected features
                                scoring='accuracy',
                                cv=2)

sfs.fit(X_train_copy,y_train)

selected_features_sfs = X_train_copy.loc[:, sfs.k_feature_names_]
selected_features_sfs

Unnamed: 0,num__age,cat__cp,cat__restecg,cat__slope,cat__ca,cat__thal,poly__1,poly__trestbps,poly__oldpeak^2
0,-0.335844,0.0,1.0,2.0,0.0,1.0,0.0,0.585174,0.103723
1,0.661138,3.0,1.0,2.0,0.0,0.0,0.0,1.234223,-0.390952
2,-1.554378,0.0,0.0,1.0,0.0,1.0,0.0,-1.361972,0.511374
3,1.547345,2.0,0.0,1.0,0.0,1.0,0.0,3.181369,0.103723
4,-0.557396,2.0,0.0,2.0,3.0,0.0,0.0,-0.842733,-0.439074
...,...,...,...,...,...,...,...,...,...
58,0.107259,1.0,1.0,2.0,0.0,0.0,0.0,0.065935,-0.212577
59,0.218035,1.0,0.0,2.0,0.0,1.0,0.0,-0.063875,-0.620007
60,-0.668172,1.0,0.0,1.0,0.0,0.0,0.0,-0.063875,-0.608698
61,1.325793,2.0,0.0,1.0,1.0,0.0,0.0,0.974603,-0.620007


In [148]:
rfe_sfs_idx = list(sfs.k_feature_idx_)
print(rfe_sfs_idx)
rfe_sfs_col = list(sfs.k_feature_names_)
print(rfe_sfs_col)

[0, 5, 7, 9, 10, 11, 12, 13, 17]
['num__age', 'cat__cp', 'cat__restecg', 'cat__slope', 'cat__ca', 'cat__thal', 'poly__1', 'poly__trestbps', 'poly__oldpeak^2']


In [150]:
with open('index.txt', 'w') as f:
    for i in rfe_sfs_idx:
        f.write(str(i) + '\n')

with open('columns_new.txt', 'w') as f:
    for i in rfe_sfs_col:
        f.write(i + '\n')

In [151]:
class ColumnExtractor(object):

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        return X[:,self.cols]
    
    def fit(self, X, y=None):
        return self


rfe_sfs_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn), 
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', RandomForestClassifier(random_state=43))
])

rfe_sfs_pipeline.fit(X_train, y_train)

In [152]:
predictions_sfs = rfe_sfs_pipeline.predict(X_test)
y_pred_proba3 = rfe_sfs_pipeline.predict_proba(X_test)[:, 1] 

metrics3 = {}

# Метрики для классификации
metrics3["accuracy"] = accuracy_score(y_test, predictions_sfs)
metrics3["precision"] = precision_score(y_test, predictions_sfs)
metrics3["recall"] = recall_score(y_test, predictions_sfs)
metrics3["f1"] = f1_score(y_test, predictions_sfs)
metrics3["roc_auc"] = roc_auc_score(y_test, y_pred_proba3)

print(metrics3)

{'accuracy': 0.8928571428571429, 'precision': np.float64(0.8666666666666667), 'recall': np.float64(0.9285714285714286), 'f1': np.float64(0.896551724137931), 'roc_auc': np.float64(0.9744897959183674)}


In [153]:
signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = rfe_sfs_pipeline.get_params()

with mlflow.start_run(run_name='filtered_signs', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(rfe_sfs_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics3)
    mlflow.log_artifact('index.txt')
    mlflow.log_artifact('columns_new.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/11/17 21:32:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run filtered_signs at: http://127.0.0.1:5000/#/experiments/3/runs/7bcb647a3ea74300b45e49741dbcf93f.
2024/11/17 21:32:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.


In [154]:
import optuna

def objective(trial):
    # Предлагаем гиперпараметры
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)
    opt_pipeline  = Pipeline(steps=[
        ('preprocessor', preprocessor_sklearn),
        ('model', RandomForestClassifier(n_estimators=n_estimators, 
                                       max_depth=max_depth, 
                                       max_features=max_features, 
                                       random_state=43))
    ])
    opt_pipeline.fit(X_train, y_train)
    preds = opt_pipeline.predict(X_test)
    score = f1_score(y_test, preds, average='weighted')
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params) 

[I 2024-11-17 22:00:24,315] A new study created in memory with name: no-name-5c67e535-100e-4252-84ce-e8aae26d6066
[I 2024-11-17 22:00:24,678] Trial 0 finished with value: 0.8541666666666667 and parameters: {'n_estimators': 173, 'max_depth': 20, 'max_features': 0.21394117192290527}. Best is trial 0 with value: 0.8541666666666667.
[I 2024-11-17 22:00:24,824] Trial 1 finished with value: 0.8916129032258064 and parameters: {'n_estimators': 71, 'max_depth': 4, 'max_features': 0.18727403556416053}. Best is trial 1 with value: 0.8916129032258064.
[I 2024-11-17 22:00:24,920] Trial 2 finished with value: 0.9282051282051282 and parameters: {'n_estimators': 50, 'max_depth': 18, 'max_features': 0.36359630017735134}. Best is trial 2 with value: 0.9282051282051282.
[I 2024-11-17 22:00:25,096] Trial 3 finished with value: 0.8564102564102564 and parameters: {'n_estimators': 103, 'max_depth': 7, 'max_features': 0.4779967774176199}. Best is trial 2 with value: 0.9282051282051282.
[I 2024-11-17 22:00:25,

Number of finished trials: 15
Best trial: {'n_estimators': 51, 'max_depth': 20, 'max_features': 0.8598853604010984}


In [157]:
opt_pipeline  = Pipeline(steps=[
        ('preprocessor', preprocessor_sklearn),
        ('model', RandomForestClassifier(n_estimators=51, 
                                       max_depth=20, 
                                       max_features=0.8598853604010984, 
                                       random_state=43))
    ])
opt_pipeline.fit(X_train, y_train)

In [158]:
predictions_opt = opt_pipeline.predict(X_test)
y_pred_proba4 = opt_pipeline.predict_proba(X_test)[:, 1] 

metrics4 = {}

# Метрики для классификации
metrics4["accuracy"] = accuracy_score(y_test, predictions_opt)
metrics4["precision"] = precision_score(y_test, predictions_opt)
metrics4["recall"] = recall_score(y_test, predictions_opt)
metrics4["f1"] = f1_score(y_test, predictions_opt)
metrics4["roc_auc"] = roc_auc_score(y_test, y_pred_proba4)

print(metrics4)

{'accuracy': 0.9285714285714286, 'precision': np.float64(0.9285714285714286), 'recall': np.float64(0.9285714285714286), 'f1': np.float64(0.9285714285714286), 'roc_auc': np.float64(0.9770408163265306)}


In [159]:
signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = opt_pipeline.get_params()

with mlflow.start_run(run_name='param_setup', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(opt_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics4)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/17 22:09:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run param_setup at: http://127.0.0.1:5000/#/experiments/3/runs/e691c26c2d074f7b80628f0deba392fd.
2024/11/17 22:09:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.


In [160]:
X = data.drop('target', axis=1)
y = data['target']

In [163]:
opt_pipeline.fit(X, y)

In [165]:
with open('columns_for_training_final_model.txt', 'w') as f:
    for column in column_names:
        f.write(column + '\n')

signature =  infer_signature(model_input = X.head(5))
input_example = X.head(5)
params_dict = opt_pipeline.get_params()

with mlflow.start_run(run_name='final_model', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(opt_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_artifact('../requirements.txt')
    mlflow.log_artifact('columns_for_training_final_model.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/17 22:30:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run final_model at: http://127.0.0.1:5000/#/experiments/3/runs/ef398bd7999048098a97d90850652408.
2024/11/17 22:30:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3.
