In [100]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error
import mlflow
from mlflow.models import infer_signature

In [101]:
# Load the prepared dataset from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../data/dataset.pkl', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

In [102]:
# Display the loaded DataFrame for a quick visual check of its columns and values.
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,US,100,US,L


In [103]:
# Give the salary column the generic name `target`, then separate it from
# the remaining columns: y holds the target, X holds the features.
df = df.rename(columns={'salary_in_usd': 'target'})
y = df['target']
X = df.drop(columns=['target'])

In [104]:
# Hold out 25% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every metric computed on the test part, is reproducible after
# a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [105]:
# Split the feature columns by dtype: numeric columns on one side,
# pandas `category` columns on the other.
num_features = list(X.select_dtypes(include='number').columns)
cat_features = list(X.select_dtypes(include='category').columns)
print(num_features)
print(cat_features)

['work_year', 'remote_ratio']
['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']


In [106]:
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric features: standardize with StandardScaler.
        ('num', StandardScaler(), num_features),
        # Categorical features: ordinal-encode; a category that was not seen
        # during fit is mapped to the sentinel value 99999999 instead of raising.
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999),
            cat_features,
        ),
    ],
    # Drop every column that is not covered by the transformers above.
    remainder='drop',
)

In [107]:
# A random forest is stochastic (each tree is fitted on a bootstrap sample),
# so pin the seed: refitting then yields the same model and the same metrics.
regressor = RandomForestRegressor(random_state=42)

# Chain preprocessing and the model so that both are fitted and applied together.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', regressor)
])

In [108]:
# Display the training features; the target column was removed from X before the split.
X_train

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size
1194,2023,SE,FT,Data Scientist,US,100,US,M
1057,2023,SE,FT,Data Analyst,US,0,US,M
1103,2023,SE,FT,Data Manager,US,0,US,M
794,2023,SE,FT,Data Science Manager,US,0,US,M
626,2023,SE,FT,Data Scientist,CA,0,CA,M
...,...,...,...,...,...,...,...,...
1006,2023,SE,FT,Machine Learning Engineer,US,0,US,M
3743,2020,MI,FT,Data Engineer,ES,100,US,M
2215,2022,MI,FT,Data Analyst,US,100,US,L
2913,2022,EN,FT,Data Engineer,US,100,US,L


In [109]:
# Fit the preprocessing step and the model on the training split only.
pipeline.fit(X_train, y_train)

In [110]:
# Score the fitted pipeline on the held-out split.
predictions = pipeline.predict(X_test)

# Collect the regression metrics in one dict so they can be logged together.
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "rmse": root_mean_squared_error(y_test, predictions),
}

metrics

{'mae': 35617.75123908231,
 'mape': 0.33366674279718284,
 'rmse': 47149.8733632604}

In [111]:
# Work with a local MLflow server
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# The tracking server and the model registry are reached at the same address
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
registry_uri = tracking_uri

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(registry_uri)

# Names of the experiment, of the run inside it, and of the model
# under which it is meant to be registered
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

In [112]:
# A few training rows serve both as the logged input example and as the
# sample from which the model signature is inferred.
input_example = X_train.head(5)

# Pass the matching predictions as well, so the signature records the output
# schema and not only the input columns.
signature = infer_signature(
    model_input=input_example,
    model_output=pipeline.predict(input_example),
)

# Files attached to the run: the environment requirements and a free-text comment.
req_file = '../requirements.txt'
art = '../comment.txt'

# All pipeline parameters (including those of the nested steps), logged as run params.
params_dict = pipeline.get_params()



In [113]:
# Reuse the experiment when it already exists: mlflow.create_experiment raises
# if the name is taken, which made this cell fail on every run after the first.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Log the fitted pipeline together with its signature, an input example
    # and the requirements file describing its environment.
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

# Re-fetch the run from the tracking server to verify that it was closed cleanly.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED', f"unexpected run status: {run.info.status}"

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/10/22 18:13:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/416ffa61aff644cb955a91625d6692a7.
2024/10/22 18:13:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
