In [13]:
# NOTE(review): this binds the name `pickle` to the `copyreg.pickle` function, not to the
# `pickle` module; the later `import pickle` in the imports cell rebinds it. This looks
# like an accidental auto-import — confirm and remove.
from copyreg import pickle
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.10.4


MLflow setup:

To run this example you need to launch the mlflow server locally by running the following command in your terminal:

`mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000`

To start the MLflow Tracking Server with proxied artifact storage access, use the command shown below
(see [https://www.mlflow.org/docs/latest/tracking.html#where-runs-are-recorded](https://www.mlflow.org/docs/latest/tracking.html#where-runs-are-recorded) for details):

In [33]:
# S3 location passed to the tracking server as its artifacts destination.
S3_BUCKET = 's3://mlflow-enkidupal-experiments'
# Bare f-string as the cell's last expression: it only displays the command to run in a
# terminal; it does not start the server.
f'mlflow server --host 0.0.0.0 --port 5000 --serve-artifacts --artifacts-destination {S3_BUCKET}/'


'mlflow server --host 0.0.0.0 --port 5000 --serve-artifacts --artifacts-destination s3://mlflow-enkidupal-experiments/'

To restart the server, run this command from the *same* directory in which the MLflow server
was previously started, so that the backend store URI points to the existing `mlflow.db`!

`cd project_root`


 `mlflow server --host 0.0.0.0 --port 5000 --serve-artifacts --artifacts-destination s3://mlflow-enkidupal-experiments/`

Import libraries

In [21]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import mlflow

import pandas as pd
import numpy as np
from toolz import compose
import pickle

from sklearn.model_selection import train_test_split

try:
    # Public import location of the warning class in newer pandas releases.
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    # Older pandas releases only expose the class under this private module.
    from pandas.core.common import SettingWithCopyWarning
from matplotlib_inline import backend_inline

import warnings

# Render inline matplotlib figures as SVG.
backend_inline.set_matplotlib_formats('svg')
# Silence pandas' SettingWithCopyWarning for the rest of the notebook session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [22]:
import os

# Data directory layout, expressed relative to the notebook's working directory.
_data_root = os.path.join("./../", 'data')
_data_root_raw = os.path.join(_data_root, 'raw')
_data_root_processed =  os.path.join(_data_root, 'processed')

# Raw inputs: one sub-directory per split, each holding a single CSV file.
_train_dirpath = os.path.join(_data_root_raw, "train")
_train_filepath = os.path.join(_train_dirpath, "train.csv")
_test_dirpath = os.path.join(_data_root_raw, "test")
_test_filepath = os.path.join(_test_dirpath, "test.csv")

# NOTE(review): unlike the raw paths above, the processed *file* paths are joined onto
# _data_root_processed directly and not onto the per-split directories created below
# (e.g. processed/train.csv vs. processed/train/) — confirm which layout is intended.
_train_processed_dirpath = os.path.join(_data_root_processed, "train")
_train_processed_filepath = os.path.join(_data_root_processed, "train.csv")
_valid_processed_dirpath = os.path.join(_data_root_processed, "valid")
_valid_processed_filepath = os.path.join(_data_root_processed, "valid.csv")
_test_processed_dirpath = os.path.join(_data_root_processed, "test")
_test_processed_filepath = os.path.join(_data_root_processed, "test.csv")

# Create every directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(_train_dirpath, exist_ok=True)
os.makedirs(_test_dirpath, exist_ok=True)
os.makedirs(_train_processed_dirpath, exist_ok=True)
os.makedirs(_valid_processed_dirpath, exist_ok=True)
os.makedirs(_test_processed_dirpath, exist_ok=True)


## Download data from Kaggle, unzip it, and copy it to the data folder


In [39]:
def download_data():
    """Download the Kaggle `titanic` competition files and place them in the raw data folders.

    Uses IPython `!` shell escapes, so it only runs inside IPython/Jupyter and needs the
    `kaggle`, `unzip`, `cp` and `rm` commands on the shell PATH.
    (Presumably the Kaggle CLI must also be configured with API credentials — verify.)

    Steps:
      1. Download the competition archive into `_data_root` (`--force` re-downloads it).
      2. Unzip `titanic.zip` into `_data_root`, overwriting existing files (`-o`).
      3. Copy `train.csv` / `test.csv` to `_train_filepath` / `_test_filepath`.
      4. Remove every `*.csv` and `*.zip` file that sits directly in `_data_root`.
    """
    !kaggle competitions download -c titanic -p {_data_root} --force
    !unzip -o {_data_root}/"titanic.zip" -d {_data_root}
    !cp {_data_root}/"train.csv" {_train_filepath}
    !cp {_data_root}/"test.csv" {_test_filepath}

    # clean up: note the globs also delete any other CSV/ZIP files placed directly in _data_root
    !rm  {_data_root}/*.csv  {_data_root}/*.zip

def extract_target(data: pd.DataFrame, target="Survived"):
    """Return the values of column ``target`` from ``data`` as an array."""
    return data[target].values

def preprocess_df_2(df: pd.DataFrame, transforms, categorical, numerical):
    """Apply ``transforms`` to ``df`` and cast the categorical columns to strings.

    Args:
        df: Input frame. This function does not modify it; the cast is done on a new frame.
        transforms: Callables that each take a DataFrame and return a DataFrame.
            They are applied in the order given; an empty sequence leaves the data as is.
        categorical: Names of the columns to cast to ``str``, so that a later
            ``DictVectorizer`` one-hot encodes them instead of treating them as numbers.
        numerical: Unused here; accepted so the signature matches ``preprocess_df``.

    Returns:
        A new DataFrame with all columns kept and the ``categorical`` ones cast to ``str``.

    Raises:
        KeyError: If a name in ``categorical`` is not a column of the transformed frame.
    """
    # Apply in-between transformations, first to last.
    for transform in transforms:
        df = transform(df)

    # astype returns a new frame, so the caller's DataFrame is never written to
    # (the previous in-place column assignment mutated the input).
    return df.astype({column: str for column in categorical})

def preprocess_df(df: pd.DataFrame, transforms, categorical, numerical):
    """Turn ``df`` into a list of per-row feature dicts.

    Args:
        df: Input frame. This function does not modify it; the cast is done on a new frame.
        transforms: Callables that each take a DataFrame and return a DataFrame.
            They are applied in the order given; an empty sequence leaves the data as is.
        categorical: Names of the columns to cast to ``str`` (for a dict vectorizer:
            int = treated as a number, str = one-hot encoded).
        numerical: Names of the columns passed through with their values unchanged.

    Returns:
        A list with one dict per row, holding the ``categorical`` keys followed by the
        ``numerical`` keys. No target column is extracted here.

    Raises:
        KeyError: If a requested column is missing from the transformed frame.
    """
    # Apply in-between transformations, first to last.
    for transform in transforms:
        df = transform(df)

    # Select first, then cast: astype returns a new frame, so the caller's DataFrame
    # is never written to (the previous in-place column assignment mutated the input).
    selected = df[list(categorical) + list(numerical)]
    features = selected.astype({column: str for column in categorical})

    # Convert dataframe to feature dictionaries
    return features.to_dict(orient='records')


def read_data(filename):
    """Load a dataset from a Parquet or CSV file into a DataFrame.

    Args:
        filename: Path (``str`` or ``os.PathLike``) whose name ends with ``parquet``
            or ``csv``; the suffix selects the pandas reader.

    Returns:
        The loaded ``pd.DataFrame``.

    Raises:
        ValueError: If the file name has neither supported suffix.
    """
    import os  # local import keeps this definition cell self-contained

    path = os.fspath(filename)

    # Load dataset
    if path.endswith('parquet'):
        return pd.read_parquet(path)
    if path.endswith('csv'):
        return pd.read_csv(path)

    # Raising a plain string is itself a TypeError in Python 3, which hid the real
    # message; raise a proper exception that names the offending path instead.
    raise ValueError(f"Error: not supported file format: {path!r}")


def preprocess_no_extract_target(filename, transforms, categorical, numerical):
    """Read ``filename`` and return its rows as preprocessed feature dicts (no target)."""
    return preprocess_df(read_data(filename), transforms, categorical, numerical)

def split_train_read(filename: str, val_size=0.2, random_state=42):
    """Read ``filename`` and split it into a (train, validation) pair of frames.

    ``val_size`` is forwarded to ``train_test_split`` as ``test_size``;
    ``random_state`` seeds the shuffle.
    """
    full_frame = read_data(filename)
    train_part, val_part = train_test_split(
        full_frame,
        test_size=val_size,
        random_state=random_state,
    )
    return train_part, val_part

def save_preprocessed(df: pd.DataFrame, path, index=False):
    """Write ``df`` to ``path`` as CSV.

    Args:
        df: Frame to persist.
        path: Destination file path.
        index: Whether to write the row index as the first column. Defaults to ``False``
            so that reading the file back with a plain ``pd.read_csv(path)`` (as
            ``read_data`` does) does not gain a spurious ``Unnamed: 0`` column.
            Pass ``True`` to get the previous output format.
    """
    df.to_csv(path, index=index)

def dump_pickle(obj, filename):
    """Pickle ``obj`` into ``filename`` (opened in binary write mode).

    Returns whatever ``pickle.dump`` returns, i.e. ``None``.
    """
    with open(filename, "wb") as sink:
        outcome = pickle.dump(obj, sink)
    return outcome

def run():
    """Train a random-forest Titanic survival classifier and log it to MLflow.

    Steps, all inside a single MLflow run of the ``titanic-experiment`` experiment:
      1. Download the raw data via ``download_data()``.
      2. Split the training CSV 80/20 into train/validation frames (seed 42).
      3. Build feature dicts, fit a ``DictVectorizer`` on the training dicts and
         vectorize the train/validation/test sets.
      4. Fit a ``RandomForestClassifier`` (scikit-learn autologging is enabled).
      5. Pickle the vectorizer to ``../models/preprocessor.b`` and log it as an artifact,
         log the model, then log and print the validation accuracy rounded to 4 decimals.

    Requires an MLflow tracking server reachable at the URI configured below.
    """
    # NOTE(review): 0.0.0.0 is the address the server binds to; clients usually connect
    # through 127.0.0.1/localhost — confirm this URI resolves on the target platform.
    mlflow.set_tracking_uri("http://0.0.0.0:5000")
    mlflow.set_experiment("titanic-experiment")
    mlflow.sklearn.autolog()

    with mlflow.start_run():
        download_data()
        transforms = []
        target = 'Survived'
        categorical = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
        numerical = ['Fare']

        df_train, df_val = split_train_read(_train_filepath, val_size=0.2, random_state=42)

        # Pass `target` explicitly so the label column is configured in one place
        # instead of silently relying on extract_target's default.
        train_dicts = preprocess_df(df_train, transforms, categorical, numerical)
        y_train = extract_target(df_train, target)
        val_dicts = preprocess_df(df_val, transforms, categorical, numerical)
        y_val = extract_target(df_val, target)

        df_test = read_data(_test_filepath)
        test_dicts = preprocess_df(df_test, transforms, categorical, numerical)

        # Fit the vectorizer on the training dicts only; the other sets are just transformed.
        dv = DictVectorizer()
        dv.fit(train_dicts)

        X_train = dv.transform(train_dicts)
        X_val = dv.transform(val_dicts)
        # Vectorized test features are not consumed further in this function.
        X_test = dv.transform(test_dicts)

        model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

        model.fit(X_train, y_train)

        # Make sure the output directory exists; open(..., "wb") does not create it.
        preprocessor_path = "../models/preprocessor.b"
        os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
        with open(preprocessor_path, "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

        mlflow.sklearn.log_model(model, artifact_path="models_pickle")

        y_pred = model.predict(X_val)

        accuracy = np.round(accuracy_score(y_val, y_pred), 4)
        mlflow.log_metric("accuracy", accuracy)

        print(accuracy)


In [44]:
# Same feature configuration as the one used inside run().
transforms = []
target = 'Survived'
categorical = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
numerical = ['Fare']

# Re-read the raw training CSV and split it with the same validation size and seed as run().
df_train, df_val = split_train_read(_train_filepath, val_size=0.2, random_state=42)
df_train_preprocessed = preprocess_df_2(df_train, transforms, categorical, numerical)

# NOTE(review): this is not the cell's last expression, so the head() result is computed
# but never displayed.
df_train_preprocessed[categorical].head()

# Displayed output: per-cell missing-value flags for the categorical columns after the string cast.
df_train_preprocessed[categorical].isna()

Unnamed: 0,Sex,Pclass,Embarked,SibSp,Parch
375,False,False,False,False,False
344,False,False,False,False,False
526,False,False,False,False,False
24,False,False,False,False,False
234,False,False,False,False,False
...,...,...,...,...,...
71,False,False,False,False,False
106,False,False,False,False,False
270,False,False,False,False,False
435,False,False,False,False,False


In [36]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
375,517,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
344,792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
526,472,473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,C.A. 34651,27.75,,S
24,483,484,1,3,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,,S
234,9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [27]:
run()

Downloading titanic.zip to ./../data
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 1.82MB/s]
Archive:  ./../data/titanic.zip
  inflating: ./../data/gender_submission.csv  
  inflating: ./../data/test.csv      
  inflating: ./../data/train.csv     
0.8045
