# 03-100 : ML Ops

This is the first in a series of notebooks with the following goals:

- Move repeated code to .py files.
- Use MLflow to track experiments.
- Implement the Keras hyperparameter tuning tools.

## Web References

- [Effective MLOps: Model Development](https://www.wandb.courses/courses/take/effective-mlops-model-development)
- [An Experiment Tracking Tutorial with Mlflow and Keras](https://www.youtube.com/watch?v=carXIinrmOc)

### MLflow

- [Keras Integration Example](https://github.com/mlflow/mlflow/blob/master/examples/keras/train.py)

### Hyperparameter Tuning 

- [Keras Tuner | Hyperparameter Tuning for Neural Networks in Minutes](https://www.youtube.com/watch?v=Clo1HKB50Ug)
- [Picking the best model and corresponding hyperparameters using Gridsearch](https://www.youtube.com/watch?v=cOos6wRMpAU)

In [1]:
# Load IPython's autoreload extension and set it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging

import pandas as pd
import tensorflow as tf

from competition import source_data as sd
from competition import data_preparation as dp
from competition import feature_engineering as fe

In [3]:
# Enable "memory growth" on every visible GPU so that TensorFlow allocates
# device memory on demand rather than reserving it all up front.
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth must be set before the GPUs have been initialized, so
    # re-running this cell on a live kernel raises. Report the problem and
    # carry on instead of aborting the notebook.
    print(err)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Configure Logging

In [4]:
# Route log records at INFO level and above to stdout so they show up in the
# cell output, prefixed with a timestamp and a padded level name.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Emit a first record to confirm the configuration is active.
logging.info("Started")

2023-03-22 09:39:41 INFO     Started


## Data Collection

In [5]:
# Read the gzip-compressed training events, using the column dtypes declared
# in the project's source_data module.
df_source = sd.read_csv(
    '../data/train.csv.gz',
    compression='gzip',
    dtype=sd.source_dtype,
)

(13174211, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4


In [6]:
# Read the training labels that accompany the training events.
df_source_labels = sd.read_csv(
    '../data/train_labels.csv',
)

(212022, 2)


Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1


## Data Preparation & Cleaning

In [7]:
# Run the project's main-dataset preparation, passing the lower and upper
# elapsed-time clip bounds.
df_source = dp.prepare_main_dataset(
    df_source,
    elapsed_time_min_clip=0,
    elapsed_time_max_clip=3691298,
)

# Identify the problem sessions and keep only the rows that do not belong to
# them. `problem_sessions` is reused later to filter the labels.
problem_sessions = dp.find_problem_sessions(df_source)
df_source = df_source.loc[~df_source['session_id'].isin(problem_sessions)]

# Show the resulting shape and a short preview with every column visible.
with pd.option_context('display.max_columns', None):
    print(df_source.shape)
    display(df_source.head(n=3))

(13019794, 14)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,fqid,room_fqid,text_fqid,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,-413.991394,-159.314682,380.0,494.0,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0-4
1,20090312431273200,1,1323,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4
2,20090312431273200,2,831,person_click,basic,0,-413.991394,-159.314682,380.0,494.0,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0-4


In [8]:
# Run the project's label preparation on the raw labels.
df_source_labels = dp.prepare_label_dataset(df_source_labels)

# Drop the labels of the problem sessions that were removed from the main
# dataset, so both frames cover the same sessions.
df_source_labels = df_source_labels.loc[
    ~df_source_labels['session_id'].isin(problem_sessions)
]

# Preview a fixed-seed random sample so the displayed rows are reproducible.
with pd.option_context('display.max_columns', None):
    display(df_source_labels.sample(n=3, random_state=51))

Unnamed: 0,session_id,question_num,correct,level_group
172317,21070319253640464,15,0,13-22
194865,21040512553883790,17,1,13-22
197728,22000108514966796,17,1,13-22


## Feature Engineering

In [9]:
# Build the starting features frame from the prepared events and labels.
df_features = fe.create_initial_features(
    df_source,
    df_source_labels,
)

# Preview the first rows with every column visible.
with pd.option_context('display.max_columns', None):
    display(df_features.head(n=6))

Unnamed: 0,session_id,level_group
0,20090312431273200,0-4
1,20090312431273200,13-22
2,20090312431273200,5-12
3,20090312433251036,0-4
4,20090312433251036,13-22
5,20090312433251036,5-12


In [10]:
# Add the elapsed-time features to the features dataset.
df_features = fe.add_elapsed_time_features(features=df_features, X=df_source)

# Preview the first rows with every column visible.
with pd.option_context('display.max_columns', None):
    display(df_features.head(n=6))

Unnamed: 0,session_id,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode
0,20090312431273200,0-4,0.001411,0.052535,0.0,0.023103,0.0
1,20090312431273200,13-22,0.04374,0.344602,0.226677,0.281804,0.30132
2,20090312431273200,5-12,0.010577,0.135014,0.060002,0.096641,0.060002
3,20090312433251036,0-4,0.001352,0.063074,0.0,0.026311,0.0
4,20090312433251036,13-22,0.324157,1.0,0.318718,0.676403,1.0
5,20090312433251036,5-12,0.021933,0.221287,0.072301,0.150206,0.072301


In [17]:
# Add the total count features to the features dataset.
# NOTE(review): the return value is not captured here, whereas the cell that
# follows makes the same call and assigns the result to df_features. This
# cell's execution count is also out of sequence. Confirm whether
# add_total_features mutates `features` in place; if it does not, this call
# has no lasting effect. Either way, one of the two cells should be removed.
fe.add_total_features(features=df_features, X=df_source)

# Preview the first rows with every column visible.
with pd.option_context('display.max_columns', None):
    display(df_features.head(n=6))

Unnamed: 0,session_id,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode,count_total_feature
0,20090312431273200,0-4,0.001411,0.052535,0.0,0.023103,0.0,"[0.08878224355128975, 0.08878224355128975, 0.0..."
1,20090312431273200,13-22,0.04374,0.344602,0.226677,0.281804,0.30132,"[0.39472105578884226, 0.39472105578884226, 0.4..."
2,20090312431273200,5-12,0.010577,0.135014,0.060002,0.096641,0.060002,"[0.24595080983803239, 0.24595080983803239, 0.2..."
3,20090312433251036,0-4,0.001352,0.063074,0.0,0.026311,0.0,"[0.057588482303539294, 0.057588482303539294, 0..."
4,20090312433251036,13-22,0.324157,1.0,0.318718,0.676403,1.0,"[1.0, 1.0, 1.0, 1.0, 0.5850556438791733]"
5,20090312433251036,5-12,0.021933,0.221287,0.072301,0.150206,0.072301,"[0.36472705458908217, 0.36472705458908217, 0.4..."


In [12]:
# Add the total count features to the features dataset, keeping the returned
# frame.
# NOTE(review): the preceding cell makes the same call without assigning the
# result. Verify that running both on a fresh kernel does not add the
# feature twice, and drop whichever cell is stale.
df_features = fe.add_total_features(features=df_features, X=df_source)

# Preview the first rows with every column visible.
with pd.option_context('display.max_columns', None):
    display(df_features.head(n=6))

Unnamed: 0,session_id,level_group,event_name,name,fqid,room_fqid,text_fqid
0,20090312431273200,0-4,0.088782,0.088782,0.064620,0.088782,0.054054
1,20090312431273200,13-22,0.394721,0.394721,0.402262,0.394721,0.480127
2,20090312431273200,5-12,0.245951,0.245951,0.276252,0.245951,0.257552
3,20090312433251036,0-4,0.057588,0.057588,0.053312,0.057588,0.050874
4,20090312433251036,13-22,1.000000,1.000000,1.000000,1.000000,0.585056
...,...,...,...,...,...,...,...
34939,22100219442786200,13-22,0.398320,0.398320,0.342488,0.398320,0.391097
34940,22100219442786200,5-12,0.224355,0.224355,0.235864,0.224355,0.225755
34941,22100221145014656,0-4,0.143971,0.143971,0.119548,0.143971,0.085851
34942,22100221145014656,13-22,0.802639,0.802639,0.693053,0.802639,0.572337


Unnamed: 0,session_id,level_group,elapsed_time_sum,elapsed_time_max,elapsed_time_min,elapsed_time_mean,elapsed_time_mode,count_total_feature
0,20090312431273200,0-4,0.001411,0.052535,0.0,0.023103,0.0,"[0.08878224355128975, 0.08878224355128975, 0.0..."
1,20090312431273200,13-22,0.04374,0.344602,0.226677,0.281804,0.30132,"[0.39472105578884226, 0.39472105578884226, 0.4..."
2,20090312431273200,5-12,0.010577,0.135014,0.060002,0.096641,0.060002,"[0.24595080983803239, 0.24595080983803239, 0.2..."
3,20090312433251036,0-4,0.001352,0.063074,0.0,0.026311,0.0,"[0.057588482303539294, 0.057588482303539294, 0..."
4,20090312433251036,13-22,0.324157,1.0,0.318718,0.676403,1.0,"[1.0, 1.0, 1.0, 1.0, 0.5850556438791733]"
5,20090312433251036,5-12,0.021933,0.221287,0.072301,0.150206,0.072301,"[0.36472705458908217, 0.36472705458908217, 0.4..."
