### RNN Model for churn or next event prediction

In [None]:
import os
import signal

import pandas as pd
import plotly.express as px

from customer_analysis.pipelines import RNNPipeline
from customer_analysis.utils.nn_datasets import EventSequenceDataPreparation
from customer_analysis.utils.visualization import plot_rnn_attention_heatmap
from customer_analysis.utils.data_processing import read_rnn_attention_data, \
    get_sequences_above_weights_threshold


In [None]:
import warnings

# Silence UserWarnings raised from the distutils modules.
for _noisy_module in ('distutils', '_distutils_hack'):
    warnings.filterwarnings(
        "ignore", category=UserWarning, module=_noisy_module)


In [None]:
# Select the task to run on the data below:
#   True  -> churn prediction, False -> next-event prediction.
# NOTE: the "task" param in config.json must be changed accordingly.
churn = False


### Prepare BigQuery [GA4] data

In [None]:
# Read the two newline-delimited JSON exports and stack them into one frame
# with a fresh 0..n-1 index.
file_1, file_2 = (
    "../data/bq-results-20230807_100k_20210130.json",
    "../data/bq-results-20230807_100k_20210131.json",
)
df1, df2 = (pd.read_json(path, lines=True) for path in (file_1, file_2))

df = pd.concat([df1, df2], axis=0, ignore_index=True)


In [None]:
# Quick overview of the combined frame (columns, dtypes, non-null counts).
df.info()

#### Prepare mapping dictionaries

In [None]:
# Index <-> event-name lookups. The padding token is appended last, so it
# receives the highest index.
unique_events = df['event_name'].unique()
idx2even = dict(enumerate(unique_events))
idx2even[len(idx2even)] = '<PAD>'

# Reverse lookup: event name -> index.
even2idx = dict(zip(idx2even.values(), idx2even.keys()))
idx2churn = {0: 'Not_Churned', 1: 'Churned'}

idx2even


In [None]:
# The event names are aggregated into per-session / per-user lists further
# down, so the column is renamed to reflect its role as sequence source.
df.rename(columns={'event_name': 'events_sequence'}, inplace=True)


#### Prepare data for next event prediction - rather short sequences of events (per session)

In [None]:
# Extract session ID from the data
def get_session_id(event_params):
    """Return the GA4 session id stored in an event's parameter list.

    Args:
        event_params: Iterable of parameter dicts shaped like
            ``{'key': ..., 'value': {'int_value': ...}}``. A non-iterable
            value (e.g. ``None`` or ``NaN`` for a row that carries no
            parameters) is treated as "no parameters".

    Returns:
        The ``int_value`` of the first parameter whose key is
        ``'ga_session_id'``, or ``None`` when there is no such parameter.
    """
    try:
        params = iter(event_params)
    except TypeError:
        # Missing cell (None / NaN): there is nothing to search, and
        # raising here would abort the whole column-wide apply.
        return None

    for param in params:
        if param['key'] == 'ga_session_id':
            return param['value']['int_value']
    return None


# Tag every event row with the session id found in its parameters.
df['session_id'] = df['event_params'].map(get_session_id)


In [None]:
# One row per (user, session) holding that session's events in
# chronological order. Used for next-event prediction, not for churn.
grouped_session_df = (
    df.sort_values(['user_pseudo_id', 'event_timestamp'])
    .groupby(['user_pseudo_id', 'session_id'])['events_sequence']
    .apply(list)
    .to_frame()
    .reset_index()
)

grouped_session_df.info()

#### Prepare data for churn prediction - long sequences of events

The churn definition used below is based on:
- https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/blob/master/gaming/propensity-model/bqml/bqml_ga4_gaming_propensity_to_churn.ipynb

In [None]:
# Define churn per user from the first and last engagement timestamps.
# The offsets are expressed in microseconds (3600000000 == 1 hour,
# 60000000 == 1 minute).
_DAY_OFFSET = 3600000000 * 24      # 24h
_BOUNCE_OFFSET = 60000000 * 10     # 10min

firstlasttouch = df.groupby('user_pseudo_id')['event_timestamp'].agg(
    user_first_engagement='min',
    user_last_engagement='max')

returningusers = firstlasttouch.copy()
_first_seen = returningusers['user_first_engagement']
_last_seen = returningusers['user_last_engagement']

returningusers['ts_24hr_after_first_engagement'] = _first_seen + _DAY_OFFSET
# churned: no activity later than 24h after the first engagement.
returningusers['churned'] = (
    _last_seen < returningusers['ts_24hr_after_first_engagement']).astype(int)
# bounced: all activity happened within the first 10 minutes.
returningusers['bounced'] = (
    _last_seen <= _first_seen + _BOUNCE_OFFSET).astype(int)

# Users that churned without being mere bounces.
_churned_not_bounced = (
    returningusers['bounced'].eq(0) & returningusers['churned'].eq(1))
churned_idx = returningusers[_churned_not_bounced].index


In [None]:
# Assign churn to previously determined user IDs.
# `isin` performs a single vectorized membership test against the churned
# user index instead of calling a Python lambda once per event row.
df['is_churned'] = df['user_pseudo_id'].isin(churned_idx).astype(int)


In [None]:
# One row per user (together with the churn flag) holding all of that
# user's events in chronological order.
grouped_churn_df = (
    df.sort_values(['user_pseudo_id', 'event_timestamp'])
    .groupby(['user_pseudo_id', 'is_churned'])['events_sequence']
    .apply(list)
    .to_frame()
    .reset_index()
)

grouped_churn_df.info()


#### Assign data to final df - based on 'task'

In [None]:
# Pick the frame that matches the task selected through `churn`.
if churn:
    grouped_df = grouped_churn_df
else:
    grouped_df = grouped_session_df

# Keep only sequences that contain more than one event.
_has_multiple_events = grouped_df['events_sequence'].map(len) > 1
grouped_df = grouped_df[_has_multiple_events]

grouped_df.head()

#### Exclude longest event sequence and shuffle the data

In [None]:
# Ten longest sequences (index label -> number of events).
lengths = grouped_df['events_sequence'].map(len)
top_10_lengths = lengths.nlargest(n=10)
top_10_lengths


In [None]:
# Drop the longest sequence <- because of padding and for computation time.
# `grouped_df` was produced by boolean filtering, so it is reassigned here
# instead of being dropped in place: mutating such a derived frame is what
# pandas flags with SettingWithCopyWarning.
grouped_df = grouped_df.drop(index=top_10_lengths.index[0])


In [None]:
# Recompute the ten longest sequences on the current frame.
lengths = grouped_df['events_sequence'].map(len)
top_10_lengths = lengths.nlargest(n=10)
top_10_lengths


In [None]:
# Shuffle all rows (frac=1 keeps every row); the fixed seed makes the
# order reproducible between runs.
grouped_df = grouped_df.sample(frac=1, random_state=0)


#### Create TensorDataset

In [None]:
# Both preparations share every setting except `include_test_targets`.
_dataset_kwargs = dict(
    padding_value=even2idx['<PAD>'],
    event2idx=even2idx,
    input_df=grouped_df,
    sequence_column='events_sequence',
    target_column='is_churned' if churn else None,
    model_type='rnn',
    n_last_events=500,
    train_size=0.7,
    val_size=0.15,
)

train_tensor_dataset = EventSequenceDataPreparation(
    include_test_targets=True, **_dataset_kwargs)

test_tensor_dataset = EventSequenceDataPreparation(
    include_test_targets=False, **_dataset_kwargs)


In [None]:
# Train / validation / test splits of the preparation that includes test
# targets.
true_train_dataset, true_valid_dataset, true_test_dataset = (
    train_tensor_dataset[split] for split in ('train', 'val', 'test'))

# Test split of the preparation built with include_test_targets=False,
# i.e. carrying no real ("dummy") targets.
dummy_test_dataset = test_tensor_dataset['test']


### Model init and training

In [None]:
# Model init
config_path = '../config/nn/rnn_config.json'

# Two outputs for churn, otherwise one per vocabulary entry (padding
# included). NOTE(review): presumably the number of output classes -
# confirm against RNNPipeline's signature.
if churn:
    _output_size = 2
else:
    _output_size = len(even2idx)

rnn = RNNPipeline(config_path, _output_size, even2idx['<PAD>'])


In [None]:
# Number of entries in the event vocabulary, '<PAD>' included.
len(even2idx)


In [None]:
# Train the pipeline on the training split.
# Passing the validation dataset is optional.
rnn.fit(true_train_dataset, true_valid_dataset)


In [None]:
# Report where the best model was saved, its score, and its parameters.
_metric_name = rnn.pipeline_params['grid_search_metric']

print(rnn.best_model_path + '.pth')
print(
    f"best score {_metric_name}: {rnn.best_score}")
rnn.best_params


### Predict

In [None]:
# When predicting on a dataset without targets (dummy_test_dataset), the
# reported accuracy (or any other metric) will be 'nan'.

# Option 1 - use a pretrained model loaded from disk.
# With this option the 'train'/'val' attention weights are missing; only
# the phase = 'test' attention weights will be available, and only if
# "save_attention_weights" is true in config.json.
_pretrained_model_path = (
    f'{rnn.pipeline_params["model_artifacts_path"]}'
    '/grid_model/RNNModel_events_task.pth')
preds = rnn.predict(true_test_dataset, _pretrained_model_path)

# Option 2 - use the model from the current run (training):
# preds = rnn.predict(true_test_dataset)


In [None]:
# Float predictions are probabilities: turn each into a 0/1 value by
# comparing it with the pipeline's probability threshold.
_first_value = preds[0][0]
if isinstance(_first_value, float):
    _binarized = []
    for p in preds:
        _binarized.append([int(p[0] > rnn.proba_thresold)])
    preds = _binarized
    print('Converted probabilities to bool.')


In [None]:
# Show the first ten predictions as readable labels.
_idx2label = idx2churn if churn else idx2even

translated_result = []
for row in preds[:10]:
    translated_result.append([_idx2label[idx] for idx in row])
print(translated_result)


### MLFlow - registered model staging

In [None]:
# Move the registered model to a new stage. Runs only when both MLflow and
# the 'regis_model_on_predict' option are enabled in the pipeline config.
if rnn.mlflow_config['enable'] and rnn.mlflow_config['regis_model_on_predict']:
    registered_model_name = rnn.mlflow_manager.registered_model_name
    registered_model_version = rnn.mlflow_manager.registered_model_version

    mlflow_server = rnn.mlflow_manager.start_mlflow_server()
    try:
        # Options are: ['None', 'Staging', 'Production', 'Archived']
        rnn.mlflow_manager.model_staging(
            model_name=registered_model_name,
            model_version=registered_model_version,
            model_stage="Staging"
        )
    finally:
        # Always stop the server started above, even when staging raised,
        # so that no server process group is left running.
        if mlflow_server:
            os.killpg(os.getpgid(mlflow_server.pid), signal.SIGTERM)
            print("\nTerminate local MLFlow server.")

    print(f"\n{registered_model_name = }, {registered_model_version = }")


### Results evaluation

In [None]:
# Build a per-sample table with the input sequence, the true target, the
# predicted target and whether the prediction was correct.
padded_sequences, targets, seq_len = true_test_dataset.tensors

padded_sequences_np = padded_sequences.squeeze(-1).numpy()
targets_np = targets.numpy()

results_df = pd.DataFrame(padded_sequences_np)


def extract_events(row):
    """Return the row's event indices as ints, padding entries removed."""
    events = row[row != even2idx['<PAD>']].astype(int).tolist()
    return events


results_df['sequences'] = results_df.apply(extract_events, axis=1)
results_df['seq_lengths'] = results_df['sequences'].apply(len)

results_df['True_Target'] = targets_np.astype(int)
results_df['True_Target'] = results_df['True_Target'].map(idx2even) \
    if not churn else results_df['True_Target'].astype(bool)

# Explicit copy: new columns are assigned below, and writing into a column
# selection of another frame is what pandas flags as chained assignment.
results_df = results_df[['sequences', 'True_Target']].copy()

# Convert predictions: when each prediction is a list of candidates, count
# it as the true target if the target is among them, otherwise fall back
# to the first candidate.
if isinstance(preds[0], list):
    preds = [
        target if target in pred else pred[0]
        for pred, target in zip(preds, targets_np)
    ]

results_df['Predicted_Target'] = preds
results_df['Predicted_Target'] = results_df['Predicted_Target'].map(idx2even) \
    if not churn else results_df['Predicted_Target'].astype(bool)

results_df['Correct_Prediction'] = results_df['True_Target'] == results_df['Predicted_Target']


In [None]:
# Share of samples whose prediction matched the true target.
_is_correct = results_df['Correct_Prediction']
accuracy = _is_correct.mean()
print(f"accuracy: {accuracy:.3f}")


In [None]:
# Distribution of the true targets.
results_df['True_Target'].value_counts()


In [None]:
# Distribution of the predicted targets.
results_df['Predicted_Target'].value_counts()


In [None]:
# Long format: one row per (sample, target type), then count how often
# each target value occurs per target type.
data_melted = results_df.melt(
    value_vars=['True_Target', 'Predicted_Target'],
    var_name='Target Type',
    value_name='Target')

data_count = (
    data_melted
    .groupby(['Target', 'Target Type'])
    .size()
    .reset_index(name='count')
)


In [None]:
# Grouped bar chart comparing true and predicted target counts.
_chart_title = {
    'text': 'Grouped Bar Chart of True and Predicted Targets',
    'x': 0.5,
    'xanchor': 'center',
}
_chart_width = 1100 if churn else 1500

fig = px.bar(
    data_count,
    x='Target',
    y='count',
    color='Target Type',
    barmode='group',
    text='count')
fig.update_layout(
    title=_chart_title,
    xaxis_title='Target',
    yaxis_title='Count',
    width=_chart_width,
    height=600)

fig.show()


### Plot attention weights

In [None]:
# Base path of the best model; the attention data is read from
# f'{pth}.json' in the cells below.
pth = rnn.best_model_path

# Selectors passed to read_rnn_attention_data below.
head_index = 0
batch_index = -4


In [None]:
# Read the attention data saved for the 'train' phase (only available
# when saving attention weights is enabled).
if rnn.pipeline_params['save_attention_weights']:
    _attention_file = f'{pth}.json'
    data = read_rnn_attention_data(
        file_path=_attention_file,
        phase='train',
        head_index=head_index,
        batch_index=batch_index)

    display(data.items())


In [None]:
# Plot the attention heatmap for the data read in the previous cell.
if rnn.pipeline_params['save_attention_weights']:
    _label_lookup = idx2churn if churn else idx2even
    plot_rnn_attention_heatmap(data, even2idx['<PAD>'], _label_lookup)


In [None]:
# Read the attention data saved for the 'test' phase (only available
# when saving attention weights is enabled).
if rnn.pipeline_params['save_attention_weights']:
    _attention_file = f'{pth}.json'
    data = read_rnn_attention_data(
        file_path=_attention_file,
        phase='test',
        head_index=head_index,
        batch_index=batch_index)

    display(data.items())


In [None]:
# Plot the attention heatmap for the data read in the previous cell.
if rnn.pipeline_params['save_attention_weights']:
    _label_lookup = idx2churn if churn else idx2even
    plot_rnn_attention_heatmap(data, even2idx['<PAD>'], _label_lookup)


In [None]:
# List event sequences whose attention weights exceed the threshold.
if rnn.pipeline_params['save_attention_weights']:
    # NOTE(review): the head key '0' is hard-coded while `head_index` is
    # configurable earlier in the notebook - confirm whether they should
    # be tied together.
    _head_data = data['heads']['0']
    sequences = get_sequences_above_weights_threshold(
        head_data=_head_data,
        idx2event=idx2even,
        weights_threshold=0.55,
        min_events=4)

    display(sequences)


### Additionally update wheel

In [None]:
# local:
# %cd /home/piotr/ggiitt/customer_analysis
# !python setup.py bdist_wheel
