# Phase 20: Exit Model (Model 2) - Data Pre-processing
=====================================================
This notebook transforms the raw RetailRocket `events.csv` dataset into sequential clickstream data (windowed sequences) used to train the Abandonment Prediction models.

In [1]:
import pandas as pd
import numpy as np
import os

# All locations are resolved relative to the notebooks/ directory.
# Processed arrays are written under OUTPUT_DIR; the raw RetailRocket
# export is read from DATA_DIR.
OUTPUT_DIR = "../data/processed"
DATA_DIR = "../scripts/data"
INPUT_PATH = os.path.join(DATA_DIR, "events.csv")

# Echo the resolved input path so a wrong working directory is obvious early.
print(f"Loading raw data from: {INPUT_PATH}")

Loading raw data from: ../scripts/data\events.csv


In [2]:
# Fail fast when the raw export is missing. Merely printing a message would
# leave `df` undefined, and every later cell would then fail with an
# unrelated NameError that hides the real cause.
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(f"❌ Error: events.csv not found at {INPUT_PATH}.")

# Raw event log; later cells read the `visitorid`, `timestamp` and `event`
# columns from this frame.
df = pd.read_csv(INPUT_PATH)
print(f"✅ Loaded {len(df):,} events.")

✅ Loaded 2,756,101 events.


### 1. Sequential Mapping
We map each RetailRocket event type to an integer index used by the model's embedding layer. Index 0 is reserved for padding, so real events start at 1.

In [3]:
# Integer code assigned to each RetailRocket event type. Codes start at 1
# because the padded arrays built later use 0 for empty positions.
EVENT_TO_PAGE = {
    'view': 1,         # product page viewed
    'addtocart': 2,    # item added to the cart
    'transaction': 3,  # purchase completed
}

# Attach the code column and order events chronologically per visitor in a
# single chained expression; sessionization relies on this ordering.
# Event names missing from EVENT_TO_PAGE map to NaN.
df = (
    df
    .assign(page_type=df['event'].map(EVENT_TO_PAGE))
    .sort_values(['visitorid', 'timestamp'])
)

### 2. Sessionization
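Group each visitor's events into sessions. A new session starts when the visitor changes or when more than 30 minutes pass between two consecutive events of the same visitor. Sessions with fewer than two events are discarded. A session is labeled `0` if it contains a transaction and `1` (abandoned) otherwise.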

In [4]:
def create_sessions(events_df, gap_minutes=30):
    """Split a visitor-ordered event log into sessions with abandonment labels.

    A new session begins whenever the visitor id changes or the time between
    two consecutive events of the same visitor exceeds ``gap_minutes``.
    Sessions with fewer than two events are discarded.

    Args:
        events_df: DataFrame with ``visitorid``, ``timestamp`` and
            ``page_type`` columns, already sorted by visitor and then by
            timestamp. Timestamps are treated as milliseconds.
        gap_minutes: Inactivity threshold, in minutes, that closes a session.

    Returns:
        A ``(sessions, session_labels)`` tuple. ``sessions`` is a list of
        sessions, each a list of ``(page_type, timestamp)`` tuples in event
        order. ``session_labels`` is aligned with it and holds ``0`` when the
        session contains a transaction (page type 3) and ``1`` otherwise.
    """
    gap_ms = gap_minutes * 60 * 1000
    sessions = []
    session_labels = []

    def _flush(session):
        # Keep a finished session only if it has at least two events.
        if len(session) >= 2:
            sessions.append(session)
            has_tx = any(page == 3 for page, _ in session)
            session_labels.append(0 if has_tx else 1)

    current_visitor = None
    current_session = []
    last_ts = None

    print("Creating sequences... (This may take a minute)")
    # Zipping the three columns yields the same values as iterrows() without
    # building a Series object for every row.
    rows = zip(events_df['visitorid'], events_df['timestamp'], events_df['page_type'])
    for v_id, ts, pt in rows:
        visitor_changed = v_id != current_visitor
        # Compare against None explicitly so a legitimate timestamp of 0 is
        # not mistaken for "no previous event".
        timed_out = last_ts is not None and ts - last_ts > gap_ms
        if visitor_changed or timed_out:
            _flush(current_session)
            current_session = []
            current_visitor = v_id

        current_session.append((pt, ts))
        last_ts = ts

    # The loop only closes a session when the next one starts, so the session
    # still open at the end of the data must be flushed here.
    _flush(current_session)

    return sessions, session_labels

# Sessionize the visitor/time-ordered event frame using the function's
# default inactivity gap, then report how many sessions were kept.
sessions_raw, labels = create_sessions(events_df=df)
print(f"Generated {len(sessions_raw):,} sessions.")

Creating sequences... (This may take a minute)
Generated 382,780 sessions.


### 3. Padding and Output
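Compute the time spent on each event (capped at 600 seconds and scaled to the range [0, 1]), then truncate or zero-pad every session to a fixed length of 20 events. The last event of a session has no successor, so it receives a small default duration.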

In [6]:
# Fixed sequence length: sessions longer than MAX_LEN keep only their first
# MAX_LEN events, shorter ones are left zero-padded on the right.
MAX_LEN = 20
n_sessions = len(sessions_raw)
X_page = np.zeros((n_sessions, MAX_LEN), dtype=np.int64)
X_dur = np.zeros((n_sessions, MAX_LEN), dtype=np.float32)
y = np.array(labels, dtype=np.float32)

for row, events in enumerate(sessions_raw):
    n_events = len(events)
    # Pair every event with its successor; the time until the next event is
    # that event's duration. The slices cap the walk at MAX_LEN positions.
    consecutive = zip(events[:MAX_LEN], events[1:MAX_LEN + 1])
    for col, (current, following) in enumerate(consecutive):
        seconds = (following[1] - current[1]) / 1000  # timestamps treated as ms
        X_page[row, col] = current[0]
        # Clip at 600 seconds, then scale into [0, 1].
        X_dur[row, col] = min(seconds, 600) / 600.0
    # The final event has no successor, so when it fits inside the window it
    # is written separately with a small placeholder duration.
    if n_events <= MAX_LEN:
        last_col = n_events - 1
        X_page[row, last_col] = events[-1][0]
        X_dur[row, last_col] = 0.05

# Persist the three aligned arrays for the training step.
os.makedirs(OUTPUT_DIR, exist_ok=True)
np.save(os.path.join(OUTPUT_DIR, "X_page_real.npy"), X_page)
np.save(os.path.join(OUTPUT_DIR, "X_dur_real.npy"), X_dur)
np.save(os.path.join(OUTPUT_DIR, "y_abandon_real.npy"), y)

print("✅ Saved sequential data to data/processed/")
print(f"Abandonment Rate: {y.mean()*100:.1f}%")

✅ Saved sequential data to data/processed/
Abandonment Rate: 96.4%
