In [1]:
import pandas as pd
import numpy as np
df_final_demo = pd.read_csv(r"/Users/muayadhilamia/Desktop/Ironhack/Week-5/Project/week5_6_project/data/raw/df_final_demo.txt")
df_final_experiment_clients = pd.read_csv(r"/Users/muayadhilamia/Desktop/Ironhack/Week-5/Project/week5_6_project/data/raw/df_final_experiment_clients.txt")
df_final_web_data_pt_1 = pd.read_csv(r"/Users/muayadhilamia/Desktop/Ironhack/Week-5/Project/week5_6_project/data/raw/df_final_web_data_pt_1.txt")
df_final_web_data_pt_2 = pd.read_csv(r"/Users/muayadhilamia/Desktop/Ironhack/Week-5/Project/week5_6_project/data/raw/df_final_web_data_pt_2.txt")


----------------------

                    -- Concating and merging 3 files to df_final_web_data

----------------------

In [2]:
# Concating df_final_web p1 and p2 and merging them with df_final_experiment_clients
df_final_web_data = pd.concat([df_final_web_data_pt_1, df_final_web_data_pt_2], ignore_index=True)

# Merging three tables 
df_web_experiment = pd.merge(
    df_final_web_data,
    df_final_experiment_clients,
    on="client_id",
    how="inner"
)

----------------------

                    -- Cleaning and sorting df_web_experiment

----------------------

In [3]:
# cleaning and droping duplicated and null values in Variation 
df_web_experiment = df_web_experiment.drop_duplicates()
# df_web_experiment = df_web_experiment.dropna(subset=['Variation'])

# replacing steps by numbers 
df_web_experiment["process_step"] = (df_web_experiment["process_step"].replace({
    "start": 1,
    "step_1": 2,
    "step_2": 3,
    "step_3": 4,
    "confirm": 5
}).astype(int))

# fixing date_time 
df_web_experiment['date_time'] = pd.to_datetime(df_web_experiment['date_time'], errors='coerce')

# sorting by client, visit and date_time
df_web_experiment = df_web_experiment.sort_values(
    by=['client_id', 'visit_id', 'date_time'],
    ascending=[True, True, True]
)

  df_web_experiment["process_step"] = (df_web_experiment["process_step"].replace({


----------------------

                    -- Adding new columns by group by to df_web_experiment

----------------------

In [4]:
# --- Safe copy of the DataFrame ---
df_web_experiment = df_web_experiment.copy()

# --- 1. Add time_from_prev_step in seconds ---
df_web_experiment['time_from_prev_step'] = (
    df_web_experiment
    .groupby('visit_id')['date_time']
    .diff()                           # difference between each step and the previous one within the same visit
    .dt.total_seconds()
    .astype('Int64')
)

# --- 2. Add time_from_start in seconds ---
df_web_experiment['time_from_start'] = (
    df_web_experiment
    .groupby('visit_id')['date_time']
    .transform(lambda x: x - x.min())  # time difference between the first step and each step
    .dt.total_seconds()
    .astype('Int64')
)

# --- 3. Add num_steps: total rows (steps) per visit_id ---
df_web_experiment['num_steps'] = (
    df_web_experiment
    .groupby('visit_id')['process_step']
    .transform('count')
)

# --- 4. Add num_visits: unique number of visits per client ---
df_web_experiment['num_visits'] = (
    df_web_experiment
    .groupby('client_id')['visit_id']
    .transform('nunique')
)

# --- 5. Get last step per visit_id ---
last_step_per_visit = (
    df_web_experiment
    .groupby('visit_id', as_index=False)['process_step']
    .max()
    .rename(columns={'process_step': 'last_step'})
)

# --- 6. Merge last_step info back to main DataFrame ---
df_web_experiment = df_web_experiment.merge(last_step_per_visit, on='visit_id', how='left')

# --- 7. Add completed column (True if last_step == 5) ---
df_web_experiment['completed'] = df_web_experiment['last_step'] == 5

# --- 8. Count how many times each step was repeated per visit_id ---
df_web_experiment['step_repeat_count'] = (
    df_web_experiment
    .groupby(['visit_id', 'process_step'])['process_step']
    .transform('count')
)

# --- 9. Add total_steps: number of unique process_step per visit_id ---
df_web_experiment['total_steps'] = (
    df_web_experiment
    .groupby('visit_id')['process_step']
    .transform('nunique')
)



# -- 10 Detect when a user moves backwards within a visit ---
df_web_experiment['step_diff'] = (
    df_web_experiment
    .groupby('visit_id')['process_step']
    .diff()  # difference between current and previous step
)

# Mark an error when step_diff < 0 (user went back)
df_web_experiment['error_flag'] = df_web_experiment['step_diff'] < 0


----------------------

                    -- Merging all of them together

----------------------

In [9]:
# Merging all together 
df_raw = pd.merge(
    df_web_experiment,
    df_final_demo,
    on="client_id",
    how="left"
)

----------------------

                    -- Cleaning df_raw

----------------

In [10]:
# making columns int and rounding "bal"
df_raw = df_raw.copy()

# columns i want to change to int
int_cols = ['client_id', 'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age',
            'num_accts', 'calls_6_mnth', 'logons_6_mnth']

# to round it
float_col = 'bal'

# trasnforming the columns 
for col in int_cols:
    df_raw[col] = (
        pd.to_numeric(df_raw[col], errors='coerce')                                         
        .round()                                           
        .astype('Int64')                                  
    )

# rounding float "bal"
df_raw[float_col] = (
    pd.to_numeric(df_raw[float_col], errors='coerce')
    .round(2)
)

# fixing date_time
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'], errors='coerce')

# sorting by client, visit and date_time
df_raw = df_raw.sort_values(
    by=['client_id', 'visit_id', 'date_time'],
    ascending=[True, True, True]
)

bins = [18, 25, 35, 45, 55, 65, 100]
labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
df_raw['age_group'] = pd.cut(df_raw['clnt_age'], bins=bins, labels=labels, right=False)




In [None]:
# df_raw.to_csv("df_cleand_raw_m")

In [7]:
pd.set_option('display.max_columns', None)