In [3]:
# load analytics.csv in dataframe
import pandas as pd

# load the csv file
imported_df = pd.read_csv('analytics.csv')

In [4]:
# Count events pr user_id
def count_events_per_user(df):
    # Group by user_id and count events
    grouped_df = df.groupby('user_id').size().reset_index(name='event_count')
    return grouped_df.sort_values(by='event_count', ascending=False)

count_events_per_user(imported_df).head(10)

Unnamed: 0,user_id,event_count
4,less-website,23
5,none,22
0,4624e222-3191-45a6-ba6a-a84594aafef6,1
1,4e1d2ac9-0df9-470d-a937-6ad83723429f,1
2,62a45b49-6aff-4c36-b55e-ff0648a04400,1
3,b75570a4-6341-4d70-9f0d-b684d7144ae1,1


In [5]:
import json

def merge_consecutive_time_spent_events(df):
    """
    Merges consecutive time-spent events with identical URLs by adding their duration values.
    
    Args:
        df: DataFrame with columns 'type', 'url', and 'payload' (JSON string with duration)
        
    Returns:
        DataFrame with merged consecutive time-spent events
    """
    # Create a copy of the input DataFrame to avoid modifying the original
    result = []
    i = 0
    
    while i < len(df):
        current_row = df.iloc[i].to_dict()
        
        # If this isn't a time-spent event, add it as-is and continue
        if current_row['type'] != 'time-spent':
            result.append(current_row)
            i += 1
            continue
        
        # We have a time-spent event - look for consecutive matches with same URL
        current_url = current_row['url']
        try:
            current_payload = json.loads(current_row['payload'])
        except TypeError:
            # If payload is not valid JSON, skip this row
            print(f"Invalid JSON in payload: {current_row['payload']}")
            result.append(current_row)
            i += 1
            continue
        # Initialize total duration with the current row's duration
        
        # Initialize with the current row's duration
        total_duration = current_payload.get('duration', 0)
        
        # Check subsequent rows
        j = i + 1
        while j < len(df) and df.iloc[j]['type'] == 'time-spent' and df.iloc[j]['url'] == current_url:
            next_payload = json.loads(df.iloc[j]['payload'])
            total_duration += next_payload.get('duration', 0)
            j += 1
        
        # If we found consecutive matches, create a merged event
        if j > i + 1:
            merged_payload = current_payload.copy()
            merged_payload['duration'] = total_duration
            
            # Add the merged event to our result
            merged_row = current_row.copy()
            merged_row['payload'] = json.dumps(merged_payload)
            result.append(merged_row)
            
            # Skip the rows we've merged
            i = j
        else:
            # No consecutive matches, keep the original row
            result.append(current_row)
            i += 1
    
    # Convert back to DataFrame
    return pd.DataFrame(result)

# Add a column that increments for each new sessionID it sees
def count_sessions(df):
    """
    Adds a session ID column to the DataFrame, incrementing for each new session ID.
    
    Args:
        df: DataFrame with a 'sessionID' column
        
    Returns:
        DataFrame with an added 'sessionID' column
    """
    # Create a copy of the input DataFrame to avoid modifying the original
    df = df.copy()
    
    # Initialize a new session ID column
    df['sessionCount'] = 0
    
    # Initialize the current session ID
    current_session_id = 0
    
    # Iterate through the DataFrame and assign session IDs
    for i in range(len(df)):
        if i == 0 or df.iloc[i]['session_id'] != df.iloc[i - 1]['session_id']:
            current_session_id += 1
        df.at[i, 'sessionCount'] = current_session_id
    
    return df

In [6]:
# zara userId = "1850b7a3-5cad-4d7b-a4a0-3c8dd534437d"
# 1: userId = "e2eb1291-5fec-45e6-bded-3018f0d67d66"
userId = "ddb693bf-df31-4881-a0de-5cca527b0504"
# 5b9790c0-43b6-4ba8-ac42-c97c3f3e109f	
user = imported_df[imported_df["user_id"] == userId]
user = merge_consecutive_time_spent_events(user)
userWithSession = count_sessions(user)

In [7]:
df = userWithSession[userWithSession['type'] != 'page-view']
df = df.drop(columns=['created_at', 'user_id', 'session_id'])
# Convert UTC timestamp to a more readable format
df['received_at'] = pd.to_datetime(df['received_at']).dt.strftime('%d/%m %H:%M:%S')

KeyError: 'type'

In [92]:
# print csv
df.to_csv("single.csv", index=False)