In [1]:
import pandas as pd
import numpy as np

In [70]:
# Load the raw click-level export into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='DATA/Data/allDataOneFile.csv')

In [84]:
# Empty, one-row-per-visit result frame. The columns are declared here and
# filled in once the click-level rows have been grouped by visit.
grouped_df = pd.DataFrame(
    columns=[
        # 'Primary key' of the table: str(visitor_id) + '[' + str(visit_number) + ']'.
        'visitor_id[visit_number]',
        # List of the url ids the visitor followed during the visit
        # (consecutive rows on the same url are collapsed into one entry).
        'url_id_path',
        # List of the seconds spent on each entry of url_id_path.
        'seconds_spent_path',
        # Time the visit began; per the original notes this is milliseconds
        # since epoch -- TODO confirm the unit against the data source.
        'start_number',
        # 1/0 flag: 1 if any row of the visit has is_mijn_omgeving_url == 1.
        'visited_mijnomgeving',
        # Id value for the source of the visit (abp, mijnabp, etc.).
        'event_src_site_id',
    ]
)

In [None]:
# Collapse the click-level rows of `df` into one record per visit, where a
# visit is identified by the (visitor_id, visit_number) pair.
#
# Every accumulator below is keyed by visit_id = "<visitor_id>[<visit_number>]".
# Keys are inserted into all dicts at the same moment (first row of a visit),
# so the dicts share the same key order: order of first appearance in `df`.
unique_visits = set()
url_id_paths = {}          # visit_id -> list of page_url_id, consecutive repeats collapsed
seconds_spent_paths = {}   # visit_id -> list of seconds, aligned with url_id_paths
start_numbers = {}         # visit_id -> start_number of the visit's first row
visited_mijnomgeving = {}  # visit_id -> 1 if any row had is_mijn_omgeving_url == 1, else 0
event_src_site_ids = {}    # visit_id -> event_src_site_id of the visit's first row

# Print a progress line every this many processed rows.
PROGRESS_EVERY = 5_000_000

# Walk only the needed columns in lockstep rather than df.iterrows():
# iterrows() builds a Series per row (very slow at this scale) and coerces each
# row to one common dtype, which can turn integer ids into floats before they
# are stringified into visit_id.
needed_columns = (
    'visitor_id',
    'visit_number',
    'page_url_id',
    'seconds_spent',
    'start_number',
    'is_mijn_omgeving_url',
    'event_src_site_id',
)

rows_done = 0
for rows_done, (
    visitor_id,
    visit_number,
    page_url_id,
    seconds_spent,
    start_number,
    is_mijn_omgeving_url,
    event_src_site_id,
) in enumerate(zip(*(df[name] for name in needed_columns)), start=1):
    # Concatenate the id and number to get visit_id
    visit_id = str(visitor_id) + '[' + str(visit_number) + ']'
    on_mijn_omgeving = is_mijn_omgeving_url == 1

    if visit_id not in unique_visits:
        # First row of a new visit: start its path and record the per-visit values.
        unique_visits.add(visit_id)
        url_id_paths[visit_id] = [page_url_id]
        seconds_spent_paths[visit_id] = [seconds_spent]
        start_numbers[visit_id] = start_number
        visited_mijnomgeving[visit_id] = 1 if on_mijn_omgeving else 0
        event_src_site_ids[visit_id] = event_src_site_id
    else:
        # Further row of a known visit.
        if url_id_paths[visit_id][-1] == page_url_id:
            # Still on the same URL as the previous row: accumulate the time.
            seconds_spent_paths[visit_id][-1] += seconds_spent
        else:
            # Moved to a different URL: extend both lists in step.
            url_id_paths[visit_id].append(page_url_id)
            seconds_spent_paths[visit_id].append(seconds_spent)
        # The flag is sticky: once a visit has hit mijn omgeving it stays 1.
        if on_mijn_omgeving:
            visited_mijnomgeving[visit_id] = 1

    # rows_done counts rows actually processed (starts at 1 for the first row),
    # so the message is printed after exactly N rows, not N - 1.
    if rows_done % PROGRESS_EVERY == 0:
        print(f"{rows_done} rows done")

In [None]:
# Build the one-row-per-visit DataFrame from the per-visit dictionaries.
#
# The frame is constructed in a single call instead of assigning columns into
# a pre-existing grouped_df: column-wise assignment raises a length-mismatch
# ValueError if this cell is re-run after the number of visits has changed.
# Values are looked up by visit id, so row alignment does not depend on the
# dictionaries sharing the same iteration order.
visit_ids = list(url_id_paths.keys())
grouped_df = pd.DataFrame({
    'visitor_id[visit_number]': visit_ids,
    'url_id_path': [url_id_paths[v] for v in visit_ids],
    'seconds_spent_path': [seconds_spent_paths[v] for v in visit_ids],
    'start_number': [start_numbers[v] for v in visit_ids],
    'visited_mijnomgeving': [visited_mijnomgeving[v] for v in visit_ids],
    'event_src_site_id': [event_src_site_ids[v] for v in visit_ids],
})

In [87]:
# Persist the grouped visits as CSV; the DataFrame index is not written.
grouped_df.to_csv(path_or_buf='DATA/Data/testGrouped.csv', index=False)