In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Load in Grouped Data File

In [None]:
# The two path columns are run through literal_eval while the CSV is read,
# so each cell becomes the Python object its text spells out.
generic = literal_eval

conv = dict.fromkeys(('url_id_path', 'seconds_spent_path'), generic)

df = pd.read_csv('DATA/Data/allDataOneFile.csv', converters=conv)

# Load in Test Data File
### Used to test the validity and correctness of the methods below

In [3]:
# Test fixture, parsed with the same column converters as the main data file.
testDF = pd.read_csv(
    'DATA/Data/testVis.csv',
    converters=conv,
)

# Method to get the number of visits which visit an unscraped URL

In [33]:
def get_number_of_visits_to_unscraped_urls(data=None, max_scraped_url_id=1559):
    """Count the visits whose path contains at least one unscraped URL.

    A URL id greater than ``max_scraped_url_id`` is treated as unscraped.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``url_id_path`` column holding an iterable of URL ids
        per visit. Defaults to the notebook-level ``df``.
    max_scraped_url_id : int, optional
        Highest URL id that counts as scraped (default 1559).

    Returns
    -------
    tuple
        ``(count, indices)`` where ``indices`` lists the index *labels* of
        the matching rows (in frame order) and ``count == len(indices)``.
    """
    frame = df if data is None else data
    # Iterating the column directly avoids building a Series per row (as
    # iterrows does), and any() stops scanning a path at the first hit.
    indices = [
        label
        for label, path in frame['url_id_path'].items()
        if any(url_id > max_scraped_url_id for url_id in path)
    ]
    return len(indices), indices

# The % of visits which visit an unscraped URL
11% of visits visit an unscraped URL.

# Method to get number of visits less than 20 seconds and path length of those visits

In [5]:
def get_number_of_visits_less_20_seconds_and_their_path_lengths(
    data=None, threshold=20, at_least=False
):
    """Collect duration and path-length statistics for short (or long) visits.

    A visit's duration is the sum of its ``seconds_spent_path`` entries.
    By default visits with a duration strictly below ``threshold`` are
    selected; pass ``at_least=True`` to select visits with a duration of
    ``threshold`` or more instead, rather than editing the comparison.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``url_id_path`` and ``seconds_spent_path`` columns.
        Defaults to the notebook-level ``df``.
    threshold : int or float, optional
        Duration boundary in the same unit as ``seconds_spent_path``
        (default 20).
    at_least : bool, optional
        ``False`` (default) selects ``duration < threshold``;
        ``True`` selects ``duration >= threshold``.

    Returns
    -------
    list
        ``[times_spent, visit_count, path_lengths]`` where ``times_spent``
        and ``path_lengths`` hold one entry per selected visit, in frame
        order, and ``visit_count`` is the number of selected visits.
    """
    frame = df if data is None else data
    times_spent = []
    path_lengths = []
    for path, seconds in zip(frame['url_id_path'], frame['seconds_spent_path']):
        duration = sum(seconds)
        selected = duration >= threshold if at_least else duration < threshold
        if selected:
            times_spent.append(duration)
            path_lengths.append(len(path))
    return [times_spent, len(times_spent), path_lengths]

In [None]:
# The function returns [times_spent, visit_count, path_lengths]; unpack in that order.
returned = get_number_of_visits_less_20_seconds_and_their_path_lengths()
times_20, visits_less_20, path_lengths_under_20 = returned

In [30]:
# Fraction of all rows in df that were counted as visits under 20 seconds.
share_under_20 = visits_less_20 / len(df.index)
print(share_under_20)

0.3356965426419714


In [31]:
# Mean path length over the visits under 20 seconds.
mean_path_length_under_20 = sum(path_lengths_under_20) / len(path_lengths_under_20)
print(mean_path_length_under_20)

1.1697623223123246


In [32]:
# Mean total time spent over the visits under 20 seconds.
mean_time_under_20 = sum(times_20) / len(times_20)
print(mean_time_under_20)

1.9427805207294973


# Information about visits that are less than 20 seconds in duration

## The % of visits that are less than 20 seconds in duration
33.5% of visits are less than 20 seconds in duration.

## The average path lengths in visits under 20 seconds in duration
1.1697623223123246 is the average path length.

## Average time spent in visits under 20 seconds in duration
1.9427805207294973 is the average time spent.

In [None]:
# Visits lasting 20 seconds or longer: how many there are and their path lengths.
# Computed directly from df: by default
# get_number_of_visits_less_20_seconds_and_their_path_lengths() returns a
# three-element list for the visits *under* 20 seconds, so it can neither be
# unpacked into these two names nor does it select the visits wanted here.
long_visits = df[df['seconds_spent_path'].map(sum) >= 20]
visits_greater_20 = len(long_visits)
path_lengths = long_visits['url_id_path'].map(len).tolist()

In [20]:
# Fraction of all rows in df that were counted in visits_greater_20.
share_at_least_20 = visits_greater_20 / len(df.index)
print(share_at_least_20)

0.6643034573580285


In [21]:
# Mean of the collected path lengths.
mean_path_length = sum(path_lengths) / len(path_lengths)
print(mean_path_length)

7.4080256085265805


In [None]:
# Total duration of every visit lasting 20 seconds or longer, as a flat list.
# Computed directly from df: the helper function returns
# [times_spent, visit_count, path_lengths] for visits *under* 20 seconds by
# default, so binding its whole return value here would make sum(times_spent) fail.
visit_durations = df['seconds_spent_path'].map(sum)
times_spent = visit_durations[visit_durations >= 20].tolist()

In [22]:
# Mean of the collected visit durations.
mean_time_spent = sum(times_spent) / len(times_spent)
print(mean_time_spent)

482.03758769414577


# Information about visits over or equal to 20 seconds

## The % of visits over or equal to 20 seconds
66.4% of visits are over or equal to 20 seconds

## The average path length in visits over or equal to 20 seconds
On average, 7.4 pages are visited per path.

## The average time spent by visitors with visits over or equal to 20 seconds
482 seconds or over 8 minutes!


## Method to get the number of visits with path length <= 3 and duration < 20 seconds

In [49]:
def get_number_visits_path_less_equal_3_and_20_seconds(
    data=None, max_path_length=3, max_seconds=20
):
    """Find visits that are both short in path length and short in duration.

    A visit is selected when its path has at most ``max_path_length`` URL
    ids and the sum of its ``seconds_spent_path`` entries is strictly below
    ``max_seconds``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``url_id_path`` and ``seconds_spent_path`` columns.
        Defaults to the notebook-level ``df``.
    max_path_length : int, optional
        Largest path length that still counts as short (default 3).
    max_seconds : int or float, optional
        Exclusive upper bound on the total visit duration (default 20).

    Returns
    -------
    tuple
        ``(visit_count, indices)`` where ``indices`` lists the index
        *labels* of the selected rows (in frame order) and
        ``visit_count == len(indices)``.
    """
    frame = df if data is None else data
    # zip over the index and the two columns instead of iterrows, which
    # builds a full Series for every row.
    rows = zip(frame.index, frame['url_id_path'], frame['seconds_spent_path'])
    indices = [
        label
        for label, path, seconds in rows
        if len(path) <= max_path_length and sum(seconds) < max_seconds
    ]
    return len(indices), indices

In [None]:
# The function returns (visit_count, indices); note that `times` therefore
# receives the row index labels, despite its name.
short_visit_result = get_number_visits_path_less_equal_3_and_20_seconds()
number_visits_under_4_and_20, times = short_visit_result

# Removing the 11% of visits that include an unscraped page
## TODO: motivate this assumption in the report/presentation

In [None]:
# Returns (count, index labels) for the visits that touch an unscraped URL.
unscraped_result = get_number_of_visits_to_unscraped_urls()
counts, indexes_of_unscraped_visits = unscraped_result

# Drop the rows which contain visits to unscraped URLs and save to csv

In [40]:
# indexes_of_unscraped_visits holds index *labels* (collected while iterating
# df), so drop by label. Looking them up positionally via df.index[...] is
# only correct while df still has an untouched 0..n-1 index; dropping by label
# also raises a KeyError on an accidental re-run instead of silently removing
# other rows.
df.drop(index=indexes_of_unscraped_visits, inplace=True)

In [42]:
# Write the current df to disk; the index is not written to the file.
df.to_csv(
    'DATA/Data/dataNoUnscrapedVisits.csv',
    index=False,
)

# Recheck the number of visits with path length <= 3 and duration < 20 seconds

In [None]:
# Re-run the short-visit search on the current df: (count, index labels).
short_visits_after_drop = get_number_visits_path_less_equal_3_and_20_seconds()
counts_path_time, indexes_of_visits_under_3_and_20 = short_visits_after_drop

In [60]:
# Number of visits with a path of at most 3 pages and a total duration under 20 seconds.
counts_path_time # Number of visits

1852742

In [63]:
# Fraction (not percentage) of the current rows that are short visits and
# will be removed: about 0.34, i.e. 34%.
counts_path_time / df.shape[0]

0.3408249222045705

# Removing the 34%
### Motivate these assumptions etc.

## Dropping the rows whose index labels belong to the 34% of 'short' visits

In [65]:
# indexes_of_visits_under_3_and_20 holds index *labels*. After the earlier
# in-place drop the index has gaps, so labels no longer equal positions and
# df.index[...] would pick the wrong rows (or go out of bounds). Drop by
# label instead.
df.drop(index=indexes_of_visits_under_3_and_20, inplace=True)

In [67]:
# Write the filtered df to disk; the index is not written to the file.
df.to_csv(
    'DATA/Data/dataNoUnscrapedVisitsOrUnder20Sec.csv',
    index=False,
)

In [2]:
# Reload the filtered file. No converters are passed here, so url_id_path and
# seconds_spent_path are read back as plain strings rather than lists.
df = pd.read_csv(
    'DATA/Data/dataNoUnscrapedVisitsOrUnder20Sec.csv',
)

In [3]:
# Display the reloaded frame (the notebook output shows its first and last rows).
df

Unnamed: 0,visitor_id[visit_number],url_id_path,seconds_spent_path,start_number,visited_mijnomgeving,event_src_site_id
0,0[1],"[188, 1557, 3, 1, 13, 14, 21, 16, 14, 18, 14, ...","[4, 17, 5, 7, 31, 27, 126, 55, 9, 13, 3, 328, 5]",1644099614035,1,0
1,1[1],"[1557, 3, 1, 13, 1556]","[14, 4, 4, 33, 0]",1642603154936,1,1
2,3[1],"[188, 194, 784]","[5, 20, 0]",1664380711086,0,0
3,4[1],"[23, 1557, 3, 13]","[0, 163, 4, 151]",1642669318114,1,1
4,5[1],"[1557, 3, 1, 13, 1, 1559, 12, 1559, 17, 1556]","[12, 14, 34, 66, 358, 9, 18, 19, 908, 0]",1651248315219,1,1
...,...,...,...,...,...,...
3583305,3225293[14],"[1557, 54, 1557, 3, 37, 7, 13, 1556]","[30, 324, 131, 18, 93, 43, 20, 6]",1654507084598,1,1
3583306,3225293[15],"[23, 1557, 23, 1557, 13, 12]","[32, 0, 187, 95, 31, 0]",1658391870260,1,1
3583307,3225294[1],"[1557, 121, 14, 17, 7, 12, 13]","[15, 0, 7, 21, 2, 61, 0]",1655666642494,1,1
3583308,3225294[3],"[1557, 14, 121, 17]","[19, 0, 5, 0]",1656063998310,1,1
