In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from ast import literal_eval
from matplotlib import pyplot as plt
from scipy.stats import t

In [31]:
# functions
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/testGrouped.csv"])

def add_address_of_reduced_url(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/references/url_references_reduced.csv"])

def create_markov_chain_of_given_cluster(cluster): # creating transition matrix of given cluster
    # assuming the cluster as a list of visit arrays
    number_of_occurences = { }

    for current_visit in cluster: # accessing cluster
        if len(current_visit) > 1: # skipping visits with only one page
            for x in range(len(current_visit)-1):
                if current_visit[x] not in number_of_occurences.keys():
                    number_of_occurences[current_visit[x]] = {} # create "from" page if empty

                if current_visit[x+1] not in number_of_occurences[current_visit[x]].keys():
                    number_of_occurences[current_visit[x]][current_visit[x+1]] = 1 # create "to" page if empty

                number_of_occurences[current_visit[x]][current_visit[x+1]] = number_of_occurences[current_visit[x]][current_visit[x+1]] + 1 # increasing visit number from page x to page y

    # in the end normalize to get prob. of each transition
    for key in number_of_occurences.keys(): # for each "from" page
        temp_sub_dict = number_of_occurences[key]
        sum_of_row = sum(temp_sub_dict.values()) # find total number of outward visits from this page
        for sub_key in temp_sub_dict.keys(): # adding final prob. values
            number_of_occurences[key][sub_key] = number_of_occurences[key][sub_key] / sum_of_row

    return number_of_occurences

# need to decide if we should alter the prob. in the end (perhaps compare with other prob. in the end?)
def calculate_prob_of_visit(given_visit, list_of_chains):
    highest_prob_of_seq = 0
    total_prob = 0
    for chain in list_of_chains:
        prob_of_seq = 1

        for x in range(len(given_visit)-1):
            temp_key = given_visit[x]
            temp_sub_key = given_visit[x+1]
            if temp_key in chain.keys():
                if temp_sub_key in chain[temp_key].keys():
                    prob_of_seq *= chain[temp_key][temp_sub_key]
                else: # TODO: what should we do if taken path is not on the chain
                    prob_of_seq = prob_of_seq**2 # since it's between 0-1 it will decrease
            else: # TODO: what should we do if taken path is not on the chain
                prob_of_seq = prob_of_seq**4 # larger penalty
        total_prob += prob_of_seq
        if highest_prob_of_seq < prob_of_seq: highest_prob_of_seq = prob_of_seq

    # Maybe instead return a percentage that compares with all other prob. for visual purposes?
    # return highest_prob_of_seq /total_prob
    return highest_prob_of_seq

def get_list(given_lists):
    temp = [given_list for given_list in given_lists] # should length of 0 not be added at all?
    return temp

In [4]:
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
conv = {"url_id_path": literal_eval, "seconds_spent_path": literal_eval}
df = pd.read_csv(add_address_of_data(my_address), converters = conv)

In [7]:
# this part goes through the clusters
list_of_markov_chains = []
clusters = []

# creating temp clusters
for x in range(0, 10000, 1000):
    clusters.append(get_list(df["url_id_path"].iloc[x:x+1000]))

for cluster in clusters: # saving each markov chain
    list_of_markov_chains.append(create_markov_chain_of_given_cluster(cluster))


In [29]:
calculate_prob_of_visit(get_list(df["url_id_path"].iloc[0]), list_of_markov_chains)

1
0.2919254658385093
0.12686480524757648
0.07381899486627988
0.010313799910673327
0.0005139767397345511
9.225223533697072e-05
2.807676727646935e-05
3.954474264291458e-06
8.922916288657649e-07
2.0913085051541365e-07
3.5391374702608465e-08
1
0.25
0.10848287112561175
0.06005301794453508
0.00935159566490884
0.0001416908434077097
2.2740505732101554e-05
7.375299156357261e-06
9.36545924616795e-07
2.139024642643297e-07
6.510074999349165e-08
1.727982870197618e-08
1
0.29896907216494845
0.10133767630130922
0.05534308521252312
0.006734415985299225
0.00019560544355640903
2.1661005494501672e-05
6.548676079733063e-06
3.2743380398665314e-07
5.054347309861089e-08
1.3538430294270773e-08
6.360336379858753e-09
1
0.184
0.0538989898989899
0.026544239386344648
0.00342362886557544
0.00013121922481299596
2.1472236787581158e-05
2.928032289215612e-06
8.296091486110901e-07
1.3952517499368333e-07
2.4265247824988404e-08
6.728091442383148e-09
1
0.25
0.10056089743589744
0.05302889768323851
0.007849042617374654
0.0003

4.128993715304321e-09

In [5]:
# trial
size = 10000
temp = get_list(df["url_id_path"].head(size))
a = create_markov_chain_of_given_cluster(temp)

[[188, 1557, 3, 1, 13, 14, 21, 16, 14, 18, 14, 5, 1556],
 [1557, 3, 1, 13, 1556],
 [978],
 [188, 194, 784],
 [23, 1557, 3, 13],
 [1557, 3, 1, 13, 1, 1559, 12, 1559, 17, 1556],
 [859],
 [186, 217, 186],
 [188],
 [1557, 3, 13, 23, 13],
 [23, 1557, 23, 1557, 13],
 [23, 1557, 27],
 [1557, 1, 1556, 1557],
 [1557, 3, 86, 3, 86, 3, 92, 3, 7, 19, 14, 18, 12, 18, 12, 1556],
 [1557, 13],
 [188, 228, 1557, 3, 1, 12, 7, 20, 1, 7, 1, 12, 7, 13, 188, 1, 1556],
 [1557, 3, 1, 17, 956],
 [1557, 956],
 [1557],
 [1557, 1, 17, 12, 17, 13],
 [1557, 3, 13, 1556],
 [1557],
 [1557],
 [1557, 3, 13],
 [809],
 [1557, 3, 13, 1556, 1557, 13, 1556],
 [1557],
 [1557, 23, 1557, 23, 1557, 3, 13, 956],
 [1557,
  3,
  1,
  13,
  12,
  802,
  7,
  19,
  20,
  24,
  1556,
  1557,
  1,
  12,
  20,
  24,
  57,
  25,
  20,
  1556],
 [1557, 23, 1557, 23, 1557],
 [1557, 3, 1, 12, 19, 13],
 [504, 206],
 [3, 1, 13, 1556],
 [1557],
 [1557, 3, 13, 1556],
 [188],
 [199],
 [905],
 [1557, 3, 1, 17, 33, 13, 12, 17],
 [1557, 1, 13, 33,