In [62]:
import pandas as pd
import numpy as np
from collections import Counter
from ast import literal_eval
from matplotlib import pyplot as plt
from scipy.stats import t
import csv

In [58]:
# functions
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/cluster_paths.csv"])

def create_markov_chain_of_given_cluster(cluster): # creating transition matrix of given cluster
    # assuming the cluster as a list of visit arrays
    number_of_occurences = { }

    for current_visit in cluster: # accessing cluster
        if len(current_visit) > 1: # skipping visits with only one page
            for x in range(len(current_visit)-1):
                if current_visit[x] not in number_of_occurences.keys():
                    number_of_occurences[current_visit[x]] = {} # create "from" page if empty

                if current_visit[x+1] not in number_of_occurences[current_visit[x]].keys():
                    number_of_occurences[current_visit[x]][current_visit[x+1]] = 1 # create "to" page if empty

                number_of_occurences[current_visit[x]][current_visit[x+1]] = number_of_occurences[current_visit[x]][current_visit[x+1]] + 1 # increasing visit number from page x to page y

    # in the end normalize to get prob. of each transition
    for key in number_of_occurences.keys(): # for each "from" page
        temp_sub_dict = number_of_occurences[key]
        sum_of_row = sum(temp_sub_dict.values()) # find total number of outward visits from this page
        for sub_key in temp_sub_dict.keys(): # adding final prob. values
            number_of_occurences[key][sub_key] = number_of_occurences[key][sub_key] / sum_of_row

    return number_of_occurences

# need to decide if we should alter the prob. in the end (perhaps compare with other prob. in the end?)
def calculate_prob_of_visit(given_visit, list_of_chains):
    highest_prob_of_seq = 0
    total_prob = 0
    highest_prob_of_seq_label = 0
    label_count = 0
    for chain in list_of_chains:

        prob_of_seq = 1

        for x in range(len(given_visit)-1):
            temp_key = given_visit[x]
            temp_sub_key = given_visit[x+1]
            if temp_key in chain.keys():
                if temp_sub_key in chain[temp_key].keys():
                    prob_of_seq *= chain[temp_key][temp_sub_key]
                else: # TODO: what should we do if taken path is not on the chain
                    prob_of_seq = prob_of_seq**4 # since it's between 0-1 it will decrease
            else: # TODO: what should we do if taken path is not on the chain
                prob_of_seq = prob_of_seq**8 # larger penalty

        if highest_prob_of_seq < prob_of_seq:
            highest_prob_of_seq = prob_of_seq
            highest_prob_of_seq_label = label_count
        label_count += 1
        total_prob += prob_of_seq

    # Maybe instead return a percentage that compares with all other prob. for visual purposes?
    if total_prob <= 0:
        return 0, -1
    else:
        return highest_prob_of_seq /total_prob, highest_prob_of_seq_label
    # return highest_prob_of_seq, highest_prob_of_seq_label

def get_list(given_lists):
    temp = [given_list for given_list in given_lists] # should length of 0 not be added at all?
    return temp

In [3]:
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
conv = {"url_id_path": literal_eval, "seconds_spent_path": literal_eval}
df = pd.read_csv(add_address_of_data(my_address), converters = conv)

In [5]:
# this part goes through the clusters
list_of_markov_chains = []
clusters = []

# creating temp clusters
# for x in range(0, 10000, 1000):
#    clusters.append(get_list(df["url_id_path"].iloc[x:x+1000]))

# creating clusters with given file
for x in range(0, df["cluster_label"].max()):
    clusters.append(get_list(df.loc[df["cluster_label"] == x, "path"]))

for cluster in clusters: # saving each markov chain
    list_of_markov_chains.append(create_markov_chain_of_given_cluster(cluster))

In [35]:
# trial
calculate_prob_of_visit(get_list(df["path"].iloc[34]), list_of_markov_chains)

(0.5403888246399355, 7)

In [64]:
list_of_markov_chains[0]

{'[': {'1': 0.8338322056860499,
  '2': 0.11113489063795604,
  '6': 0.005020908840243719,
  '8': 0.011867421727475342,
  '3': 0.011115878076725141,
  '9': 0.015776444133430354,
  '7': 0.00661657039276368,
  '5': 0.0030897900555345963,
  '4': 0.0015458904498212719},
 '1': {'8': 0.08534055333592942,
  '5': 0.2424139992824969,
  ',': 0.26586273613977207,
  '3': 0.08210553427917003,
  '4': 0.11043746224676587,
  '6': 0.03301446351312816,
  '2': 0.08069996412484554,
  '7': 0.04834131070030141,
  '9': 0.027486378173188074,
  ']': 0.010488546013718413,
  '0': 0.008512162290613282,
  '1': 0.00529688990007083},
 '8': {'8': 0.23344863781363223,
  ',': 0.4588614205905008,
  '6': 0.03194925235788904,
  '0': 0.0222793119667595,
  '9': 0.034946997960917804,
  '7': 0.08120891556969342,
  '4': 0.02213384632933164,
  '5': 0.02907454376977724,
  ']': 0.029304597354960507,
  '3': 0.012856086423249894,
  '1': 0.019736546991809066,
  '2': 0.024199842871478867},
 ',': {' ': 1.0},
 ' ': {'1': 0.57003839454081

In [61]:
list_of_markov_chains.to_csv("C:/Users/dnaen/APG_data/list_of_markov_chains.csv", index=False)

AttributeError: 'list' object has no attribute 'to_csv'

In [59]:
# experiment
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size
row_size = len(df.axes[0])

for x in range(0, len(df.axes[0])):
    if df.loc[x]["cluster_label"] >= 0:
        estimated_prob, estimated_label = calculate_prob_of_visit(get_list(df["path"].iloc[x]), list_of_markov_chains)
        correct_label = df.loc[x]["cluster_label"]

        if estimated_label == correct_label:
            total_accurate_prob += estimated_prob
        else:
            total_false_prob += estimated_prob

accurate_estimation_prob = total_accurate_prob / row_size
false_estimation_prob = total_false_prob / row_size

print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

Accurate prediction with prob.
0.2361750165115125
False prediction with prob.
0.4311013386871281


In [51]:
"""
with "**2", "**4"
Accurate prediction with prob.
0.23586278072337818
False prediction with prob.
0.43129438494374694
"""
"""
with "**4", "**8"
Accurate prediction with prob.
0.2361750165115125
False prediction with prob.
0.4311013386871281
"""
"""
without "**2", "**4"
Accurate prediction with prob.
0.15800201996288196
False prediction with prob.
0.5001289899211507
"""

'[188, 1557, 1, 12, 7, 12, 19, 13, 12, 19, 13, 1559, 12, 19, 13, 1559, 1556, 1557, 1, 12, 13]'