In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [69]:
# functions
def add_address_of_data(given_address): # could be useful for easily allowing others to use this file
    return "".join([given_address, "/cluster_paths.csv"])

def create_markov_chain_of_given_cluster(cluster): # creating transition matrix of given cluster
    # assuming the cluster as a list of visit arrays
    number_of_occurences = { }

    for current_visit in cluster: # accessing cluster
        if len(current_visit) > 1: # skipping visits with only one page
            for x in range(len(current_visit)-1):
                if current_visit[x] not in number_of_occurences.keys():
                    number_of_occurences[current_visit[x]] = {} # create "from" page if empty

                if current_visit[x+1] not in number_of_occurences[current_visit[x]].keys():
                    number_of_occurences[current_visit[x]][current_visit[x+1]] = 1 # create "to" page if empty

                number_of_occurences[current_visit[x]][current_visit[x+1]] = number_of_occurences[current_visit[x]][current_visit[x+1]] + 1 # increasing visit number from page x to page y

    # in the end normalize to get prob. of each transition
    for key in number_of_occurences.keys(): # for each "from" page
        temp_sub_dict = number_of_occurences[key]
        sum_of_row = sum(temp_sub_dict.values()) # find total number of outward visits from this page
        for sub_key in temp_sub_dict.keys(): # adding final prob. values
            number_of_occurences[key][sub_key] = number_of_occurences[key][sub_key] / sum_of_row

    return number_of_occurences

# calculates the probability of the given visit
def calculate_prob_of_visit(given_visit, list_of_chains):
    highest_prob_of_seq = 0
    total_prob = 0
    highest_prob_of_seq_label = 0
    label_count = 0

    # goes through each chain (cluster/label)
    for chain in list_of_chains:

        prob_of_seq = 1

        for x in range(len(given_visit)-1):
            temp_key = given_visit[x]
            temp_sub_key = given_visit[x+1]
            if temp_key in chain.keys():
                if temp_sub_key in chain[temp_key].keys():
                    prob_of_seq *= chain[temp_key][temp_sub_key]
                # else: # taken path (so "from" exists but "to" page does not) is not on the chain, so need to give some sort of penalty
                    # the penalty value is up to us
                    # prob_of_seq = prob_of_seq**4 # since it's between 0-1 it will decrease
            # else: # in this case "from" page doesn't exist too, need to give higher penalty
                # prob_of_seq = prob_of_seq**8 # larger penalty

        if highest_prob_of_seq < prob_of_seq:

            highest_prob_of_seq = prob_of_seq
            highest_prob_of_seq_label = label_count
        label_count += 1
        total_prob += prob_of_seq


    if total_prob <= 0: # means visit size is 1 or 0 hence immediately skipping all code above
        return 0, -1
    else:
        # instead of returning the actual prob., returning a percentage that compares with all other prob. for visual purposes
        return highest_prob_of_seq / total_prob, highest_prob_of_seq_label
        # this can be used instead for later, to define a min. threshold
        # return highest_prob_of_seq, highest_prob_of_seq_label

def get_list(given_lists):
    temp = [given_list for given_list in given_lists] # creating nested list
    return temp

In [35]:
my_address = "C:/Users/dnaen/APG_data"  # only this has to be modified
df = pd.read_csv(add_address_of_data(my_address))
train, test = train_test_split(df, test_size=0.2)

In [36]:
train.head()

Unnamed: 0,visit_id,cluster_label,path
7488,134151[1],0,"[3, 1, 14, 21, 14, 5]"
67826,348931[6],-1,"[23, 3, 13, 32, 1, 13, 32, 17, 32]"
82229,622175[25],1,"[188, 966, 188, 237, 188]"
9544,53852[1],11,"[188, 187, 3, 1, 13]"
62022,216927[11],-1,"[618, 901, 681, 901, 481, 439]"


In [37]:
# this part goes through the clusters
list_of_markov_chains = []
clusters = []

# creating clusters with given file
for x in range(0, train["cluster_label"].max()): # going through each label
    clusters.append(get_list(train.loc[train["cluster_label"] == x, "path"]))

for cluster in clusters: # saving each markov chain
    list_of_markov_chains.append(create_markov_chain_of_given_cluster(cluster))

In [38]:
# trial
calculate_prob_of_visit(get_list(test["path"].iloc[34]), list_of_markov_chains)

(0.9999712799388378, 35)

In [70]:
# experiment
accurate_estimation_prob = 0
false_estimation_prob = 0

total_accurate_prob = 0
total_false_prob = 0

# row size
row_size = len(test.axes[0])

# using each data point of given dataset as a new visit
for x in range(0, row_size):
    if test.iloc[x]["cluster_label"] >= 0: # if visits actual label >= 0, then continue (meaning it's labelled data)
        estimated_prob, estimated_label = calculate_prob_of_visit(get_list(test["path"].iloc[x]), list_of_markov_chains)
        correct_label = test.iloc[x]["cluster_label"]

        if estimated_label == correct_label:
            total_accurate_prob += estimated_prob
        else:
            total_false_prob += estimated_prob

accurate_estimation_prob = total_accurate_prob / row_size
false_estimation_prob = total_false_prob / row_size

print("Accurate prediction with prob.")
print(accurate_estimation_prob)
print("False prediction with prob.")
print(false_estimation_prob)

Accurate prediction with prob.
0.07589914179300772
False prediction with prob.
0.24371660763395178


In [40]:
"""
--- NEW DATA ---
- with data split -
---
with "**2", "**4"
Accurate prediction with prob.
0.16584946834774958
False prediction with prob.
0.1284352770748932
---
with "**4", "**8"
Accurate prediction with prob.
0.16603716939094812
False prediction with prob.
0.1284816459135557
---
without penalty
Accurate prediction with prob.
0.07589914179300772
False prediction with prob.
0.24371660763395178
---



- with data split -
---
with "**4", "**8"
Accurate prediction with prob.
0.2388692717237677
False prediction with prob.
0.43046396467076636
---
with "**2", "**4"
Accurate prediction with prob.
0.23843819277924
False prediction with prob.
0.43074425945854444
---
Accurate prediction with prob.
0.15161087259582942
False prediction with prob.
0.5060816486264718
---

- without data split -
---
with "**2", "**4"
Accurate prediction with prob.
0.23586278072337818
False prediction with prob.
0.43129438494374694
---
---
with "**4", "**8"
Accurate prediction with prob.
0.2361750165115125
False prediction with prob.
0.4311013386871281
---
---
without "**2", "**4"
Accurate prediction with prob.
0.15800201996288196
False prediction with prob.
0.5001289899211507
"""

'\n--- NEW DATA ---\n- with data split -\n\n\n\n\n\n- with data split -\n---\nwith "**4", "**8"\nAccurate prediction with prob.\n0.2388692717237677\nFalse prediction with prob.\n0.43046396467076636\n---\nwith "**2", "**4"\nAccurate prediction with prob.\n0.23843819277924\nFalse prediction with prob.\n0.43074425945854444\n---\nAccurate prediction with prob.\n0.15161087259582942\nFalse prediction with prob.\n0.5060816486264718\n---\n\n- without data split -\n---\nwith "**2", "**4"\nAccurate prediction with prob.\n0.23586278072337818\nFalse prediction with prob.\n0.43129438494374694\n---\n---\nwith "**4", "**8"\nAccurate prediction with prob.\n0.2361750165115125\nFalse prediction with prob.\n0.4311013386871281\n---\n---\nwithout "**2", "**4"\nAccurate prediction with prob.\n0.15800201996288196\nFalse prediction with prob.\n0.5001289899211507\n'