In [13]:
import pandas as pd
from ast import literal_eval

In [23]:
class ProximalCues:
    def __init__(self):
        generic = lambda x: literal_eval(x)

        conv = {'pc': generic}

        self.urls = pd.read_csv("data/urls/url_references_reduced.csv", converters=conv)

    # Get the proximal cues by page id
    def get_proximal_cues_by_id(self, id):
        return self.urls.loc[self.urls.id==id].pc.iloc[0]

In [24]:
import numpy as np


class TfIdf:
    def __init__(self):
        # Loading in the tf-idf
        # Rows are documents, the first column is the document id
        # Columns are keywords, the first row is the keyword id
        self.tfidf = pd.read_csv("data/tf_idf/tf_idf.csv")

        # Loading in the keywords
        # Two columns, column 1 is id, column 2 is keyword
        # We can access the weight in the tf-idf by first accessing the id number from the keyword file
        self.tfidf_keywords = pd.read_csv("data/tf_idf/tf_idf_keywords.csv")
        self.tfidf_keywords.columns = ["id", "keyword"]

    def get_id_by_keyword(self, keyword):
        return self.tfidf_keywords.id.iloc[self.tfidf_keywords[self.tfidf_keywords.keyword == keyword].index].values[0]


    def get_keyword_by_id(self, id):
        return self.tfidf_keywords.loc[self.tfidf_keywords.id==id, 'keyword'].iloc[0]
    # Method to get the tfidf value of a keyword in a page
    def get_tf_idf_value(self, page_id, keyword):
        keyword_id = self.get_id_by_keyword(keyword)
        return self.tfidf.iloc[page_id, keyword_id + 1]

    def get_all_keywords_by_id(self, page_id):
        keywords = self.tfidf.iloc[page_id][1:].values
        return keywords

    def get_all_keywords_by_id_normalized(self, page_id):
        keywords_weights = self.get_all_keywords_by_id(page_id)
        sum_value = sum(keywords_weights)
        keywords_weights_normalized = keywords_weights/sum_value
        return keywords_weights_normalized

    def get_number_of_keywords(self):
        return self.tfidf_keywords.shape[0]

In [25]:
class AdjacencyMatrix:
    def __init__(self):
        # Loading in the adjacency matrix
        # First column and row are data immediately, so access directly by page id
        # If row(page_id_1) leads to column(page_id_2) = 1 else = 0
        self.adjacency_matrix = pd.read_csv("data/matrices/adjacency_matrix.csv", header=None)

    # Method to get the adjacency value from two page ids
    def get_adjacency_value(self, page_id_1, page_id_2):
        return self.adjacency_matrix.iloc[page_id_1, page_id_2] == 1

In [27]:
adjacency_matrix = AdjacencyMatrix()
proximal_cues = ProximalCues()
tfidf = TfIdf()

In [62]:
generic = lambda x: literal_eval(x)

conv = {'url_id_path': generic,
        'seconds_spent_path': generic}
df = pd.read_csv('data/clickdata/dataNoUnscrapedVisitsOrUnder20Sec.csv', converters=conv)

paths = df.url_id_path
seconds = df.seconds_spent_path

In [63]:
paths.head(10)

0    [188, 1557, 3, 1, 13, 14, 21, 16, 14, 18, 14, ...
1                               [1557, 3, 1, 13, 1556]
2                                      [188, 194, 784]
3                                    [23, 1557, 3, 13]
4        [1557, 3, 1, 13, 1, 1559, 12, 1559, 17, 1556]
5                                      [186, 217, 186]
6                                                [188]
7                                [1557, 3, 13, 23, 13]
8                             [23, 1557, 23, 1557, 13]
9                                [1557, 1, 1556, 1557]
Name: url_id_path, dtype: object

In [64]:
seconds.head(10)

0    [4, 17, 5, 7, 31, 27, 126, 55, 9, 13, 3, 328, 5]
1                                   [14, 4, 4, 33, 0]
2                                          [5, 20, 0]
3                                    [0, 163, 4, 151]
4            [12, 14, 34, 66, 358, 9, 18, 19, 908, 0]
5                                        [14, 868, 0]
6                                                [40]
7                                  [71, 7, 751, 1, 9]
8                                  [1, 298, 1, 35, 6]
9                                    [27, 15, 12, 74]
Name: seconds_spent_path, dtype: object

I took the liberty to implement my own - perhaps but not necessarily slightly simplified - version of IUNIS.

Input is the ordered list of page id's that the user has visited.
For each page, take the TF-IDF values from that page, which is a row of keywords, with either 0 if the keyword is not present or 0<value<=1 if it is.

Then, for each next page, we add the TF-IDF values onto the existing weights.

Eventually we can sort the list and we will have the keywords with the highest weight on top.

In [31]:
from pandas import DataFrame
def find_keyword_weights(path, sorted):
    weights = np.zeros(tfidf.get_number_of_keywords())
    pd.options.display.max_rows = 0
    for id in path:
        weights += tfidf.get_all_keywords_by_id_normalized(id)
        weights /= max(weights)
    if sorted:
        df = DataFrame(weights, columns=['weights'])
        return df.sort_values(by=['weights'], ascending=False)
    return weights

In this model, I introduced a factor of decay. Essentially, every iteration, all values are divided by 1.25, meaning that recent keywords are more biased.
This is a very simple implementation, we can experiment with changing this 1.25, and also look at different implementations.

In [32]:
def find_keyword_weights_more_weight_on_recent_pages(path, decay_factor, sorted):
    weights = np.zeros(tfidf.get_number_of_keywords())
    pd.options.display.max_rows = 0
    for id in path:
        weights = weights/decay_factor
        weights += tfidf.get_all_keywords_by_id_normalized(id)
        weights /= max(weights)
    if sorted:
        df = DataFrame(weights, columns=['weights'])
        return df.sort_values(by=['weights'], ascending=False)
    return weights

In [92]:
def find_keyword_weights_more_weight_on_recent_pages_and_long_time(path, secs, decay_factor, sorted):
    weights = np.zeros(tfidf.get_number_of_keywords())
    pd.options.display.max_rows = 0
    for i in range(len(path)):
        weights = weights/decay_factor
        timeweight = secs[i]/sum(secs)
        print("Timeweight:", timeweight)
        weights += (tfidf.get_all_keywords_by_id_normalized(path[i])*timeweight)
        weights /= max(weights)
    if sorted:
        df = DataFrame(weights, columns=['weights'])
        return df.sort_values(by=['weights'], ascending=False)
    return weights

This is a method to print out the top 'num_of_words' weighted keywords.

In [72]:
def print_top_weights_as_words(num_of_words, weights):
    top_weights = weights.head(num_of_words)
    i=0
    for index in top_weights.index:
        print(str(tfidf.get_keyword_by_id(index)) + " : " + str(top_weights.weights.iloc[i]))
        i+=1
    print("\n")

Line 2 in this section gets rid of all useless pages (login, logout, search, and error)
Then, if the user has visited more than 5 pages, it will compute the keywords up until page 5.

In [73]:
def remove_keywordless_pages(path, seconds):
    indices_to_remove = []
    for i in range(len(path)):
        if path[i]>1555:
            indices_to_remove.append(i)
    for i in range(len(indices_to_remove)-1, -1, -1):
        index = indices_to_remove[i]
        path.pop(index)
        seconds.pop(index)
    return path, seconds

These sections computes the weights after each different page to see how it changes throughout the visit
We can do interesting experiments to see how information need changes throughout the visit, and to find out at which page index we can best make our prediction final

In [93]:
print(paths[0])
print(seconds[0])
path, secs = remove_keywordless_pages(paths[0], seconds[0])
print(path)
print(secs)


[188, 3, 1, 13, 14, 21, 16, 14, 18, 14, 5]
[4, 5, 7, 31, 27, 126, 55, 9, 13, 3, 328]
[188, 3, 1, 13, 14, 21, 16, 14, 18, 14, 5]
[4, 5, 7, 31, 27, 126, 55, 9, 13, 3, 328]


In [94]:
for i in range(len(path)):
    subpath = path[:i+1]
    weights = find_keyword_weights_more_weight_on_recent_pages(subpath, 1.25, True)
    print_top_weights_as_words(10, weights)

bijverdienen : 1.0
gekregen : 0.875277017645266
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124242
speciaal : 0.4324878278328123
uitbetaling : 0.43123633353548013
betaaldatums : 0.3672792338716384


bijverdienen : 1.0
gekregen : 0.8752770176452659
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124241
speciaal : 0.4324878278328123
uitbetaling : 0.4312363335354801
betaaldatums : 0.36727923387163836


bijverdienen : 1.0
gekregen : 0.8752770176452659
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
laxxxxxxxxx : 0.535666062567965
dekkingsgraad : 0.4982173030124241
dhr : 0.46330517813879507
speciaal : 0.4324878278328123


bijverdienen : 1.0
gekregen : 0.8752770176452659
ja

In [95]:
for i in range(len(path)):
    subpath = path[:i+1]
    subsecs = secs[:i+1]
    weights = find_keyword_weights_more_weight_on_recent_pages_and_long_time(subpath, subsecs, 1.25, True)
    print_top_weights_as_words(10, weights)

Timeweight: 1.0
bijverdienen : 1.0
gekregen : 0.875277017645266
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124242
speciaal : 0.4324878278328123
uitbetaling : 0.43123633353548013
betaaldatums : 0.3672792338716384


Timeweight: 0.4444444444444444
Timeweight: 0.5555555555555556
bijverdienen : 1.0
gekregen : 0.8752770176452659
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.820590641882869
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124241
speciaal : 0.4324878278328122
uitbetaling : 0.4312363335354801
betaaldatums : 0.36727923387163836


Timeweight: 0.25
Timeweight: 0.3125
Timeweight: 0.4375
bijverdienen : 1.0
gekregen : 0.8752770176452659
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124241
speciaal 

Idea for evaluation:

Look at the keywords at a specific time (we must make sure the weights are normalised, they are not in the implementation above), and compare it with a certain page near the end of the user visit to see if the user found their need.
We can do either the last page, or maybe the page that the user has spent the most time on.
The problem becomes that we want to evaluate the prediction of the information need, and there will be inaccuracies as the user data is not necessarily correct.
But if we can create a metric and we can compare different implementations with the same metric we still have something nice.


In [39]:
import math


def estimate_most_important_page(path, seconds):
    path = path[int(len(path)/2):]
    seconds = seconds[int(len(seconds)/2):]
    index = seconds.index(max(seconds))
    return path[index]

def find_keywords_of_estimated_most_important_page(path, seconds):
    id = estimate_most_important_page(path, seconds)
    return tfidf.get_all_keywords_by_id_normalized(id)

from numpy import dot
from numpy.linalg import norm
def similarity_estimated_information_need(information_need, keywords_most_important_page):
    def cosine_similarity(list_1, list_2):
        cos_sim = dot(list_1, list_2) / (norm(list_1) * norm(list_2))
        return cos_sim
    similarity = cosine_similarity(information_need, keywords_most_important_page)
    if math.isnan(similarity):
        return 0
    return similarity

In [40]:
def compute_similarities(subpath_size):
    count=0
    similarity_list = []
    for j in range(len(paths)):
        path, second = remove_keywordless_pages(paths[j], seconds[j])
        if count == 1000:
            break
        if len(path)>6:
            count+=1
            subpath = path[:subpath_size]
            weights = find_keyword_weights(subpath, False)
            similarity_list.append(similarity_estimated_information_need(weights, find_keywords_of_estimated_most_important_page(path, second)))
    return similarity_list

In [41]:
similarity_list_1 = compute_similarities(1)
similarity_list_2 = compute_similarities(2)
similarity_list_3 = compute_similarities(3)
similarity_list_4 = compute_similarities(4)
similarity_list_5 = compute_similarities(5)

  keywords_weights_normalized = keywords_weights/sum_value


In [42]:
print(np.mean(similarity_list_1))
print(np.mean(similarity_list_2))
print(np.mean(similarity_list_3))
print(np.mean(similarity_list_4))
print(np.mean(similarity_list_5))

0.07635014852139813
0.09595198661187265
0.11406294830471776
0.13827845053808233
0.16547565422871116


In [43]:
def compute_similarities_decay(decay_factor):
    count=0
    similarity_list = []
    for j in range(len(paths)):
        path, second = remove_keywordless_pages(paths[j], seconds[j])
        if count == 1000:
            break
        if len(path)>6:
            count+=1
            subpath = path[:3]
            weights = find_keyword_weights_more_weight_on_recent_pages(subpath, decay_factor, False)
            similarity_list.append(similarity_estimated_information_need(weights, find_keywords_of_estimated_most_important_page(path, second)))
    return similarity_list

In [44]:
avg_sim_list = []

for i in range(1,51):
    similarity_list = compute_similarities_decay(i)
    avg_sim_list.append(np.mean(similarity_list))

  keywords_weights_normalized = keywords_weights/sum_value


KeyboardInterrupt: 

In [None]:
avg_sim_list

In [None]:
from matplotlib import pyplot as plt
plt.plot(list(range(1,51)), avg_sim_list)
plt.xlabel("Factor of Decay")
plt.ylabel("Similarity")
plt.show()

In [None]:
avg_sim_list_no_decay =[]

for i in range(1,8):
    similarity_list = compute_similarities(i)
    avg_sim_list_no_decay.append(np.mean(similarity_list))

In [None]:
plt.plot(list(range(1,8)), avg_sim_list_no_decay)
plt.xlabel("Prediction page")
plt.ylabel("Similarity")
plt.show()