In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
class ProximalCues:
    """Lookup of proximal cues per page, backed by the URL reference CSV."""

    def __init__(self):
        # The 'pc' column is stored as the text form of a Python literal, so it
        # is parsed back into a Python object while the file is being read.
        self.urls = pd.read_csv(
            "../data/urls/url-reference_new.csv",
            converters={'pc': literal_eval},
        )

    def get_proximal_cues_by_id(self, id):
        """Return the 'pc' value of the first row whose 'id' column equals ``id``.

        Raises IndexError when no row matches.
        """
        matching_rows = self.urls.loc[self.urls.id == id]
        return matching_rows.pc.iloc[0]

In [3]:
import numpy as np


class TfIdf:
    """Access to the tf-idf matrix and the keyword vocabulary that labels it.

    Pages are addressed by row position in the tf-idf matrix. Keywords are
    addressed by keyword id, where keyword id ``k`` lives in matrix column
    ``k + 1`` because the first column holds the document id.
    """

    def __init__(self):
        # Loading in the tf-idf
        # Rows are documents, the first column is the document id
        # Columns are keywords, the first row is the keyword id
        self.tfidf = pd.read_csv("../data/tf_idf/tf_idf.csv")

        # Loading in the keywords
        # Two columns, column 1 is id, column 2 is keyword
        # We can access the weight in the tf-idf by first accessing the id number from the keyword file
        self.tfidf_keywords = pd.read_csv("../data/tf_idf/tf_idf_keywords.csv")
        self.tfidf_keywords.columns = ["id", "keyword"]

    def get_id_by_keyword(self, keyword):
        """Return the id of ``keyword``; raises IndexError if it is unknown."""
        # Select with a boolean mask instead of feeding index labels to .iloc,
        # so the lookup does not depend on the frame having a default index.
        matches = self.tfidf_keywords.loc[self.tfidf_keywords.keyword == keyword, 'id']
        return matches.iloc[0]

    def get_keyword_by_id(self, id):
        """Return the keyword stored under ``id``; raises IndexError if it is unknown."""
        return self.tfidf_keywords.loc[self.tfidf_keywords.id == id, 'keyword'].iloc[0]

    # Method to get the tfidf value of a keyword in a page
    def get_tf_idf_value(self, page_id, keyword):
        """Return the tf-idf weight of ``keyword`` in the page at row ``page_id``."""
        keyword_id = self.get_id_by_keyword(keyword)
        # +1 skips the document-id column at position 0.
        return self.tfidf.iloc[page_id, keyword_id + 1]

    def get_all_keywords_by_id(self, page_id):
        """Return the weights of every keyword for the page at row ``page_id``.

        The result is an array ordered by column position; the leading
        document-id column is left out.
        """
        # Slicing the columns inside .iloc keeps the document-id column from
        # influencing the dtype of the returned weights.
        return self.tfidf.iloc[page_id, 1:].values

    def get_all_keywords_by_id_normalized(self, page_id):
        """Return the page's keyword weights scaled so that they sum to 1.

        A page whose weights sum to zero yields an all-zero vector rather than
        the NaN vector a 0/0 division would produce.
        """
        keywords_weights = self.get_all_keywords_by_id(page_id)
        sum_value = keywords_weights.sum()
        if sum_value == 0:
            return np.zeros(len(keywords_weights))
        return keywords_weights / sum_value

    def get_number_of_keywords(self):
        """Return the number of rows in the keyword table."""
        return self.tfidf_keywords.shape[0]

In [4]:
class AdjacencyMatrix:
    """Page-to-page link matrix read from a header-less CSV file."""

    def __init__(self):
        # The file carries no header row or index column: every cell is data,
        # so positions in the frame can be addressed directly by page id.
        # Row = page_id_1, column = page_id_2; a 1 means row leads to column.
        matrix_file = "../data/matrices/adjacency_matrix.csv"
        self.adjacency_matrix = pd.read_csv(matrix_file, header=None)

    def get_adjacency_value(self, page_id_1, page_id_2):
        """Tell whether the cell at (page_id_1, page_id_2) equals 1."""
        cell = self.adjacency_matrix.iloc[page_id_1, page_id_2]
        return cell == 1

In [5]:
# Build the three data helpers once; each constructor reads its own CSV file,
# so the order in which they are created does not matter.
tfidf = TfIdf()
proximal_cues = ProximalCues()
adjacency_matrix = AdjacencyMatrix()

In [6]:
# Both path columns are stored as the text form of Python literals, so they are
# parsed back into Python objects while the click data is being read.
generic = literal_eval

conv = dict.fromkeys(('url_id_path', 'seconds_spent_path'), generic)
df = pd.read_csv(
    '../data/clickdata/dataNoUnscrapedVisitsOrUnder20Sec.csv',
    converters=conv,
)

# The parsed 'url_id_path' column, one entry per row of the click data.
paths = df['url_id_path']

I took the liberty of implementing my own version of IUNIS, which may (but does not necessarily) simplify the original slightly.

The input is the ordered list of page ids that the user has visited.
For each page, we take that page's TF-IDF values: a row with one entry per keyword, which is 0 if the keyword is not present on the page and a value with 0 < value <= 1 if it is.

Then, for each next page, we add the TF-IDF values onto the existing weights.

Eventually we can sort the list and we will have the keywords with the highest weight on top.

In [7]:
from pandas import DataFrame
def find_keyword_weights(path):
    """Accumulate keyword weights over the pages of a visit.

    For every page id in ``path`` the page's normalised tf-idf vector is added
    to a running total, which is then rescaled so that its largest entry is 1.

    Parameters
    ----------
    path : iterable of page ids, in visiting order.

    Returns
    -------
    DataFrame with a single 'weights' column, sorted from highest to lowest
    weight. The index is the position of the keyword in the weight vector.
    An empty ``path`` yields all-zero weights.
    """
    weights = np.zeros(tfidf.get_number_of_keywords())
    # NOTE(review): this changes a global pandas display option on every call.
    # It is kept so existing notebook output is unaffected, but it would be
    # better placed in a configuration cell.
    pd.options.display.max_rows = 0
    for page_id in path:
        page_weights = tfidf.get_all_keywords_by_id_normalized(page_id)
        # Treat non-finite entries (e.g. from a page whose tf-idf row sums to
        # zero) as "no contribution" so one such page cannot turn the whole
        # accumulator into NaN.
        weights += np.nan_to_num(page_weights, nan=0.0, posinf=0.0, neginf=0.0)
        peak = weights.max()
        # Rescale only when there is something to rescale; dividing an
        # all-zero vector by its maximum would be 0/0.
        if peak > 0:
            weights /= peak
    df = DataFrame(weights, columns=['weights'])
    return df.sort_values(by=['weights'], ascending=False)

In this model, I introduced a decay factor. Before each new page is added, all accumulated values are divided by 1.25, so that keywords from recently visited pages carry more weight.
This is a very simple implementation; we can experiment with changing the value 1.25, and also look at different implementations.

In [8]:
def find_keyword_weights_more_weight_on_recent_pages(path, decay=1.25):
    """Accumulate keyword weights over a visit, favouring recent pages.

    Before each page is added, the running total is divided by ``decay``, so
    the contribution of earlier pages shrinks with every further page. After
    each page the total is rescaled so that its largest entry is 1.

    Parameters
    ----------
    path : iterable of page ids, in visiting order.
    decay : positive number the accumulated weights are divided by before each
        new page is added. Defaults to 1.25; a value of 1 disables the decay.

    Returns
    -------
    DataFrame with a single 'weights' column, sorted from highest to lowest
    weight. The index is the position of the keyword in the weight vector.

    Raises
    ------
    ValueError
        If ``decay`` is not strictly positive.
    """
    if decay <= 0:
        raise ValueError("decay must be a positive number")
    weights = np.zeros(tfidf.get_number_of_keywords())
    # NOTE(review): this changes a global pandas display option on every call.
    # It is kept so existing notebook output is unaffected, but it would be
    # better placed in a configuration cell.
    pd.options.display.max_rows = 0
    for page_id in path:
        weights = weights / decay
        page_weights = tfidf.get_all_keywords_by_id_normalized(page_id)
        # Treat non-finite entries (e.g. from a page whose tf-idf row sums to
        # zero) as "no contribution" so one such page cannot turn the whole
        # accumulator into NaN.
        weights += np.nan_to_num(page_weights, nan=0.0, posinf=0.0, neginf=0.0)
        peak = weights.max()
        # Rescale only when there is something to rescale; dividing an
        # all-zero vector by its maximum would be 0/0.
        if peak > 0:
            weights /= peak
    df = DataFrame(weights, columns=['weights'])
    return df.sort_values(by=['weights'], ascending=False)

This function prints the `num_of_words` highest-weighted keywords together with their weights.

In [9]:
def print_top_weights_as_words(num_of_words, weights):
    """Print the first ``num_of_words`` rows of ``weights`` as 'keyword : weight'.

    ``weights`` is a DataFrame with a 'weights' column whose index values are
    keyword ids; each id is translated to its keyword through ``tfidf``. A
    blank separator is printed after the list.
    """
    leading = weights.head(num_of_words)
    # Walk ids and values side by side instead of keeping a manual counter.
    for keyword_id, weight in zip(leading.index, leading.weights.values):
        print(f"{tfidf.get_keyword_by_id(keyword_id)!s} : {weight!s}")
    print("\n")

The filter that keeps only page ids below 1556 gets rid of all useless pages (login, logout, search, and error).
Then, if more than 5 pages remain in the visit, the keywords are computed from the first 5 of those pages.

In [10]:
# Work on a filtered copy of each visit instead of rewriting the lists stored
# in `paths`: the loaded data stays intact for later cells, and re-running this
# cell prints the same output again.
for path in paths:
    # Page ids of 1556 and above are the useless pages (login, logout, search, error).
    content_pages = [page_id for page_id in path if page_id < 1556]
    if len(content_pages) > 5:
        # Only the first five remaining pages feed the keyword weights.
        print_top_weights_as_words(10, find_keyword_weights(content_pages[:5]))

bijverdienen : 1.0
gekregen : 0.875277017645266
jaaropgave : 0.8620659396774324
betaalspecificatie : 0.8205906418828691
nabetaling : 0.7961984147428294
buitenland : 0.5409923820516527
dekkingsgraad : 0.4982173030124242
speciaal : 0.4324878278328123
uitbetaling : 0.43123633353548013
laxxxxxxxxx : 0.428532850054372


samenstellen : 1.0
bericht : 1.0
samenvatting : 0.9161629120995465
kiezen : 0.9055389089669759
laxxxxxxxxx : 0.8211387538905037
ineen : 0.713910540255049
dhr : 0.7102145594665861
deeltijdpensioen : 0.6951830527567521
loonheffingskorting : 0.6578939461454301
aanpassen : 0.6325852885784838


bijspaarproducten : 1.0
samenvatting : 0.6348361530871923
kiezen : 0.6274744697118948
samenstellen : 0.5995137542351816
ineen : 0.49468955252214997
deeltijdpensioen : 0.48171272715256386
loonheffingskorting : 0.4558740114825088
aanpassen : 0.4383369003449479
pensioenleeftijd : 0.39416896495012904
tijdelijk : 0.3840670767096012


bericht : 1.0
bijverdienen : 1.0
gekregen : 0.875277017645266

KeyboardInterrupt: 

The following cells compute the weights after each successive page to see how they change throughout the visit.
We can do interesting experiments to see how the information need changes throughout the visit, and to find out at which page index we can best make our final prediction.

In [None]:
# Take the visit stored under label 12 and keep only page ids below 1556.
# Building a new list leaves the path stored in `paths` untouched.
path = [page_id for page_id in paths[12] if page_id < 1556]
print(path)

In [None]:
# Replay the visit one page at a time: for every prefix of `path` (first page,
# first two pages, ...) recompute the recency-biased keyword weights and print
# the ten highest-weighted keywords for that prefix.
for i in range(len(path)):
    # Prefix containing the first i+1 pages of the visit.
    subpath = path[:i+1]
    weights = find_keyword_weights_more_weight_on_recent_pages(subpath)
    print_top_weights_as_words(10, weights)

Idea for evaluation:

Look at the keywords at a specific time (we must make sure the weights are normalised; they are not in the implementation above), and compare them with a certain page near the end of the user visit to see if the user found what they needed.
We can use either the last page, or maybe the page that the user has spent the most time on.
The problem is that we want to evaluate the prediction of the information need, and there will be inaccuracies as the user data is not necessarily correct.
But if we can create a metric and compare different implementations with the same metric, we still have something useful.
