In [61]:
import pandas as pd
from ast import literal_eval

I dislike classes in python but I think it makes sense to make these classes.
I would put the adj matrix in the tfidf but that would just create confusion

In [62]:
class ProximalCues:
    def __init__(self):
        generic = lambda x: literal_eval(x)

        conv = {'pc': generic}

        self.urls = pd.read_csv("../data/urls/url-reference_new.csv", converters=conv)

    # Get the proximal cues by page id
    def get_proximal_cues_by_id(self, id):
        return self.urls.loc[self.urls.id==id].pc.iloc[0]

In [63]:
import numpy as np


class TfIdf:
    def __init__(self):
        # Loading in the tf-idf
        # Rows are documents, the first column is the document id
        # Columns are keywords, the first row is the keyword id
        self.tfidf = pd.read_csv("../data/tf_idf/tf_idf.csv")

        # Loading in the keywords
        # Two columns, column 1 is id, column 2 is keyword
        # We can access the weight in the tf-idf by first accessing the id number from the keyword file
        self.tfidf_keywords = pd.read_csv("../data/tf_idf/tf_idf_keywords.csv")
        self.tfidf_keywords.columns = ["id", "keyword"]

    def get_id_by_keyword(self, keyword):
        return self.tfidf_keywords.id.iloc[self.tfidf_keywords[self.tfidf_keywords.keyword == keyword].index].values[0]


    def get_keyword_by_id(self, id):
        return self.tfidf_keywords.loc[self.tfidf_keywords.id==id, 'keyword'].iloc[0]
    # Method to get the tfidf value of a keyword in a page
    def get_tf_idf_value(self, page_id, keyword):
        keyword_id = self.get_id_by_keyword(keyword)
        return self.tfidf.iloc[page_id, keyword_id + 1]

    def get_all_keywords_by_id(self, page_id):
        keywords = self.tfidf.iloc[page_id][1:].values
        return keywords

    def get_all_keywords_by_id_normalized(self, page_id):
        keywords_weights = self.get_all_keywords_by_id(page_id)
        sum_value = sum(keywords_weights)
        keywords_weights_normalized = keywords_weights/sum_value
        return keywords_weights_normalized

    def get_number_of_keywords(self):
        return self.tfidf_keywords.shape[0]

In [64]:
class AdjacencyMatrix:
    def __init__(self):
        # Loading in the adjacency matrix
        # First column and row are data immediately, so access directly by page id
        # If row(page_id_1) leads to column(page_id_2) = 1 else = 0
        self.adjacency_matrix = pd.read_csv("../data/matrices/adjacency_matrix.csv", header=None)

    # Method to get the adjacency value from two page ids
    def get_adjacency_value(self, page_id_1, page_id_2):
        return self.adjacency_matrix.iloc[page_id_1, page_id_2] == 1

In [65]:
adjacency_matrix = AdjacencyMatrix()
proximal_cues = ProximalCues()
tfidf = TfIdf()

The input to the model should be a list of documents and the order in which they were visited.
Luckily this is exactly what we have.

In [66]:
generic = lambda x: literal_eval(x)

conv = {'url_id_path': generic,
        'seconds_spent_path': generic}
df = pd.read_csv('../data/clickdata/testGrouped.csv', converters=conv)

paths = df.url_id_path

We have one visit path here:

In [76]:
from pandas import DataFrame
def find_keyword_weights(path):
    weights = np.zeros(tfidf.get_number_of_keywords())
    pd.options.display.max_rows = 0
    for id in path:
        weights += tfidf.get_all_keywords_by_id_normalized(id)

    df = DataFrame(weights, columns=['weights'])
    return df.sort_values(by=['weights'], ascending=False)

# TODO: find a way to put more weight on recent pages
def find_keyword_weights_more_weight_on_recent_pages(path):
    weights = np.zeros(tfidf.get_number_of_keywords())
    pd.options.display.max_rows = 0
    for id in path:
        weights += tfidf.get_all_keywords_by_id_normalized(id)

    df = DataFrame(weights, columns=['weights'])
    return df.sort_values(by=['weights'], ascending=False)

Idea: for each page visited, find the weights of the keywords


In [77]:
for path in paths:
    path[:] = [x for x in path if x < 1556]
    if len(path)>5:
        path[:] = path[5:]
        top_ten_weights = find_keyword_weights(path).head(10)
        print("Keywords computed:")
        i=0
        for index in top_ten_weights.index:
            print(str(tfidf.get_keyword_by_id(index)) + " : " + str(top_ten_weights.weights.iloc[i]))
            i+=1
        print("\n")

Keywords computed:
samenwonen : 0.050130126456266716
samenlevingscontract : 0.04701830951287108
pensioenvoorstel : 0.03175212603543943
keuzehulp : 0.030554290306610363
expartner : 0.029235857691412542
uitgebreid : 0.02740035907531143
lee : 0.025287096766019285
boswachter : 0.025163500140071408
excuses : 0.024722601560457486
aangemeld : 0.02371275011712724


Keywords computed:
excuses : 0.34257657836359534
overzicht : 0.2558516439267637
inzicht : 0.255028314053615
onze : 0.14654346365602605
aa : 0.0
posttraumatische : 0.0
potentie : 0.0
potential : 0.0
pot : 0.0
postzegel : 0.0


Keywords computed:
bericht : 0.5
laxxxxxxxxx : 0.4820994563111685
dhr : 0.4169746603249156
mijnabp : 0.2259258833639158
waardeoverdracht : 0.06709860751171397
naam : 0.030754133481599645
vul : 0.030243205497987133
beslist : 0.02880054459855882
vorig : 0.0287009377916239
vraag : 0.019946348836137617


Keywords computed:
excuses : 0.44965790325133287
bericht : 0.35799261705709523
onze : 0.19234947969157198
aa : 0

  keywords_weights_normalized = keywords_weights/sum_value


Keywords computed:
aa : nan
aaa : nan
aafje : nan
aambeien : nan
aan : nan
aanbesteding : nan
aanbeveling : nan
aanbevelingen : nan
aanbevolen : nan
aanbieden : nan


Keywords computed:
bericht : 0.50000179952701
laxxxxxxxxx : 0.4764699728907816
dhr : 0.41210729606553287
mijnabp : 0.21337961648056977
toestemming : 0.03386249803086842
mx : 0.02613527244078497
welkom : 0.02175710358918654
pensioencoach : 0.020447955413524437
overzicht : 0.018130600209761052
inzicht : 0.018072258282651566


Keywords computed:
aanvragen : 0.29996223870339717
overzicht : 0.26815633149967233
periode : 0.24485720466432032
pensioenopbouw : 0.2353776101289802
aow : 0.10075507597766742
samenstellen : 0.09656734146384446
netto : 0.07645200886688652
tegelijk : 0.07374068662783206
geniet : 0.062177178224927465
plan : 0.061766749858961516


Keywords computed:
voorwaarde : 0.05996031431500197
nabestaandenpensioen : 0.047509978633980064
abpwerkgever : 0.031482204403703806
expartner : 0.029613587394693677
afziet : 0.02

KeyboardInterrupt: 

sessie
helppagina
gegeven
uitloggen
aug
menu
uitgelogd
seconden
hoofdcontent
verlengen
