# Creating the Probability Matrix
The code below reads in the frequency matrix, which records how many times users travelled from one page to another. If you query the frequency matrix at [4, 7] and get 12, that means users travelled from the page with id 4 to the page with id 7 twelve times. To build the probability matrix, we sum each row of the frequency matrix, which gives the total number of times users left that page. Each entry in the row is then divided by that total. As a result, each row of the probability matrix sums to 1 (rows for pages with no recorded outgoing visits stay all zeros), and the value at [4, 7] becomes a decimal between 0 and 1. For example, if [4, 7] is 0.5, then 50% of the time a user visits page 4, they navigate to page 7 next.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read in the frequency matrix. header=None makes pandas treat the first line
# of the CSV as data rather than as column names.
freq_df = pd.read_csv('DATA/matrices/frequency_matrix.csv', header=None)

In [3]:
# Display the loaded frame for a quick visual check
freq_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5028,5029,5030,5031,5032,5033,5034,5035,5036,5037
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,213853,0,4007,9,863797,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1498547,0,0,0,3942,348,11253,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Convert the DataFrame to a 2-D NumPy array for row-wise processing
freq_matrix = freq_df.to_numpy()

In [35]:
# Method takes in frequency matrix, and outputs probability matrix
def get_prob_matrix(freq_matrix):
    """Convert a transition-frequency matrix into a row-normalised matrix.

    Each row of the result is the corresponding input row divided by that
    row's sum, so entry ``[i, j]`` is the fraction of row ``i``'s total that
    falls in column ``j``. Rows whose sum is zero are left as all zeros
    instead of dividing by zero.

    Args:
        freq_matrix: 2-D array-like (NumPy array or nested lists) of
            frequencies. It does not need to be square.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``freq_matrix``.
        An empty input yields an empty ``(0, 0)`` array.

    Raises:
        ValueError: If ``freq_matrix`` is non-empty and not 2-dimensional.
    """
    freq = np.asarray(freq_matrix)

    # An empty list converts to a 1-D array of size 0; treat it as "no rows".
    if freq.ndim == 1 and freq.size == 0:
        return np.zeros((0, 0))
    if freq.ndim != 2:
        raise ValueError(
            f"freq_matrix must be 2-dimensional, got {freq.ndim} dimension(s)"
        )

    # Accumulate in float64 so large integer counts cannot overflow a
    # platform-dependent integer type and fractional inputs are not truncated.
    # keepdims=True keeps shape (rows, 1) so it broadcasts across each row.
    row_totals = freq.sum(axis=1, keepdims=True, dtype=np.float64)

    # Start from zeros; the masked divide below only writes rows whose total
    # is non-zero, so rows with no outgoing traffic remain all zeros.
    prob_matrix = np.zeros(freq.shape, dtype=np.float64)
    np.divide(freq, row_totals, out=prob_matrix, where=row_totals != 0)
    return prob_matrix

In [37]:
# Small hand-made frequency matrix for sanity-checking get_prob_matrix:
# my_own = [[0, 3, 7],
#           [1, 0, 4],
#           [9, 13, 0]]

# Build the probability matrix from the real frequency data
prob_matrix = get_prob_matrix(freq_matrix)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
# Save to a CSV file via a DataFrame. index=False and header=False write only
# the raw values, with no row labels or column names.
prob_df = pd.DataFrame(prob_matrix)
prob_df.to_csv('DATA/matrices/probability_matrix.csv', index=False, header=False)