<a href="https://colab.research.google.com/github/MartaCampagnoli/PageRank-ProductCustomers/blob/main/PageRankDSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import itertools
from itertools import permutations
import tqdm
from tqdm.notebook import tqdm_notebook
import zipfile
import csv
import sys
from pprint import pprint

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "xxxxxx"
os.environ["SPARK_HOME"] = "xxxxxx"

import findspark
findspark.init("spark-3.3.2-bin-hadoop3")#SPARK_HOME
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()


# Functions


Auxiliary functions: 

*   **'indecesprod'** has been created to reindex products in the dataset for cleaner results when creating edges and to correctly apply the method used to count nodes in the algorithm. Needs the dataset to be ordered by values in 'product_parent' column;
*   **'tuple_list'** takes the 'customer_id' column and 'indexes' column created with the first function to create the list of tuples to be transformed into an rdd.



In [3]:
def indexesprod(d):
    index = []
    count = 0
    a = d['product_parent'][0] #takes first value in the column orered by value
    for row in tqdm_notebook(d.itertuples(), total=d.shape[0], desc=f'Reading DF'):
        if d['product_parent'].at[row.Index] == a:
            index.append(count)        
        else:       
            count = count + 1
            index.append(count)
            a = d['product_parent'].at[row.Index]
    return index

def tuple_list(d):
    rev = d['customer_id'] #reviewers
    prod = d['indexes'] #products
    revprod = list(zip(rev,prod))
    return revprod

# Dataset loading and Data preparation


In [4]:
os.environ['KAGGLE_USERNAME'] = "martacampagnoli"
os.environ['KAGGLE_KEY'] = "194348b64eada9bcca0d9127eae03b54"
!kaggle datasets download -d cynthiarempel/amazon-us-customer-reviews-dataset


amazon-us-customer-reviews-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
zip_file = zipfile.ZipFile('/content/amazon-us-customer-reviews-dataset.zip') #dataframe creation, adapted from Kaggle

instruments = pd.DataFrame({'marketplace': pd.Series([], dtype='str'),
              'customer_id': pd.Series([], dtype='str'),
              'review_id': pd.Series([], dtype='str'),
              'product_id': pd.Series([], dtype='str'),
              'product_parent': pd.Series([], dtype='str'),
              'product_title': pd.Series([], dtype='str'),
              'product_category': pd.Series([], dtype='str'),
              'star_rating': pd.Series([], dtype='int'),
              'helpful_votes': pd.Series([], dtype='int'),
              'total_votes': pd.Series([], dtype='int'),
              'vine': pd.Series([], dtype='str'),
              'verified_purchase': pd.Series([], dtype='str'),
              'review_headline': pd.Series([], dtype='str'),
              'review_body': pd.Series([], dtype='str'),
              'review': pd.Series([], dtype='str'),
              'review_date': pd.Series([], dtype='float')})

#choosing the product category
filenames = ['amazon_reviews_us_Musical_Instruments_v1_00.tsv']

tmp = []
for filename in filenames:
    with zip_file.open(filename) as f:
        data = pd.read_csv(f, sep='\t', header=0, quoting=csv.QUOTE_NONE)
        data['review'] = data['review_headline'] + ' ' + data['review_body']
        tmp.append(data) #solution to solve the issue of df.append being deprecated in a future version
        
instruments = pd.concat(tmp, ignore_index=True) 

print(instruments.shape)
instruments.head()

(904765, 16)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review
0,US,45610553,RMDCHWD0Y5OZ9,B00HH62VB6,618218723,AGPtek® 10 Isolated Output 9V 12V 18V Guitar P...,Musical Instruments,3,0,1,N,N,Three Stars,"Works very good, but induces ALOT of noise.",2015-08-31,"Three Stars Works very good, but induces ALOT ..."
1,US,14640079,RZSL0BALIYUNU,B003LRN53I,986692292,Sennheiser HD203 Closed-Back DJ Headphones,Musical Instruments,5,0,0,N,Y,Five Stars,Nice headphones at a reasonable price.,2015-08-31,Five Stars Nice headphones at a reasonable price.
2,US,6111003,RIZR67JKUDBI0,B0006VMBHI,603261968,AudioQuest LP record clean brush,Musical Instruments,3,0,1,N,Y,Three Stars,removes dust. does not clean,2015-08-31,Three Stars removes dust. does not clean
3,US,1546619,R27HL570VNL85F,B002B55TRG,575084461,Hohner Inc. 560BX-BF Special Twenty Harmonica,Musical Instruments,5,0,0,N,Y,I purchase these for a friend in return for pl...,I purchase these for a friend in return for pl...,2015-08-31,I purchase these for a friend in return for pl...
4,US,12222213,R34EBU9QDWJ1GD,B00N1YPXW2,165236328,Blue Yeti USB Microphone - Blackout Edition,Musical Instruments,5,0,0,N,Y,Five Stars,This is an awesome mic!,2015-08-31,Five Stars This is an awesome mic!


In [6]:
instrumentsdef = instruments[["customer_id","review_id", "product_id","product_parent","product_title","product_category"]] #dropping columns for easier visualization, could be skipped

In [7]:
instrumentsdef['custfreq'] = instrumentsdef.groupby('customer_id')['customer_id'].transform('count') #count how many times each reviewer appears in the dataset
instrumentsdef.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instrumentsdef['custfreq'] = instrumentsdef.groupby('customer_id')['customer_id'].transform('count') #count how many times each reviewer appears in the dataset


Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,custfreq
0,45610553,RMDCHWD0Y5OZ9,B00HH62VB6,618218723,AGPtek® 10 Isolated Output 9V 12V 18V Guitar P...,Musical Instruments,1
1,14640079,RZSL0BALIYUNU,B003LRN53I,986692292,Sennheiser HD203 Closed-Back DJ Headphones,Musical Instruments,7
2,6111003,RIZR67JKUDBI0,B0006VMBHI,603261968,AudioQuest LP record clean brush,Musical Instruments,1
3,1546619,R27HL570VNL85F,B002B55TRG,575084461,Hohner Inc. 560BX-BF Special Twenty Harmonica,Musical Instruments,1
4,12222213,R34EBU9QDWJ1GD,B00N1YPXW2,165236328,Blue Yeti USB Microphone - Blackout Edition,Musical Instruments,1


As confirmed by documentation, the SettingWithCopyWarning is most likely a False Positive. Using the following line of code:

`instrumentsdef.loc[:, 'custfreq'] = instrumentsdef.groupby('customer_id')['customer_id'].transform('count') `

yeilds the same result but gets the same warning error.


In [8]:
instrumentsdef = instrumentsdef[(instrumentsdef.custfreq != 1 )] #drop all reviewers that appear only once: they will not connect any two products
instrumentsdef

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,custfreq
1,14640079,RZSL0BALIYUNU,B003LRN53I,986692292,Sennheiser HD203 Closed-Back DJ Headphones,Musical Instruments,7
6,10225065,RL5LNO26GAVJ1,B009PJRMHQ,694166585,Kmise 1pc Pickguard for Gibson Sg Standard 3-p...,Musical Instruments,4
7,6356995,R3GYQ5W8JHP8SB,B00NKBDAZS,446431775,Kealoha Concert Ukulele - Stunning NEW Printed...,Musical Instruments,3
9,32139520,R14YLXA56NP51I,B000FIBD0I,771888534,Gator GPTBLACK Plywood Pedal Board with Black ...,Musical Instruments,10
10,36060782,R1ZH0HSH38IOTZ,B0002E52GG,68535945,Hetman 1 - Light Piston Lubricant Light,Musical Instruments,8
...,...,...,...,...,...,...,...
904746,49350719,R1KAOC3XU8XTPL,B00002DDSF,530155260,Yamaha PSR240 61-Note Touch-Sensitive Portable...,Musical Instruments,3
904747,49350719,R1WHFJ07JWFV61,B00002DDSN,575547555,Yamaha PSRD1 61-Note Dance and Techno Portable...,Musical Instruments,3
904748,50947605,R10FN99OXUBU4S,B00002F626,668653052,Yamaha PSS16 37-Note Portable Electronic Keyboard,Musical Instruments,2
904763,50522024,R1VYXC7FFG4LGH,B00002F2IZ,615125621,Yamaha DD9M Touch-Sensitive Digital Drums,Musical Instruments,2


In [9]:
instrumentsdef = instrumentsdef.sort_values("product_parent") #necessary for correct reindexing
instrumentsdef = instrumentsdef.reset_index(drop=True)
instrumentsdef.head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,custfreq
0,39745418,RIQS2872HL2BC,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3
1,11944622,R15K6L7ZKJMTW5,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,6
2,14433458,R1U50ENAWMW7JK,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,4
3,25384250,R20ZG1DH5DE9IP,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,2
4,14174875,RAHGU6QRL392J,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3


In [10]:
indexing = indexesprod(instrumentsdef) #reindexing products to more efficiently create edges

Reading DF:   0%|          | 0/463713 [00:00<?, ?it/s]

In [11]:
instrumentsdef['indexes'] = indexing
instrumentsdef.head(10)

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,custfreq,indexes
0,39745418,RIQS2872HL2BC,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3,0
1,11944622,R15K6L7ZKJMTW5,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,6,0
2,14433458,R1U50ENAWMW7JK,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,4,0
3,25384250,R20ZG1DH5DE9IP,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,2,0
4,14174875,RAHGU6QRL392J,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3,0
5,1822108,R3QURUAS4MSCXL,B006YV2N0Y,44637,Cannon MCC350 Large Round Wood Top Wind Chime,Musical Instruments,3,1
6,10311075,R1723HYZJ5OHJF,B006YV2N0Y,44637,Cannon MCC350 Large Round Wood Top Wind Chime,Musical Instruments,5,1
7,13612653,R2AKGHD6RAM5YL,B0049SXUY2,54127,MXL CR-24 Studio Condenser Kit,Musical Instruments,13,2
8,12065414,R34MMM30UD5HZS,B0049SXUY2,54127,MXL CR-24 Studio Condenser Kit,Musical Instruments,9,2
9,12323342,R1FBDV6H40KZXW,B002NO4DMC,101117,NEW PRO CHERRY SUNBURST ARCHTOP LP-777 ELECTRI...,Musical Instruments,7,3


In [12]:
nodes = max(indexing) + 1 #the number of nodes is 78063, adding 1 because the index starts from 0
nodes

78063

At the end of this brief preprocessing face, I create a list of tuples reviewer - product index to be transformed into an rdd and needed to create edges between products reviewed by the same customer and compute the PageRank index. 

Apart from final checks on the results of the algortith, the rest of the project is carried out in Spark through the application of MapReduce methods.

In [13]:
rp = tuple_list(instrumentsdef) #list of tuples
rp[0:20]

[(39745418, 0),
 (11944622, 0),
 (14433458, 0),
 (25384250, 0),
 (14174875, 0),
 (1822108, 1),
 (10311075, 1),
 (13612653, 2),
 (12065414, 2),
 (12323342, 3),
 (12126693, 4),
 (20808922, 5),
 (42109453, 5),
 (12766095, 5),
 (27835258, 5),
 (47259347, 5),
 (51360783, 5),
 (24741649, 5),
 (31452449, 5),
 (22949585, 6)]

In [14]:
#size of list of tuple
print("Size of list: " + str(sys.getsizeof(rp)/1e6) + " Megabytes")

Size of list: 4.167352 Megabytes




# Edges and PageRank step by step


In [15]:
sc = spark.sparkContext

In [16]:
rdd = sc.parallelize(rp).cache()

In order to create edges, I use a MapReduce process on the list of tuples of reviewers (key) and reviewed product (value). By grouping by key, I find a list of  reviews for each reviewer, and then I apply a permutation function on the list to create all possible edges.



In [17]:
edges =  rdd.groupByKey().flatMap(lambda x : (itertools.permutations(x[1], 2))).sortByKey() #edges

Before defining a PageRank algorithm, I show the main steps used to obtain the necessary elements to run it.

In [18]:
total_prod = max(edges.max(lambda x:x[0])[0],edges.max(lambda x:x[1])[1]) + 1 #index starts from zero, we are counting the actual number of nodes so we add 1
print(total_prod)

78063


In [19]:
id2degree = edges.countByKey()
id2degree[0],id2degree[1],id2degree[2] #outdegree of each node 

(13, 6, 20)

In [20]:
connection_matrix = edges.map(lambda x:(x[1],x[0],1/id2degree[x[0]])) #connection matrix (i,j,mij)
connection_matrix.take(20)

[(58314, 0, 0.07692307692307693),
 (64026, 0, 0.07692307692307693),
 (20748, 0, 0.07692307692307693),
 (29261, 0, 0.07692307692307693),
 (44894, 0, 0.07692307692307693),
 (48571, 0, 0.07692307692307693),
 (55869, 0, 0.07692307692307693),
 (8048, 0, 0.07692307692307693),
 (10759, 0, 0.07692307692307693),
 (42035, 0, 0.07692307692307693),
 (8213, 0, 0.07692307692307693),
 (24585, 0, 0.07692307692307693),
 (39937, 0, 0.07692307692307693),
 (25670, 1, 0.16666666666666666),
 (48142, 1, 0.16666666666666666),
 (17940, 1, 0.16666666666666666),
 (24149, 1, 0.16666666666666666),
 (55980, 1, 0.16666666666666666),
 (60467, 1, 0.16666666666666666),
 (4435, 2, 0.05)]

In [21]:
page_rank = np.full((total_prod,), 1/(total_prod)) #to be kept in main memory. vector of initial probabilities distribution
page_rank[:10]

array([1.28101661e-05, 1.28101661e-05, 1.28101661e-05, 1.28101661e-05,
       1.28101661e-05, 1.28101661e-05, 1.28101661e-05, 1.28101661e-05,
       1.28101661e-05, 1.28101661e-05])

In [22]:
old_page_rank = np.ones(total_prod) #ensures the loop is started

# PageRank: Algorithms


The elements described in the previous section are now used in three different functions which only need to receive the 'edges' file to perform all the steps previously illustrated:

1.   **'get_page_rank_iteration'**, based exclusively on a set number of iterations, used as a first test for the functioning of the code. While the iterations are set to 50, the value can be changed for a faster test or a more accurate result;
2.   **'get_page_rank_distance'**, where the algorithm stops when the euclidean distance between the vector of old values of the page rank and the updated vector is lower than a given threshold;
3.   **'get_page_rank_taxation'**, an implementation of the taxation (or teleport) variation of the algorithm; 
2.   **'l2distance'**, a function computing the Euclidean distance of two vectors.


Code for the algorithms and distance function has been adapted for this project from the solutions proposed during the curse and tutoring lessons.

The functions could also be modified to receive the parallelized tuples list instead and perform edges creation inside of it.





In [23]:
def get_page_rank_iteration(rdd, verbose=False):

    
    total_prod = max(rdd.max(lambda x:x[0])[0],rdd.max(lambda x:x[1])[1]) + 1
    id2degree = rdd.countByKey()
    
    page_rank = np.full((total_prod,), 1/(total_prod))
    old_page_rank = np.ones(total_prod)

    connection_matrix = rdd.map(lambda x:(x[1],x[0],1/id2degree[x[0]]))
    
    
    for iteration in range(50): #to change for less or more iterations
        old_page_rank = page_rank
        page_rank_values = (connection_matrix
                            .map(lambda t: (t[0], t[2]*page_rank[t[1]]))
                            .reduceByKey(lambda a, b: a+b)
                            .sortByKey()
                            .collect()
                           )
        page_rank = np.array([c
                              for (i, c) in page_rank_values])
            
        if verbose:
            print(page_rank)

        iteration += 1
        
    print('{} iterations'.format(iteration))
    
    return page_rank

In [24]:
simple_rank = get_page_rank_iteration(edges)

50 iterations


In [25]:
simple_rank

array([3.71505108e-06, 1.71350116e-06, 5.71150805e-06, ...,
       2.85570982e-07, 2.57040208e-06, 6.56685570e-06])

In [26]:
#sorting of rank, displaying PR value and index of node (product)
for product in list(zip(simple_rank.argsort()[::-1], simple_rank[simple_rank.argsort()[::-1]]))[:10]:
    print(f"pr: {product[1]}, product: {(product[0])}")

pr: 0.0029756904344108645, product: 26012
pr: 0.0027317160968084743, product: 15398
pr: 0.0025631570512565493, product: 34129
pr: 0.002530913485544629, product: 32831
pr: 0.002310719495768728, product: 58553
pr: 0.0019120565771037015, product: 42359
pr: 0.0018917481825906904, product: 50799
pr: 0.0018198080148849728, product: 15633
pr: 0.0017024009002907475, product: 38191
pr: 0.0016219379151427601, product: 77448


In [27]:
pprint(instrumentsdef[instrumentsdef['indexes'] == 26012 ][['product_title','indexes']][:1]) #highest rank

       product_title  indexes
154472   JOYO PEDALS    26012


In [28]:
pprint(instrumentsdef[instrumentsdef['indexes'] == 15398 ][['product_title','indexes']][:1])

          product_title  indexes
94019  Snark SN-1 Tuner    15398


In [29]:
pprint(instrumentsdef[instrumentsdef['indexes'] == 34129][['product_title','indexes']][:1])

                   product_title  indexes
204838  Ernie Ball slinky nickel    34129


In [30]:
pprint(instrumentsdef[instrumentsdef['indexes'] == 32831][['product_title','indexes']][:1])

                                           product_title  indexes
195220  Planet Waves Pro-Winder String Winder and Cutter    32831


In [31]:
pprint(instrumentsdef[instrumentsdef['indexes'] == 58553][['product_title','indexes']][:1])

                                            product_title  indexes
350449  Elixir Strings 80/20 Bronze Acoustic Guitar St...    58553


In [32]:
def l2distance(v, q):
    
    if len(v) != len(q):
        raise ValueError('Cannot compute the distance'
                         ' of two vectors of different size')
    
    return sum([(q_el - v_el)**2 for v_el, q_el in zip(v, q)])

In [33]:
def get_page_rank_distance(rdd, 
                  max_iterations=400, tolerance=1.e-8, verbose=False): #se non funziona fare alla -7 -8 ecc

    
    total_prod = max(rdd.max(lambda x:x[0])[0],rdd.max(lambda x:x[1])[1]) + 1
    id2degree = rdd.countByKey()
    
    page_rank = np.full((total_prod,), 1/(total_prod))
    old_page_rank = np.ones(total_prod)

    connection_matrix = rdd.map(lambda x:(x[1],x[0],1/id2degree[x[0]]))
    

    iteration = 0
    while l2distance(old_page_rank, page_rank) >= tolerance and \
          iteration < max_iterations:
        old_page_rank = page_rank
        page_rank_values = (connection_matrix
                            .map(lambda t: (t[0], t[2]*page_rank[t[1]]))
                            .reduceByKey(lambda a, b: a+b)
                            .sortByKey()
                            .collect()
                           )
        page_rank = np.array([c for (i, c) in page_rank_values])
            
        if verbose:
            nice_print(page_rank)

        iteration += 1

    print('{} iterations'.format(iteration))
    
    return page_rank

In [34]:
prdistance = get_page_rank_distance(edges)

21 iterations


In [35]:
prdistance

array([3.75885748e-06, 1.71425738e-06, 5.73437816e-06, ...,
       2.86855277e-07, 2.58325209e-06, 6.50733668e-06])

In [36]:
#sorting of rank, displaying PR value and index of node (product)
for product in list(zip(prdistance.argsort()[::-1], prdistance[prdistance.argsort()[::-1]]))[:10]:
    print(f"pr: {product[1]}, product: {(product[0])}")

pr: 0.0029913399160774556, product: 26012
pr: 0.0027472324132232, product: 15398
pr: 0.002576945790401992, product: 34129
pr: 0.002544992787209787, product: 32831
pr: 0.0023243036009685707, product: 58553
pr: 0.0019246177392081396, product: 42359
pr: 0.0019020018090063827, product: 50799
pr: 0.0018301189652971815, product: 15633
pr: 0.0017117720570050752, product: 38191
pr: 0.001629897133761123, product: 77448


In [37]:
def get_page_rank_taxation(rdd, beta=0.85,
                  max_iterations=400, tolerance=1.e-10, verbose=False):

    
    total_prod = max(rdd.max(lambda x:x[0])[0],rdd.max(lambda x:x[1])[1]) + 1
    id2degree = rdd.countByKey()
    
    page_rank = np.full((total_prod,), 1/(total_prod))
    old_page_rank = np.ones(total_prod)

    connection_matrix  = rdd.map(lambda x:(x[1],x[0],1/id2degree[x[0]]))
  
    
    iteration = 0
    while l2distance(old_page_rank, page_rank) >= tolerance and \
          iteration < max_iterations:
        old_page_rank = page_rank
        page_rank_values = (connection_matrix
                            .map(lambda t: (t[0], t[2]*page_rank[t[1]]))
                            .reduceByKey(lambda a, b: a+b)
                            .sortByKey()
                            .collect()
                           )
        page_rank = np.array([beta*c + (1-beta)/(total_prod)
                              for (i, c) in page_rank_values])
            
        if verbose:
            print(page_rank)

        iteration += 1
        
    print('{} iterations'.format(iteration))
    
    return page_rank

In [38]:
prtax = get_page_rank_taxation(edges)

16 iterations


In [39]:
prtax

array([9.05558550e-06, 3.60473367e-06, 6.65642982e-06, ...,
       2.22301469e-06, 3.73321794e-06, 8.84652921e-06])

In [40]:
#sorting of rank, displaying PR value and index of node (product)
for product in list(zip(prtax.argsort()[::-1], prtax[prtax.argsort()[::-1]]))[:10]:
    print(f"pr: {product[1]}, product: {(product[0])}")

pr: 0.002320725883481306, product: 15398
pr: 0.002317047863168403, product: 26012
pr: 0.0020025939825207687, product: 34129
pr: 0.0019827109283405493, product: 32831
pr: 0.0018969709944813358, product: 58553
pr: 0.0016924879657939172, product: 42359
pr: 0.0016082182167982783, product: 50799
pr: 0.001467785056985196, product: 15633
pr: 0.0014610345911296428, product: 38191
pr: 0.001350611468379184, product: 77448


# Product Frequency: a check

PageRank is considered a "popularity" index. When speaking about products, we can think that the most popular ones are also the ones that get reviewed more. It is reasonable to think that the more a product appears in the dataset, the higher the chance that it will have a consistent number of ingoing links, thus being well connected and raising its PageRank values. 

This last part of the project checks wheter the most frequent products correspond to the products with higher PageRank values.



In [41]:
instrumentsdef['prodcount'] = instrumentsdef.groupby('product_parent')['product_parent'].transform('count') #count number of times the product appears in the data
instrumentsdef.head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,product_category,custfreq,indexes,prodcount
0,39745418,RIQS2872HL2BC,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3,0,5
1,11944622,R15K6L7ZKJMTW5,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,6,0,5
2,14433458,R1U50ENAWMW7JK,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,4,0,5
3,25384250,R20ZG1DH5DE9IP,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,2,0,5
4,14174875,RAHGU6QRL392J,B003S7GAT2,27657,Remo RH3100-00 3-Piece Drum Set Multi-colored ...,Musical Instruments,3,0,5


In [42]:
instrumentsdef.loc[instrumentsdef['prodcount'].idxmax()] #product with highest count

customer_id                    10555415
review_id                R2FEM1861T5RQO
product_id                   B003VWJ2K8
product_parent                195373332
product_title          Snark SN-1 Tuner
product_category    Musical Instruments
custfreq                              4
indexes                           15398
prodcount                          1964
Name: 94019, dtype: object

In [43]:
instrumentsdef = instrumentsdef.sort_values("prodcount", ascending=False) #and manually
pprint(instrumentsdef[['product_title','indexes']][:1])

          product_title  indexes
94659  Snark SN-1 Tuner    15398


In [44]:
pprint(instrumentsdef[['product_title','indexes']][1965:1966])

                                            product_title  indexes
351364  Elixir Strings 80/20 Bronze Acoustic Guitar St...    58553


In [45]:
pprint(instrumentsdef[['product_title','indexes']][3340:3341])

                                            product_title  indexes
255970  Snark SN-2 All Instrument Clip-On Chromatic Tuner    42359


In [46]:
pprint(instrumentsdef[['product_title','indexes']][4687:4688])

       product_title  indexes
230042         Snark    38191


In [47]:
pprint(instrumentsdef[['product_title','indexes']][5922:5923])

                                            product_title  indexes
459804  Musician's Gear Electric, Acoustic and Bass Gu...    77448


# Addendum: file sizes and number of edges


In [48]:
 edges = edges.collect()

In [49]:
edges[0:20] #showing the edges 

[(0, 58314),
 (0, 64026),
 (0, 20748),
 (0, 29261),
 (0, 44894),
 (0, 48571),
 (0, 55869),
 (0, 8048),
 (0, 10759),
 (0, 42035),
 (0, 8213),
 (0, 24585),
 (0, 39937),
 (1, 25670),
 (1, 48142),
 (1, 17940),
 (1, 24149),
 (1, 55980),
 (1, 60467),
 (2, 4435)]

In [50]:
#size of the edges file
print("Size of rdd: " + str(sys.getsizeof(edges)/1e6) + " Megabytes")

Size of rdd: 27.436344 Megabytes


In [51]:
len(edges) #number of edges

3377442

In [52]:
setedges = set(edges) #there exist duplicate edges
len(setedges) 

2995259

In [53]:
page_rank = np.full((total_prod,), 1/(total_prod))

In [54]:
#size of the initial vector of probabilities
print("Size of vector: " + str(sys.getsizeof(page_rank)/1e6) + " Megabytes")

Size of vector: 0.624616 Megabytes


In [55]:
#size of vector of final results
print("Size of vector: " + str(sys.getsizeof(simple_rank)/1e6) + " Megabytes")

Size of vector: 0.624616 Megabytes
