<a href="https://colab.research.google.com/github/MaximL98/CrawlingInMyProtein.github.io/blob/master/HW03/CrawlingInMP_HubAndAuthority_Calculations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crawling in My Protein: by Crawling In My Skin™

In this colab file, we will implement a basic crawler for HW2


In [None]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

This is the website link we will scrape

In [None]:
mp_url = "https://www.myprotein.co.il/nutrition/protein.list"

# Use requests to retrieve data from a given URL
mp_response = requests.get(mp_url)

# Parse the whole HTML page using BeautifulSoup
mp_soup = BeautifulSoup(mp_response.text, 'html.parser')


**Traverse the page https://www.myprotein.co.il/nutrition/protein.list**

For each item in that page, get the name of the product and it's link


In [None]:
mp_main = 'https://www.myprotein.co.il'

all_products = mp_soup.find('ul', {'class': 'productListProducts_products'})
product_list = all_products.find('li', {'class': 'productListProducts_product'})

# If the 'ul' element is found, proceed with extracting product information
if all_products:
  # These 'li' elements likely represent individual products
  product_list = all_products.find_all('li')
  # Initialize empty lists to store product links and names
  links = []
  product_names = []
  # Loop through each product 'li' element
  for li in product_list:
    a_tag_array = li.findAll('a', {'class': 'productBlock_link'})
    a_tag = a_tag_array[1]
    # If the 'a' element exists, extract and store the product link
    if a_tag:
      links.append(mp_main + a_tag['href'])
      h3_tag = a_tag.find('h3', {'class': 'productBlock_productName'})
      # If the 'h3' element exists, extract and store the product name
      if h3_tag:
        # Remove any leading or trailing whitespace from the extracted product name
        product_names.append(h3_tag.text.strip())
print(product_names)
print(links)

['Impact Whey Protein Powder', 'Impact Whey Isolate Powder', 'Clear Whey Protein Powder', 'Clear Vegan Protein', 'Impact Vegan Protein', 'Impact Soy Protein', 'Impact Weight Gainer', 'Impact Pea Protein', 'Impact Diet Whey', 'Whey Forward Isolate', 'THE Whey', 'Impact Casein Powder', 'Plant Protein Superblend', 'Total Protein Blend', 'Collagen Protein Powder', 'Clear Whey Hydrate', 'Protein Meal Replacement Blend', 'Clear Whey Diet', 'Clear Collagen Protein Powder', 'Clear Vegan Protein (Sample)', 'Breakfast Smoothie', 'Clear Whey Protein (Sample)', 'Clear Weight Gainer', 'Protein Hot Chocolate', 'Hydrolysed Whey Protein Powder', 'THE Diet™', 'Vegan Protein Blend (Sample)', 'All-in-One Recovery', 'Clear Soy Protein', 'Soy Protein Isolate (Sample)']
['https://www.myprotein.co.il/sports-nutrition/impact-whey-protein-powder/10530943.html', 'https://www.myprotein.co.il/sports-nutrition/impact-whey-isolate-powder/10530911.html', 'https://www.myprotein.co.il/sports-nutrition/clear-whey-prote

For each item, find the div which contains the ingredients the product is made of.

In [None]:
# This class represents a node in a network for the PageRank algorithm.
class Node:
  # Initializes a new Node object.
  def __init__(self, id):
    self.id = id
    self.auth_score = 1 # Authority score (initially set to 1)
    self.hub_score = 1  # Hub score (initially set to 1)
    self.connected_to_me = [] # List of nodes that point to this node (incoming links)
    self.connected_to_them = [] # List of nodes this node points to (outgoing links)
    self.page_rank_score = 1 # PageRank score (initially set to 1)

  def __str__(self):
    """
    Returns a string representation of the node's ID.
    """
    return str(self.id)

  def get_id(self):
    """
    Returns the node's ID.
    """
    return self.id

  def get_page_rank_score(self):
    """
    Returns the node's current PageRank score.
    """
    return self.page_rank_score

  def set_page_rank_score(self, page_rank_score):
    """
    Sets the node's PageRank score.

    Args:
      page_rank_score: The new PageRank score for the node.
    """
    self.page_rank_score = page_rank_score

  def increment_auth_score(self, hub_score):
    """
    Increments the node's authority score by a given amount.

    Args:
      hub_score: The amount to add to the authority score.
    """
    self.auth_score += hub_score

  def set_auth_score(self, auth_score):
    """
    Sets the node's authority score to a specific value.

    Args:
      auth_score: The new authority score for the node.
    """
    self.auth_score = auth_score

  def increment_hub_score(self, auth_score):
    """
    Increments the node's hub score by a given amount.

    Args:
      auth_score: The amount to add to the hub score. (Note: might be misleading to use auth_score here)
    """
    self.hub_score += auth_score

  def get_auth_score(self):
    """
    Returns the node's current authority score.
    """
    return self.auth_score

  def get_hub_score(self):
    """
    Returns the node's current hub score.
    """
    return self.hub_score

  def set_hub_score(self, hub_score):
    """
    Sets the node's hub score to a specific value.

    Args:
      hub_score: The new hub score for the node.
    """
    self.hub_score = hub_score

  def add_connection_to_me(self, nodelist):
    """
    Adds a list of nodes to the list of nodes that point to this node (incoming links).

    Args:
      nodelist: A list of Node objects representing incoming connections.
    """
    for node in nodelist:
      if node not in self.connected_to_me:
        self.connected_to_me.append(node)

  def add_connection_to_them(self, nodelist):
    """
    Adds a list of nodes to the list of nodes this node points to (outgoing links).

    Args:
      nodelist: A list of Node objects representing outgoing connections.
    """
    for node in nodelist:
      if node not in self.connected_to_them:
        self.connected_to_them.append(node)

  def get_connections_to_me(self):
    """
    Returns the list of nodes that point to this node (incoming links).
    """
    return self.connected_to_me

  def get_connections_to_them(self):
    """
    Returns the list of nodes this node points to (outgoing links).
    """
    return self.connected_to_them

In [None]:
def get_item_id(href):
  return href.split('/')[-1].split('.')[0]

In [None]:
productNumbers = []

for link in links:
  # Use requests to retrieve data from a given URL
  mp_response = requests.get(link)
  # Parse the whole HTML page using BeautifulSoup
  mp_soup = BeautifulSoup(mp_response.text, 'html.parser')


  # Get links to the recommended products
  node_links = mp_soup.findAll('a', {'class': 'productBlock_link'})
  # Create a set of links for each product
  recommended_items_set = set()
  # For each div of products in the recommended items, extract only the 'href' link
  for href in enumerate(node_links):
    recommended_items_set.add(href[1]['href'])

  # For each 'href' link in the recommended items, extract its ID
  # Example: href="/sports-nutrition/essential-omega-3/10529329.html?rctxt=default" turns into href="10529329"
  recommended_after_id = []
  for recommended in recommended_items_set:
    recommended_after_id.append(get_item_id(recommended))
  # Append the set to the productNumbers list, each index is the product number
  productNumbers.append(recommended_after_id)

print("List of recommended products length:", len(productNumbers))
print("Number of recommended products per page:", len(productNumbers[0]))

List of recommended products length: 30
Number of recommended products per page: 4


In [None]:
# Extract each main product's id and append them to a new id list
# Example: 'https://www.myprotein.co.il/sports-nutrition/impact-whey-protein-powder/10530943.html' turns into 10530943
main_node_ids = []
for i, link in enumerate(links):
  main_node_ids.append(link.split('/')[-1].split('.')[0])
print(main_node_ids)

['10530943', '10530911', '12081395', '12360400', '11776868', '10529701', '10529988', '10530136', '10530657', '14960829', '11353515', '10798909', '13972449', '10529951', '11067704', '12869895', '11324199', '13125562', '12865492', '12360405', '13251950', '12081401', '13096868', '10926373', '10529805', '11350864', '11332873', '10530268', '12853864', '11332868']


**Create 30 nodes from the pages, connect them to their recommended items**

In [None]:
list_main_nodes = []
for id in main_node_ids:
  node = Node(id)
  list_main_nodes.append(node)

for i, node in enumerate(list_main_nodes):
  nodes_to_add = []
  for recommended in productNumbers[i]:
    try:
      # Find if the recommended product is in the list of 30 products in the page
      index_recommended_node = main_node_ids.index(recommended)
    except:
      index_recommended_node = -1
    finally:
      # If the recommended item was found in the 30 products, append it's id to the list of connections to the current node
      if index_recommended_node != -1:
        list_main_nodes[index_recommended_node].add_connection_to_me([node])
        nodes_to_add.append(list_main_nodes[index_recommended_node])
  node.add_connection_to_them(nodes_to_add)

In [None]:
for i in range(len(list_main_nodes)):
  print(f"Node {list_main_nodes[i].get_id()} points to {list_main_nodes[i].get_connections_to_them()}")
  print(f"Node {list_main_nodes[i].get_id()}, the following nodes point to me: {list_main_nodes[i].get_connections_to_me()}\n")


Node 10530943 points to [<__main__.Node object at 0x7cc6a5afb9a0>]
Node 10530943, the following nodes point to me: [<__main__.Node object at 0x7cc6a4467be0>, <__main__.Node object at 0x7cc6a5afae60>, <__main__.Node object at 0x7cc6a5afa0b0>, <__main__.Node object at 0x7cc6a5afabc0>, <__main__.Node object at 0x7cc6a5afaef0>, <__main__.Node object at 0x7cc6a5afb850>, <__main__.Node object at 0x7cc6a5afbee0>, <__main__.Node object at 0x7cc6a5afa4d0>, <__main__.Node object at 0x7cc6a5afa590>, <__main__.Node object at 0x7cc6a5afa440>, <__main__.Node object at 0x7cc6a5af9f00>, <__main__.Node object at 0x7cc6a5af9000>, <__main__.Node object at 0x7cc6a5afacb0>, <__main__.Node object at 0x7cc6a5af88e0>, <__main__.Node object at 0x7cc6a5af9f30>, <__main__.Node object at 0x7cc6a5af8910>, <__main__.Node object at 0x7cc6a5af8520>, <__main__.Node object at 0x7cc6a5afae90>, <__main__.Node object at 0x7cc6a5af8f40>, <__main__.Node object at 0x7cc6a5af8d90>]

Node 10530911 points to [<__main__.Node obj

In [None]:
# Authority and Hub score
# Initialize the sum of authority scores to 0
auth_score_sum = 0
# Calculate authority score for each node
for node in list_main_nodes:
  # Iterate through all nodes connected to the current node
  for node2 in node.get_connections_to_them():
    # Increment the authority score of the current node by the hub score of the connected node
    node.increment_auth_score(node2.get_hub_score())
  # Add the current node's authority score to the total sum
  auth_score_sum += node.get_auth_score()

# Normalize authority scores by dividing each score by the total sum
for node in list_main_nodes:
  node.set_auth_score(node.get_auth_score() / auth_score_sum)

# Initialize the sum of hub scores to 0
hub_score_sum = 0

# Calculate hub score for each node
for node in list_main_nodes:
  # Iterate through all nodes connected to the current node
  for node2 in node.get_connections_to_them():
    # Increment the hub score of the connected node by the authority score of the current node
    node2.increment_hub_score(node.get_auth_score())
  # Add the current node's hub score to the total sum
  hub_score_sum += node.get_hub_score()

# Normalize hub scores by dividing each score by the total sum
for node in list_main_nodes:
  node.set_hub_score(node.get_hub_score() / hub_score_sum)

# Iterate 30 times to print the hub and authority scores of the first 30 nodes
for i in range(30):
  print(f"Node id {list_main_nodes[i]} has hub score of",  "{:.4f}".format(list_main_nodes[i].get_hub_score()), "and auth score of", "{:.4f}".format(list_main_nodes[i].get_auth_score()))



Node id 10530943 has hub score of 0.0579 and auth score of 0.0220
Node id 10530911 has hub score of 0.0426 and auth score of 0.0330
Node id 12081395 has hub score of 0.0495 and auth score of 0.0220
Node id 12360400 has hub score of 0.0371 and auth score of 0.0440
Node id 11776868 has hub score of 0.0382 and auth score of 0.0440
Node id 10529701 has hub score of 0.0397 and auth score of 0.0220
Node id 10529988 has hub score of 0.0346 and auth score of 0.0330
Node id 10530136 has hub score of 0.0331 and auth score of 0.0440
Node id 10530657 has hub score of 0.0349 and auth score of 0.0330
Node id 14960829 has hub score of 0.0331 and auth score of 0.0440
Node id 11353515 has hub score of 0.0331 and auth score of 0.0330
Node id 10798909 has hub score of 0.0331 and auth score of 0.0330
Node id 13972449 has hub score of 0.0331 and auth score of 0.0330
Node id 10529951 has hub score of 0.0331 and auth score of 0.0330
Node id 11067704 has hub score of 0.0331 and auth score of 0.0440
Node id 12

**Page rank algorithm**

In [None]:
# Initialize page rank scores for all nodes to equal probability
for node in list_main_nodes:
  node.set_page_rank_score(1 / len(list_main_nodes))
  # Ensure every node has at least one outgoing link (to prevent division by zero)
  if len(node.get_connections_to_them()) == 0:
    node.add_connection_to_them(list_main_nodes)

# PageRank iteration
for i in range(101):
  # Iterate over all nodes
  for node in list_main_nodes:
    # Calculate the new page rank score for the current node
    page_rank_value = 0.15 / len(list_main_nodes)  # Teleportation factor
    for node2 in node.get_connections_to_me():
      # Contribute from incoming links based on the sender's page rank and outlinks
      page_rank_value += 0.85 * node2.get_page_rank_score() / len(node2.get_connections_to_them())
    node.set_page_rank_score(page_rank_value)



  # Convergence check
  # Print page rank scores every 10 iterations for monitoring
  if (i % 10 == 0):
    print(f"Iteration {i}:")
    for i in range(len(list_main_nodes)):
      print(f"Node {list_main_nodes[i].get_id()} PR:",  "{:.4f}".format(list_main_nodes[i].get_page_rank_score()), end=", ")
    print("")


Iteration 0:
Node 10530943 PR: 0.2694, Node 10530911 PR: 0.1183, Node 12081395 PR: 0.4449, Node 12360400 PR: 0.0381, Node 11776868 PR: 0.0630, Node 10529701 PR: 0.0701, Node 10529988 PR: 0.0144, Node 10530136 PR: 0.0050, Node 10530657 PR: 0.0475, Node 14960829 PR: 0.0050, Node 11353515 PR: 0.0050, Node 10798909 PR: 0.0050, Node 13972449 PR: 0.0050, Node 10529951 PR: 0.0050, Node 11067704 PR: 0.0050, Node 12869895 PR: 0.0050, Node 11324199 PR: 0.0050, Node 13125562 PR: 0.0050, Node 12865492 PR: 0.0050, Node 12360405 PR: 0.0381, Node 13251950 PR: 0.0050, Node 12081401 PR: 0.0050, Node 13096868 PR: 0.0050, Node 10926373 PR: 0.0050, Node 10529805 PR: 0.0050, Node 11350864 PR: 0.0050, Node 11332873 PR: 0.0050, Node 10530268 PR: 0.0050, Node 12853864 PR: 0.0050, Node 11332868 PR: 0.0064, 
Iteration 10:
Node 10530943 PR: 0.1838, Node 10530911 PR: 0.2904, Node 12081395 PR: 0.3170, Node 12360400 PR: 0.0191, Node 11776868 PR: 0.0316, Node 10529701 PR: 0.0217, Node 10529988 PR: 0.0064, Node 10530

In [None]:
# Find top page rank scoring page indexes and display their id number.
top_ten_results =[] # List to store tuples of index and page rank score

# Iterate over main nodes, creating tuples of index and page rank score
for i, item in enumerate(list_main_nodes):
  top_ten_results.append((i, item.get_page_rank_score()))

# Sort the results by page rank score in descending order
top_ten_results.sort(key=lambda x: x[1], reverse=True)

# Extract the indexes of the top 10 results
list_of_indexes_of_top_ten = []
for i in range(10):
  list_of_indexes_of_top_ten.append(top_ten_results[i][0])

# Print page rank scores and corresponding links for the top 10 results
for i in list_of_indexes_of_top_ten:
  print("PR: ", list_main_nodes[i].get_page_rank_score())
  print(links[i])

PR:  0.3072475127646222
https://www.myprotein.co.il/sports-nutrition/clear-whey-protein-powder/12081395.html
PR:  0.27951246918333494
https://www.myprotein.co.il/sports-nutrition/impact-whey-isolate-powder/10530911.html
PR:  0.1777605054993788
https://www.myprotein.co.il/sports-nutrition/impact-whey-protein-powder/10530943.html
PR:  0.03164354967948718
https://www.myprotein.co.il/sports-nutrition/vegan-protein-blend/11776868.html
PR:  0.021651089075854703
https://www.myprotein.co.il/sports-nutrition/soy-protein-isolate/10529701.html
PR:  0.019082790464743593
https://www.myprotein.co.il/sports-nutrition/clear-vegan-protein/12360400.html
PR:  0.011375
https://www.myprotein.co.il/sports-nutrition/impact-diet-whey/10530657.html
PR:  0.010560416666666668
https://www.myprotein.co.il/sports-nutrition/clear-vegan-protein-sample/12360405.html
PR:  0.006416666666666667
https://www.myprotein.co.il/sports-nutrition/impact-weight-gainer/10529988.html
PR:  0.006416666666666667
https://www.myprotein.