Finding children <b>recursively</b> entails getting *their* grandchildren too, as _viable_ results.

We can generalize this process for each card and collect:
- the brand name
- the model of the device
- the memory storage
- the screen/display size, and
- the connector type

Ratings by <b>*features*</b>.
> to be implemented

## Generalizaing Review Collection

In [1]:
# Libraries needed
import pandas as pd
import requests_html, re, time
from bs4 import BeautifulSoup
import csv


# General variables
mainUrl = 'https://www.amazon.com'
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
            'Accept-Language': 'en-US, en;q=0.5'})

pageNum = 1
url = """
      https://www.amazon.com/s?bbn=7072561011&rh=n%3A7072561011%2Cp_n_feature_twenty-
      one_browse-bin%3A21596696011&dc&qid=1665390197&rnid=17881854011&ref=lp_7072561011_nr_p_n_feature_twenty-one_browse-bin_0
      """
targetPages = 15
data = pd.DataFrame(columns=['brand', 'model', 'memoryStorage','opSystem','display', 'reviews'])

session = requests_html.HTMLSession()
while pageNum < targetPages:
    currentUrl = url
    response = session.get(url= currentUrl, headers=headers)
    
    # getting the response content

    primaryPageSoup = BeautifulSoup(markup=response.content, features='lxml')
    
    # finding the container 

    try:
        parentDiv = primaryPageSoup.find(name="div", 
                                         attrs={'class': 's-main-slot s-result-list s-search-results sg-row'})

        # finding the card with phone titles

        phoneDivs = parentDiv.findChildren(name="div", attrs={'data-uuid': True}, recursive=False)

        for phone in phoneDivs:

            redirectUrl = phone.find(name='a')['href']

            pageRedirect = session.get(mainUrl + redirectUrl, headers=headers)

            redirectSoup = BeautifulSoup(markup=pageRedirect.content, features='lxml')

            # Grabbing link to reviews
            try:
                reviewsUrl = redirectSoup.find(name="a", 
                                               attrs={'data-hook': 'see-all-reviews-link-foot'})['href']
            except TypeError:
                # If no reviews found, no point in collecting the information
                continue

            try:
                # Locate table with Device Information
                infoTable = redirectSoup.find(name='table', attrs={'class': 'a-normal a-spacing-micro'})

                # Gather Brand Information- eg, Samsung, Iphone, One Plus
                brandRow = infoTable.find(name='tr', attrs={'class': 'a-spacing-small po-brand'})
                brand = brandRow.findChildren(name='td', recursive=False)[1].text.strip()

                # Gather Device Model Information - eg, Iphone XS, S20FE
                modelRow = infoTable.find(name='tr', attrs={'class': 'a-spacing-small po-model_name'})
                model = modelRow.findChildren(name='td', recursive=False)[1].text.strip()

                # Gather Operating System Information - eg IOS, Android 13
                opSystemrow = infoTable.find(name='tr', 
                                             attrs={'class': 'a-spacing-small po-operating_system'})
                operatingSystem = opSystemrow.findChildren(name='td', recursive=False)[1].text.strip()

                # Gather ROM information - eg, 64GB, 128GB
                memoryStorageRow = infoTable.find(name='tr', 
                                                  attrs={'class': 'a-spacing-small po-memory_storage_capacity'})
                memoryStorage = memoryStorageRow.findChildren(name='td', recursive=False)[1].text.strip()

                # Gather Screen Display Size - eg, 5.8 Inches
                displaySizeRow = infoTable.find(name='tr',
                                                attrs={'class': 'a-spacing-small po-display.size'})
                displaySize = displaySizeRow.findChildren(name="td", recursive=False)[1].text.strip()

            except AttributeError:
                continue


            # Pagination for Reviews 
            pageCounter = 0

            while pageCounter < 5:
                # Getting request for the review page
                reviewsPage = session.get(url=mainUrl + reviewsUrl,
                                      headers=headers)
                # Creating the Soup
                reviewsSoup = BeautifulSoup(reviewsPage.content, features='lxml')

                try: 
                    # Obtaining the reviews parent div
                    reviewsDiv = reviewsSoup.find(name="div", 
                                               attrs={'id': 'cm_cr-review_list'})

                    # All reviews in their respective divs
                    reviewsList = reviewsDiv.findChildren(name='div',
                                                     attrs={'data-hook': 'review'},
                                                     recursive=False)

                    for div in reviewsList:
                        text = div.find(name="span", attrs={'data-hook': 'review-body'}).text.strip('\n')

                        # Append Information to csv, first five column entries will be repeated per review.
                        data.loc[len(data.index)] = [brand, model, memoryStorage, operatingSystem, displaySize, text]
                        
                    try:
                        # Getting next page url and updating the reviewsUrl variable
                        paginationContainer = reviewsSoup.find(name="ul", attrs={'class': 'a-pagination'})
                        nextReviewsUrl = paginationContainer.findChildren(name="li", recursive=False)[1].find(name="a")['href']
                        reviewsUrl = nextReviewsUrl
                    except TypeError:
                        break 


                except AttributeError:
                    break

                # Sleeping to avoid making too many requests
                time.sleep(10)

                # Updating the page counter
                pageCounter += 1
    
    except AttributeError:
        print("Try again in an hour! Bot is blocked")
        data.to_csv("./Data/amazon_data", mode="a",
                   header=False, index=False)
        break
        
    # Retrieve next Page Url
    paginationSpan = primaryPageSoup.find(name='span', attrs={'class': 's-pagination-strip'})
    
    print("Done with page {}".format(pageNum))
    pageNum += 1
    nextPageUrl = paginationSpan.find(name='a',
                   attrs={'aria-label': f'Go to next page, page {str(pageNum)}'})['href']
    
    url = mainUrl + nextPageUrl
    
    if pageNum != targetPages:
        print("Obtained link for page {}".format(pageNum), end="\n" + "*" * 50 +"\n")
        time.sleep(30)
    else:
        print("Done scraping", end="\n" + "*" * 50 + "\n")

Done with page 1
Obtained link for page 2
**************************************************
Try again in an hour! Bot is blocked
Done with page 2


AttributeError: 'NoneType' object has no attribute 'find'