## SCRAPING PAGES 111 TO 253

In [7]:
from time import sleep
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 111, 411

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Headers sent with every page request (browser-like User-Agent to prevent request blocking).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        result = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        # Network errors skip the page instead of aborting the cell and losing the data so far.
        print(f"Request error for page {i}: {e}")
        sleep(1)
        continue

    if result.status_code != 200:
        print(f"Failed to retrieve page {i}. Status code: {result.status_code}")
        sleep(1)  # Still pause, so a blocked run does not fire requests back-to-back
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums, page_locations = [], [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Locations
    soup2list(soup.find_all('span', class_='typography_body-m__k2UI7 typography_appearance-subtle__PYOVM'), page_locations)

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (locations, page_locations),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Location': locations,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# # Optionally save to CSV
# review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()


Failed to retrieve page 254. Status code: 403
Failed to retrieve page 255. Status code: 403
Failed to retrieve page 256. Status code: 403
Failed to retrieve page 257. Status code: 403
Failed to retrieve page 258. Status code: 403
Failed to retrieve page 259. Status code: 403
Failed to retrieve page 260. Status code: 403
Failed to retrieve page 261. Status code: 403
Failed to retrieve page 262. Status code: 403
Failed to retrieve page 263. Status code: 403
Failed to retrieve page 264. Status code: 403
Failed to retrieve page 265. Status code: 403
Failed to retrieve page 266. Status code: 403
Failed to retrieve page 267. Status code: 403
Failed to retrieve page 268. Status code: 403
Failed to retrieve page 269. Status code: 403
Failed to retrieve page 270. Status code: 403
Failed to retrieve page 271. Status code: 403
Failed to retrieve page 272. Status code: 403
Failed to retrieve page 273. Status code: 403
Failed to retrieve page 274. Status code: 403
Failed to retrieve page 275. Statu

## SCRAPING PAGES 254 TO 391

In [10]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 254, 411

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Browser User-Agent strings; fetch_page_with_retry picks one at random per attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
]

# Function to fetch a page with retries in case of failure
def fetch_page_with_retry(url, retries=5, timeout=30, delay=5):
    """Fetch ``url`` with a randomly chosen User-Agent, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the notebook indefinitely.
        delay: Seconds to sleep after every failed attempt.

    Returns:
        The response of the first attempt whose status code is 200, or ``None``
        when every attempt failed.
    """
    for _ in range(retries):
        # A fresh User-Agent is drawn from the module-level ``user_agents`` list per attempt.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            result = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Timeouts are raised as RequestException subclasses, so they land here too.
            print(f"Request error for {url}: {e}. Retrying...")
        else:
            if result.status_code == 200:
                return result
            print(f"Failed to retrieve page. Status code: {result.status_code}. Retrying...")
        # Pause after every failed attempt (including the last) before the next request goes out.
        time.sleep(delay)

    print(f"Failed to retrieve {url} after {retries} retries.")
    return None  # Return None if all retries fail

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    print(f"Fetching page {i}: {url}")

    # Fetch the page with retry mechanism
    result = fetch_page_with_retry(url)

    if not result:
        print(f"Skipping page {i} due to repeated failures.")
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums, page_locations = [], [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Locations
    soup2list(soup.find_all('span', class_='typography_body-m__k2UI7 typography_appearance-subtle__PYOVM'), page_locations)

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (locations, page_locations),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Location': locations,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# Persist this batch to CSV (overwrites an existing review_data.csv)
review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()

Fetching page 254: https://www.trustpilot.com/review/www.walmart.com?page=254
Fetching page 255: https://www.trustpilot.com/review/www.walmart.com?page=255
Fetching page 256: https://www.trustpilot.com/review/www.walmart.com?page=256
Fetching page 257: https://www.trustpilot.com/review/www.walmart.com?page=257
Fetching page 258: https://www.trustpilot.com/review/www.walmart.com?page=258
Fetching page 259: https://www.trustpilot.com/review/www.walmart.com?page=259
Fetching page 260: https://www.trustpilot.com/review/www.walmart.com?page=260
Fetching page 261: https://www.trustpilot.com/review/www.walmart.com?page=261
Fetching page 262: https://www.trustpilot.com/review/www.walmart.com?page=262
Fetching page 263: https://www.trustpilot.com/review/www.walmart.com?page=263
Fetching page 264: https://www.trustpilot.com/review/www.walmart.com?page=264
Fetching page 265: https://www.trustpilot.com/review/www.walmart.com?page=265
Fetching page 266: https://www.trustpilot.com/review/www.walmart

## SCRAPING PAGES 392 TO 513

In [13]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 392, 513

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Browser User-Agent strings; fetch_page_with_retry picks one at random per attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
]

# Function to fetch a page with retries in case of failure
def fetch_page_with_retry(url, retries=5, timeout=30, delay=5):
    """Fetch ``url`` with a randomly chosen User-Agent, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the notebook indefinitely.
        delay: Seconds to sleep after every failed attempt.

    Returns:
        The response of the first attempt whose status code is 200, or ``None``
        when every attempt failed.
    """
    for _ in range(retries):
        # A fresh User-Agent is drawn from the module-level ``user_agents`` list per attempt.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            result = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Timeouts are raised as RequestException subclasses, so they land here too.
            print(f"Request error for {url}: {e}. Retrying...")
        else:
            if result.status_code == 200:
                return result
            print(f"Failed to retrieve page. Status code: {result.status_code}. Retrying...")
        # Pause after every failed attempt (including the last) before the next request goes out.
        time.sleep(delay)

    print(f"Failed to retrieve {url} after {retries} retries.")
    return None  # Return None if all retries fail

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    print(f"Fetching page {i}: {url}")

    # Fetch the page with retry mechanism
    result = fetch_page_with_retry(url)

    if not result:
        print(f"Skipping page {i} due to repeated failures.")
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums = [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
# Location is not scraped in this cell, so the frame has no 'Location' column.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# # Optionally save to CSV
# review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()

Fetching page 392: https://www.trustpilot.com/review/www.walmart.com?page=392
Fetching page 393: https://www.trustpilot.com/review/www.walmart.com?page=393
Fetching page 394: https://www.trustpilot.com/review/www.walmart.com?page=394
Fetching page 395: https://www.trustpilot.com/review/www.walmart.com?page=395
Fetching page 396: https://www.trustpilot.com/review/www.walmart.com?page=396
Fetching page 397: https://www.trustpilot.com/review/www.walmart.com?page=397
Fetching page 398: https://www.trustpilot.com/review/www.walmart.com?page=398
Fetching page 399: https://www.trustpilot.com/review/www.walmart.com?page=399
Fetching page 400: https://www.trustpilot.com/review/www.walmart.com?page=400
Fetching page 401: https://www.trustpilot.com/review/www.walmart.com?page=401
Fetching page 402: https://www.trustpilot.com/review/www.walmart.com?page=402
Fetching page 403: https://www.trustpilot.com/review/www.walmart.com?page=403
Fetching page 404: https://www.trustpilot.com/review/www.walmart

## SCRAPING PAGES 445 TO 476

In [16]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 445, 513

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Browser User-Agent strings; fetch_page_with_retry picks one at random per attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
]

# Function to fetch a page with retries in case of failure
def fetch_page_with_retry(url, retries=5, timeout=30, delay=5):
    """Fetch ``url`` with a randomly chosen User-Agent, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the notebook indefinitely.
        delay: Seconds to sleep after every failed attempt.

    Returns:
        The response of the first attempt whose status code is 200, or ``None``
        when every attempt failed.
    """
    for _ in range(retries):
        # A fresh User-Agent is drawn from the module-level ``user_agents`` list per attempt.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            result = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Timeouts are raised as RequestException subclasses, so they land here too.
            print(f"Request error for {url}: {e}. Retrying...")
        else:
            if result.status_code == 200:
                return result
            print(f"Failed to retrieve page. Status code: {result.status_code}. Retrying...")
        # Pause after every failed attempt (including the last) before the next request goes out.
        time.sleep(delay)

    print(f"Failed to retrieve {url} after {retries} retries.")
    return None  # Return None if all retries fail

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    print(f"Fetching page {i}: {url}")

    # Fetch the page with retry mechanism
    result = fetch_page_with_retry(url)

    if not result:
        print(f"Skipping page {i} due to repeated failures.")
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums = [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
# Location is not scraped in this cell, so the frame has no 'Location' column.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# # Optionally save to CSV
# review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()

Fetching page 445: https://www.trustpilot.com/review/www.walmart.com?page=445
Fetching page 446: https://www.trustpilot.com/review/www.walmart.com?page=446
Fetching page 447: https://www.trustpilot.com/review/www.walmart.com?page=447
Fetching page 448: https://www.trustpilot.com/review/www.walmart.com?page=448
Fetching page 449: https://www.trustpilot.com/review/www.walmart.com?page=449
Fetching page 450: https://www.trustpilot.com/review/www.walmart.com?page=450
Fetching page 451: https://www.trustpilot.com/review/www.walmart.com?page=451
Fetching page 452: https://www.trustpilot.com/review/www.walmart.com?page=452
Fetching page 453: https://www.trustpilot.com/review/www.walmart.com?page=453
Fetching page 454: https://www.trustpilot.com/review/www.walmart.com?page=454
Fetching page 455: https://www.trustpilot.com/review/www.walmart.com?page=455
Fetching page 456: https://www.trustpilot.com/review/www.walmart.com?page=456
Fetching page 457: https://www.trustpilot.com/review/www.walmart

## SCRAPING PAGES 477 TO 497

In [19]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 477, 513

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Browser User-Agent strings; fetch_page_with_retry picks one at random per attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
]

# Function to fetch a page with retries in case of failure
def fetch_page_with_retry(url, retries=5, timeout=30, delay=5):
    """Fetch ``url`` with a randomly chosen User-Agent, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the notebook indefinitely.
        delay: Seconds to sleep after every failed attempt.

    Returns:
        The response of the first attempt whose status code is 200, or ``None``
        when every attempt failed.
    """
    for _ in range(retries):
        # A fresh User-Agent is drawn from the module-level ``user_agents`` list per attempt.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            result = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Timeouts are raised as RequestException subclasses, so they land here too.
            print(f"Request error for {url}: {e}. Retrying...")
        else:
            if result.status_code == 200:
                return result
            print(f"Failed to retrieve page. Status code: {result.status_code}. Retrying...")
        # Pause after every failed attempt (including the last) before the next request goes out.
        time.sleep(delay)

    print(f"Failed to retrieve {url} after {retries} retries.")
    return None  # Return None if all retries fail

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    print(f"Fetching page {i}: {url}")

    # Fetch the page with retry mechanism
    result = fetch_page_with_retry(url)

    if not result:
        print(f"Skipping page {i} due to repeated failures.")
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums = [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
# Location is not scraped in this cell, so the frame has no 'Location' column.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# # Optionally save to CSV
# review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()

Fetching page 477: https://www.trustpilot.com/review/www.walmart.com?page=477
Fetching page 478: https://www.trustpilot.com/review/www.walmart.com?page=478
Fetching page 479: https://www.trustpilot.com/review/www.walmart.com?page=479
Fetching page 480: https://www.trustpilot.com/review/www.walmart.com?page=480
Fetching page 481: https://www.trustpilot.com/review/www.walmart.com?page=481
Fetching page 482: https://www.trustpilot.com/review/www.walmart.com?page=482
Fetching page 483: https://www.trustpilot.com/review/www.walmart.com?page=483
Fetching page 484: https://www.trustpilot.com/review/www.walmart.com?page=484
Fetching page 485: https://www.trustpilot.com/review/www.walmart.com?page=485
Fetching page 486: https://www.trustpilot.com/review/www.walmart.com?page=486
Fetching page 487: https://www.trustpilot.com/review/www.walmart.com?page=487
Fetching page 488: https://www.trustpilot.com/review/www.walmart.com?page=488
Fetching page 489: https://www.trustpilot.com/review/www.walmart

## SCRAPING PAGES 498 TO 513

In [22]:
import random
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

# Function to extract data from BeautifulSoup objects
def soup2list(src, list_, attr=None):
    """Append one extracted value per element of ``src`` onto ``list_``.

    ``list_`` is mutated in place and nothing is returned.

    Args:
        src: Iterable of elements exposing ``get`` and ``get_text``
            (the callers in this notebook pass ``soup.find_all(...)`` results).
        list_: List that receives the extracted values.
        attr: When truthy, read this attribute from each element, falling back
            to "N/A" if the element lacks it. Otherwise take the element's
            whitespace-stripped text.
    """
    if attr:
        # Attribute mode: a missing attribute yields "N/A" instead of raising.
        def extract(node):
            return node.get(attr, "N/A")
    else:
        # Text mode: surrounding whitespace is stripped.
        def extract(node):
            return node.get_text(strip=True)

    for node in src:
        list_.append(extract(node))

# Trustpilot company slug and the inclusive page range requested by this cell.
company = 'www.walmart.com'
from_page, to_page = 498, 513

# One accumulator per output column; the generator yields six separate lists.
users, userReviewNum, ratings, locations, dates, reviews = ([] for _ in range(6))

# Browser User-Agent strings; fetch_page_with_retry picks one at random per attempt.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
]

# Function to fetch a page with retries in case of failure
def fetch_page_with_retry(url, retries=5, timeout=30, delay=5):
    """Fetch ``url`` with a randomly chosen User-Agent, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the notebook indefinitely.
        delay: Seconds to sleep after every failed attempt.

    Returns:
        The response of the first attempt whose status code is 200, or ``None``
        when every attempt failed.
    """
    for _ in range(retries):
        # A fresh User-Agent is drawn from the module-level ``user_agents`` list per attempt.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            result = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Timeouts are raised as RequestException subclasses, so they land here too.
            print(f"Request error for {url}: {e}. Retrying...")
        else:
            if result.status_code == 200:
                return result
            print(f"Failed to retrieve page. Status code: {result.status_code}. Retrying...")
        # Pause after every failed attempt (including the last) before the next request goes out.
        time.sleep(delay)

    print(f"Failed to retrieve {url} after {retries} retries.")
    return None  # Return None if all retries fail

# Scrape pages from_page..to_page (inclusive), one request per page.
for i in range(from_page, to_page + 1):
    url = f"https://www.trustpilot.com/review/{company}?page={i}"
    print(f"Fetching page {i}: {url}")

    # Fetch the page with retry mechanism
    result = fetch_page_with_retry(url)

    if not result:
        print(f"Skipping page {i} due to repeated failures.")
        continue  # Skip this page if request fails

    soup = BeautifulSoup(result.content, "html.parser")

    # Collect this page's values separately so the columns can be aligned per page.
    page_users, page_review_nums = [], []
    page_dates, page_ratings, page_reviews = [], [], []

    # Usernames
    soup2list(soup.find_all('span', class_='typography_heading-xs__osRhC'), page_users)

    # Total reviews: keep only the digits of each details block
    for user_info in soup.find_all('div', class_='styles_consumerExtraDetails__TylYM'):
        digits = ''.join(filter(str.isdigit, user_info.get_text(strip=True)))
        page_review_nums.append(digits if digits else "N/A")

    # Dates (value of each <time> element's datetime attribute)
    soup2list(soup.find_all('time'), page_dates, attr='datetime')

    # Ratings
    soup2list(soup.find_all('div', class_='styles_reviewHeader__PuHBd'), page_ratings, attr='data-service-review-rating')

    # Review texts
    for review_content in soup.find_all('div', class_='styles_reviewContent__SCYfD'):
        review_text = review_content.get_text(strip=True)
        page_reviews.append(review_text if review_text else "N/A")

    # Pad per page, not once at the very end: a count mismatch on one page then
    # cannot shift the rows of every later page out of alignment.
    # NOTE(review): values within a page are still matched by position only;
    # extracting each field from its own review card would be the complete fix —
    # TODO confirm the card container selector.
    page_columns = (
        (users, page_users),
        (userReviewNum, page_review_nums),
        (dates, page_dates),
        (reviews, page_reviews),
        (ratings, page_ratings),
    )
    page_rows = max(len(values) for _, values in page_columns)
    for column, values in page_columns:
        column.extend(values + ["N/A"] * (page_rows - len(values)))

    sleep(1)  # Prevents request throttling

# All column lists have equal length here because every page was padded above.
# Location is not scraped in this cell, so the frame has no 'Location' column.
review_data = pd.DataFrame(
    {
        'Username': users,
        'Total reviews': userReviewNum,
        'Date': dates,
        'Review': reviews,
        'Rating': ratings
    }
)

# # Optionally save to CSV
# review_data.to_csv("review_data.csv", index=False)

# Preview the first 5 rows; as the cell's last expression it gets the rich notebook display.
review_data.head()

Fetching page 498: https://www.trustpilot.com/review/www.walmart.com?page=498
Fetching page 499: https://www.trustpilot.com/review/www.walmart.com?page=499
Fetching page 500: https://www.trustpilot.com/review/www.walmart.com?page=500
Fetching page 501: https://www.trustpilot.com/review/www.walmart.com?page=501
Fetching page 502: https://www.trustpilot.com/review/www.walmart.com?page=502
Fetching page 503: https://www.trustpilot.com/review/www.walmart.com?page=503
Fetching page 504: https://www.trustpilot.com/review/www.walmart.com?page=504
Fetching page 505: https://www.trustpilot.com/review/www.walmart.com?page=505
Fetching page 506: https://www.trustpilot.com/review/www.walmart.com?page=506
Fetching page 507: https://www.trustpilot.com/review/www.walmart.com?page=507
Fetching page 508: https://www.trustpilot.com/review/www.walmart.com?page=508
Fetching page 509: https://www.trustpilot.com/review/www.walmart.com?page=509
Fetching page 510: https://www.trustpilot.com/review/www.walmart

In [23]:
# Bare expression: the notebook renders review_data as this cell's output.
# Every scraping cell reassigns review_data, so this shows whatever the most
# recently executed one produced.
review_data

Unnamed: 0,Username,Total reviews,Date,Review,Rating
0,,3,2025-03-09T00:02:13.000Z,After experience in the Center Texas …After ex...,1
1,Madeline Hall,4,2025-03-11T17:46:12.000Z,Walmart is greatWalmart is great! I ordered fr...,5
2,Christy Mashore,19,2025-03-11T02:19:00.000Z,WALMART On line ordering is a scam. NO SAVIN...,1
3,LA,,2025-03-11T20:29:17.000Z,Great store and peopleDate of experience:Decem...,5
4,Laurie,2,2017-12-15T22:43:26.000Z,Ordered video game on sale to pick-up …Ordered...,1
...,...,...,...,...,...
662,chiru,,,,
663,PS,,,,
664,priya sen,,,,
665,"Show reviews in all languages. (10,872reviews)",,,,


In [24]:
# Write review_data to a CSV file (index column omitted), then pass that file to files.download.
# NOTE(review): `files` is not defined in the visible cells — presumably imported in an
# earlier cell (e.g. google.colab's `files`); confirm before running on a fresh kernel.
csv_path = 'reviews.csv'
review_data.to_csv(csv_path, index=False)
files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>