### **Product** : ᖳᖰ Apple AirPods with Charging Case (2nd Generation)

**Note**: Since Walmart prohibits direct web scraping of reviews, we will obtain the reviews by following these steps:

1. Download the HTML pages containing the product reviews.
2. Extract the reviews from the downloaded HTML pages.


###    

---
# **Step-1 : Download the HTML pages containing the product reviews.**
---

## **Star Rating : (1 - 3)**

In [None]:
import requests
import os
import random
import time
from bs4 import BeautifulSoup


def get_proxies_list():
    """Return the proxy URLs the downloader can rotate through.

    The list is a static placeholder; replace the body with a lookup from a
    file, service, or API when a real proxy source is available.
    """
    endpoints = (
        ("104.248.76.249", 8888),
        ("188.240.148.226", 8080),
        ("212.193.241.137", 8080),
        ("185.105.137.15", 8080),
    )
    # A fresh list is built on every call, so callers may mutate the result.
    return [f"http://{host}:{port}" for host, port in endpoints]

def download_walmart_review_page(url, output_dir=".", max_retries=5, min_delay=2, max_delay=20):
    """
    Save one Walmart review page as an HTML file.

    This is a thin entry point: every argument is forwarded unchanged to
    download_walmart_review_page_requests and its result is returned as-is.

    Args:
        url (str): Address of the Walmart review page to fetch.
        output_dir (str, optional): Folder the HTML file is written to. Defaults to ".".
        max_retries (int, optional): Upper bound on download attempts. Defaults to 5.
        min_delay (int, optional): Lower bound of the random pause, in seconds. Defaults to 2.
        max_delay (int, optional): Upper bound of the random pause, in seconds. Defaults to 20.

    Returns:
        bool: True when the page was saved, False otherwise.
    """
    return download_walmart_review_page_requests(
        url,
        output_dir=output_dir,
        max_retries=max_retries,
        min_delay=min_delay,
        max_delay=max_delay,
    )

def download_walmart_review_page_requests(url, output_dir=".", max_retries=5, min_delay=2, max_delay=20):
    """
    Fetches one Walmart review page with requests and writes the HTML to disk.

    Every attempt sleeps for a random interval, then sends the request with a
    randomly chosen User-Agent and proxy. A failed request or a CAPTCHA
    ("Please verify you're a human") response triggers a retry after an
    additional, growing delay.

    Args:
        url (str): The URL of the Walmart review page.
        output_dir (str, optional): Existing directory to save the file in. Defaults to ".".
        max_retries (int, optional): Maximum number of attempts. Defaults to 5.
        min_delay (int, optional): Minimum random delay in seconds. Defaults to 2.
        max_delay (int, optional): Maximum random delay in seconds. Defaults to 20.

    Returns:
        bool: True once the page has been written to disk; False when every
        attempt failed or was answered with a CAPTCHA page.
    """
    from urllib.parse import parse_qs, urlsplit

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    ]
    for attempt in range(max_retries):
        # Random pause before every request so the traffic is less regular.
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            headers["User-Agent"] = random.choice(user_agents)
            # requests picks the proxy by the scheme of the target URL. The
            # review URLs are https://, so the proxy must be registered under
            # "https" as well; with only an "http" entry it is never used.
            proxy_url = random.choice(get_proxies_list())
            proxies = {"http": proxy_url, "https": proxy_url}
            response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
            response.raise_for_status()
            html_content = response.text
            # A CAPTCHA page comes back with a success status, so it has to be
            # detected from the body and must not be saved as review data.
            if is_capture_page(html_content):
                print(f"Capture page detected at: {url} (Attempt {attempt + 1}/{max_retries})")
                if attempt < max_retries - 1:
                    delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                    time.sleep(delay)
                    continue  # Skip saving and retry
                else:
                    print(f"Exceeded max retries for : {url} returning False")
                    return False

            file_name_base = "reviews_"
            # Read the page number from the parsed query string so that any
            # parameters following "page" do not leak into the file name.
            page_values = parse_qs(urlsplit(url).query).get("page")
            if page_values:
                page_number = page_values[0]
                file_name = f"{file_name_base}_page_{page_number}_requests.html"
            else:
                file_name = f"{file_name_base}_requests.html"

            file_path = os.path.join(output_dir, file_name)

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(html_content)

            print(f"Downloaded: {url} to {file_path}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Back off a little longer after each failed attempt.
                delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                time.sleep(delay)
            else:
                return False

    return False


def is_capture_page(html_content):
    """
    Tells whether the HTML is a CAPTCHA / human-verification page.

    Args:
        html_content (str): Raw HTML of the downloaded page.

    Returns:
        bool: True when the page contains a div with id "px-captcha", an h1
        whose text contains the verification prompt, or the prompt anywhere
        in the raw HTML; False otherwise.
    """
    prompt = "Please verify you're a human"
    document = BeautifulSoup(html_content, "html.parser")

    def heading_has_prompt(text):
        # BeautifulSoup passes None for tags without a single text child.
        return bool(text) and prompt in text

    return bool(
        document.find("div", {"id": "px-captcha"})
        or document.find("h1", string=heading_has_prompt)
        or prompt in html_content
    )


if __name__ == "__main__":
    base_url = "https://www.walmart.com/reviews/product/604342441"
    main_output_folder = "Walmart_AirPods_Reviews"  # Parent folder of the per-rating folders
    os.makedirs(main_output_folder, exist_ok=True)

    # Star rating -> number of review pages to fetch for that rating
    pages_per_rating = {1: 177, 2: 45, 3: 117}

    for rating, num_pages in pages_per_rating.items():
        # Each rating gets its own sub-folder below the main folder.
        output_folder = os.path.join(main_output_folder, f"Star_{rating}")
        os.makedirs(output_folder, exist_ok=True)

        for page_number in range(1, num_pages + 1):
            page_url = f"{base_url}?ratings={rating}&page={page_number}"
            downloaded = download_walmart_review_page(page_url, output_dir=output_folder)
            if not downloaded:
                print(f"Failed to download Rating {rating}, Page {page_number} (Requests)")
                continue
            print(f"Rating {rating}, Page {page_number} Downloaded Successfully (Requests)")

Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=1&page=1 to Walmart_AirPods_Reviews/Star_1/reviews__page_1_requests.html
Rating 1, Page 1 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=1&page=2 to Walmart_AirPods_Reviews/Star_1/reviews__page_2_requests.html
Rating 1, Page 2 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=1&page=3 to Walmart_AirPods_Reviews/Star_1/reviews__page_3_requests.html
Rating 1, Page 3 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=1&page=4 to Walmart_AirPods_Reviews/Star_1/reviews__page_4_requests.html
Rating 1, Page 4 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=1&page=5 to Walmart_AirPods_Reviews/Star_1/reviews__page_5_requests.html
Rating 1, Page 5 Downloaded Successfully (Requests)
Downloaded: https://www.w

##        

## **Star Rating - 4**

In [None]:
import requests
import os
import random
import time
from bs4 import BeautifulSoup

def get_proxies_list():
    """Return the static list of proxy URLs used for proxy rotation.

    Placeholder implementation: swap in a file, service, or API lookup to
    supply real proxies.
    """
    addresses = (
        "104.248.76.249:8888",
        "188.240.148.226:8080",
        "212.193.241.137:8080",
        "185.105.137.15:8080",
    )
    # Build a new list per call so the caller owns the returned object.
    return ["http://" + address for address in addresses]


def download_walmart_review_page(url, output_dir=".", max_retries=5, min_delay=2, max_delay=20):
    """
    Download a single Walmart review page and store it as an HTML file.

    The work is delegated to download_walmart_review_page_requests; this
    function only passes its arguments through.

    Args:
        url (str): Walmart review page URL.
        output_dir (str, optional): Destination directory for the file. Defaults to ".".
        max_retries (int, optional): Number of attempts before giving up. Defaults to 5.
        min_delay (int, optional): Shortest random pause in seconds. Defaults to 2.
        max_delay (int, optional): Longest random pause in seconds. Defaults to 20.

    Returns:
        bool: True if the page was saved, False if it could not be downloaded.
    """
    was_saved = download_walmart_review_page_requests(
        url,
        output_dir=output_dir,
        max_retries=max_retries,
        min_delay=min_delay,
        max_delay=max_delay,
    )
    return was_saved

def download_walmart_review_page_requests(url, output_dir=".", max_retries=5, min_delay=2, max_delay=20):
    """
    Fetches one Walmart review page with requests and writes the HTML to disk.

    Every attempt sleeps for a random interval, then sends the request with a
    randomly chosen User-Agent and proxy. A failed request or a CAPTCHA
    ("Please verify you're a human") response triggers a retry after an
    additional, growing delay.

    Args:
        url (str): The URL of the Walmart review page.
        output_dir (str, optional): Existing directory to save the file in. Defaults to ".".
        max_retries (int, optional): Maximum number of attempts. Defaults to 5.
        min_delay (int, optional): Minimum random delay in seconds. Defaults to 2.
        max_delay (int, optional): Maximum random delay in seconds. Defaults to 20.

    Returns:
        bool: True once the page has been written to disk; False when every
        attempt failed or was answered with a CAPTCHA page.
    """
    from urllib.parse import parse_qs, urlsplit

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    ]
    for attempt in range(max_retries):
        # Random pause before every request so the traffic is less regular.
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            headers["User-Agent"] = random.choice(user_agents)
            # requests picks the proxy by the scheme of the target URL. The
            # review URLs are https://, so the proxy must be registered under
            # "https" as well; with only an "http" entry it is never used.
            proxy_url = random.choice(get_proxies_list())
            proxies = {"http": proxy_url, "https": proxy_url}
            response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
            response.raise_for_status()
            html_content = response.text
            # A CAPTCHA page comes back with a success status, so it has to be
            # detected from the body and must not be saved as review data.
            if is_capture_page(html_content):
                print(f"Capture page detected at: {url} (Attempt {attempt + 1}/{max_retries})")
                if attempt < max_retries - 1:
                    delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                    time.sleep(delay)
                    continue  # Skip saving and retry
                else:
                    print(f"Exceeded max retries for : {url} returning False")
                    return False

            file_name_base = "Reviews"
            # Read the page number from the parsed query string so that any
            # parameters following "page" do not leak into the file name.
            page_values = parse_qs(urlsplit(url).query).get("page")
            if page_values:
                page_number = page_values[0]
                file_name = f"{file_name_base}_page_{page_number}_requests.html"
            else:
                file_name = f"{file_name_base}_requests.html"

            file_path = os.path.join(output_dir, file_name)

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(html_content)

            print(f"Downloaded: {url} to {file_path}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Back off a little longer after each failed attempt.
                delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                time.sleep(delay)
            else:
                return False

    return False


def is_capture_page(html_content):
    """
    Detects a CAPTCHA / human-verification page in downloaded HTML.

    Args:
        html_content (str): HTML text to inspect.

    Returns:
        bool: True if any of the verification markers is present, False otherwise.
    """
    verify_text = "Please verify you're a human"
    parsed = BeautifulSoup(html_content, "html.parser")

    # Marker 1: the container element with id "px-captcha".
    if parsed.find("div", {"id": "px-captcha"}):
        return True

    # Marker 2: an <h1> whose text holds the verification prompt.
    def is_verify_heading(text):
        if not text:
            return False
        return verify_text in text

    if parsed.find("h1", string=is_verify_heading):
        return True

    # Marker 3: the prompt anywhere in the raw markup.
    return verify_text in html_content


if __name__ == "__main__":
    base_url = "https://www.walmart.com/reviews/product/604342441?ratings=4"
    output_folder = "Walmart_Airpods_Reviews_Star_4"  # All 4-star pages are saved here
    os.makedirs(output_folder, exist_ok=True)

    # Fetch every 4-star review page, one request at a time.
    num_pages = 347
    for page_number in range(1, num_pages + 1):
        page_url = f"{base_url}&page={page_number}"
        downloaded = download_walmart_review_page(page_url, output_dir=output_folder)
        print(
            f"Page {page_number} Downloaded Successfully (Requests)"
            if downloaded
            else f"Failed to download page {page_number} (Requests)"
        )

Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=1 to Walmart_Airpods_Reviews_Star_4/Reviews_page_1_requests.html
Page 1 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=2 to Walmart_Airpods_Reviews_Star_4/Reviews_page_2_requests.html
Page 2 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=3 to Walmart_Airpods_Reviews_Star_4/Reviews_page_3_requests.html
Page 3 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=4 to Walmart_Airpods_Reviews_Star_4/Reviews_page_4_requests.html
Page 4 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=5 to Walmart_Airpods_Reviews_Star_4/Reviews_page_5_requests.html
Page 5 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=4&page=6 t

##        

## **Star Rating - 5**  
**Note**: Scraping all 1000 pages in a single run could trigger rate limiting or cause Walmart to block our IP. To mitigate this risk, we split the 1000 pages into two batches: pages 1–502 are scraped first, and the remaining pages (503–1000) are scraped in a separate run.

### **First 502 pages**

In [None]:
import requests
import os
import random
import time
from bs4 import BeautifulSoup

def get_proxies_list():
    """Return the proxy URLs to choose from when sending a request.

    The values are hard-coded for this example; a real setup would load them
    from a file, service, or API.
    """
    proxy_urls = [
        "http://104.248.76.249:8888",
        "http://188.240.148.226:8080",
        "http://212.193.241.137:8080",
        "http://185.105.137.15:8080",
    ]
    return proxy_urls


def download_walmart_review_page(url, output_dir=".", max_retries=5, min_delay=2, max_delay=10):
    """
    Save one Walmart review page as an HTML file.

    This is a thin entry point: every argument is forwarded unchanged to
    download_walmart_review_page_requests and its result is returned as-is.

    Args:
        url (str): Address of the Walmart review page to fetch.
        output_dir (str, optional): Folder the HTML file is written to. Defaults to ".".
        max_retries (int, optional): Upper bound on download attempts. Defaults to 5.
        min_delay (int, optional): Lower bound of the random pause, in seconds. Defaults to 2.
        max_delay (int, optional): Upper bound of the random pause, in seconds. Defaults to 10.

    Returns:
        bool: True when the page was saved, False otherwise.
    """
    return download_walmart_review_page_requests(
        url,
        output_dir=output_dir,
        max_retries=max_retries,
        min_delay=min_delay,
        max_delay=max_delay,
    )

def download_walmart_review_page_requests(url, output_dir=".", max_retries=5, min_delay=2, max_delay=10):
    """
    Fetches one Walmart review page with requests and writes the HTML to disk.

    Every attempt sleeps for a random interval, then sends the request with a
    randomly chosen User-Agent and proxy. A failed request or a CAPTCHA
    ("Please verify you're a human") response triggers a retry after an
    additional, growing delay.

    Args:
        url (str): The URL of the Walmart review page.
        output_dir (str, optional): Existing directory to save the file in. Defaults to ".".
        max_retries (int, optional): Maximum number of attempts. Defaults to 5.
        min_delay (int, optional): Minimum random delay in seconds. Defaults to 2.
        max_delay (int, optional): Maximum random delay in seconds. Defaults to 10.

    Returns:
        bool: True once the page has been written to disk; False when every
        attempt failed or was answered with a CAPTCHA page.
    """
    from urllib.parse import parse_qs, urlsplit

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    ]
    for attempt in range(max_retries):
        # Random pause before every request so the traffic is less regular.
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            headers["User-Agent"] = random.choice(user_agents)
            # requests picks the proxy by the scheme of the target URL. The
            # review URLs are https://, so the proxy must be registered under
            # "https" as well; with only an "http" entry it is never used.
            proxy_url = random.choice(get_proxies_list())
            proxies = {"http": proxy_url, "https": proxy_url}
            response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
            response.raise_for_status()
            html_content = response.text
            # A CAPTCHA page comes back with a success status, so it has to be
            # detected from the body and must not be saved as review data.
            if is_capture_page(html_content):
                print(f"Capture page detected at: {url} (Attempt {attempt + 1}/{max_retries})")
                if attempt < max_retries - 1:
                    delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                    time.sleep(delay)
                    continue  # Skip saving and retry
                else:
                    print(f"Exceeded max retries for : {url} returning False")
                    return False

            file_name_base = "Reviews"
            # Read the page number from the parsed query string so that any
            # parameters following "page" do not leak into the file name.
            page_values = parse_qs(urlsplit(url).query).get("page")
            if page_values:
                page_number = page_values[0]
                file_name = f"{file_name_base}_page_{page_number}_requests.html"
            else:
                file_name = f"{file_name_base}_requests.html"

            file_path = os.path.join(output_dir, file_name)

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(html_content)

            print(f"Downloaded: {url} to {file_path}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Back off a little longer after each failed attempt.
                delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                time.sleep(delay)
            else:
                return False

    return False


def is_capture_page(html_content):
    """
    Tells whether the HTML is a CAPTCHA / human-verification page.

    Args:
        html_content (str): Raw HTML of the downloaded page.

    Returns:
        bool: True when the page contains a div with id "px-captcha", an h1
        whose text contains the verification prompt, or the prompt anywhere
        in the raw HTML; False otherwise.
    """
    prompt = "Please verify you're a human"
    document = BeautifulSoup(html_content, "html.parser")

    def heading_has_prompt(text):
        # BeautifulSoup passes None for tags without a single text child.
        return bool(text) and prompt in text

    return bool(
        document.find("div", {"id": "px-captcha"})
        or document.find("h1", string=heading_has_prompt)
        or prompt in html_content
    )


if __name__ == "__main__":
    base_url = "https://www.walmart.com/reviews/product/604342441?ratings=5"
    output_folder = "Walmart_Airpods_Reviews_Star_5"  # All 5-star pages are saved here
    os.makedirs(output_folder, exist_ok=True)

    # First batch of 5-star pages: 1 through num_pages.
    num_pages = 502
    for page_number in range(1, num_pages + 1):
        page_url = f"{base_url}&page={page_number}"
        downloaded = download_walmart_review_page(page_url, output_dir=output_folder)
        if not downloaded:
            print(f"Failed to download page {page_number} (Requests)")
            continue
        print(f"Page {page_number} Downloaded Successfully (Requests)")

## **Remaining Pages :**

In [None]:
import requests
import os
import random
import time
from bs4 import BeautifulSoup

def get_proxies_list():
    """Return the candidate proxy URLs as a new list.

    This example uses fixed values; plug in a file, service, or API lookup
    here to provide real proxies.
    """
    static_proxies = (
        "http://104.248.76.249:8888",
        "http://188.240.148.226:8080",
        "http://212.193.241.137:8080",
        "http://185.105.137.15:8080",
    )
    # Copy into a list so each caller receives its own mutable object.
    return list(static_proxies)


def download_walmart_review_page(url, output_dir=".", max_retries=5, min_delay=2, max_delay=10):
    """
    Download a single Walmart review page and store it as an HTML file.

    The work is delegated to download_walmart_review_page_requests; this
    function only passes its arguments through.

    Args:
        url (str): Walmart review page URL.
        output_dir (str, optional): Destination directory for the file. Defaults to ".".
        max_retries (int, optional): Number of attempts before giving up. Defaults to 5.
        min_delay (int, optional): Shortest random pause in seconds. Defaults to 2.
        max_delay (int, optional): Longest random pause in seconds. Defaults to 10.

    Returns:
        bool: True if the page was saved, False if it could not be downloaded.
    """
    was_saved = download_walmart_review_page_requests(
        url,
        output_dir=output_dir,
        max_retries=max_retries,
        min_delay=min_delay,
        max_delay=max_delay,
    )
    return was_saved

def download_walmart_review_page_requests(url, output_dir=".", max_retries=5, min_delay=2, max_delay=10):
    """
    Fetches one Walmart review page with requests and writes the HTML to disk.

    Every attempt sleeps for a random interval, then sends the request with a
    randomly chosen User-Agent and proxy. A failed request or a CAPTCHA
    ("Please verify you're a human") response triggers a retry after an
    additional, growing delay.

    Args:
        url (str): The URL of the Walmart review page.
        output_dir (str, optional): Existing directory to save the file in. Defaults to ".".
        max_retries (int, optional): Maximum number of attempts. Defaults to 5.
        min_delay (int, optional): Minimum random delay in seconds. Defaults to 2.
        max_delay (int, optional): Maximum random delay in seconds. Defaults to 10.

    Returns:
        bool: True once the page has been written to disk; False when every
        attempt failed or was answered with a CAPTCHA page.
    """
    from urllib.parse import parse_qs, urlsplit

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    ]
    for attempt in range(max_retries):
        # Random pause before every request so the traffic is less regular.
        time.sleep(random.uniform(min_delay, max_delay))
        try:
            headers["User-Agent"] = random.choice(user_agents)
            # requests picks the proxy by the scheme of the target URL. The
            # review URLs are https://, so the proxy must be registered under
            # "https" as well; with only an "http" entry it is never used.
            proxy_url = random.choice(get_proxies_list())
            proxies = {"http": proxy_url, "https": proxy_url}
            response = requests.get(url, headers=headers, timeout=10, proxies=proxies)
            response.raise_for_status()
            html_content = response.text
            # A CAPTCHA page comes back with a success status, so it has to be
            # detected from the body and must not be saved as review data.
            if is_capture_page(html_content):
                print(f"Capture page detected at: {url} (Attempt {attempt + 1}/{max_retries})")
                if attempt < max_retries - 1:
                    delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                    time.sleep(delay)
                    continue  # Skip saving and retry
                else:
                    print(f"Exceeded max retries for : {url} returning False")
                    return False

            file_name_base = "Reviews"
            # Read the page number from the parsed query string so that any
            # parameters following "page" do not leak into the file name.
            page_values = parse_qs(urlsplit(url).query).get("page")
            if page_values:
                page_number = page_values[0]
                file_name = f"{file_name_base}_page_{page_number}_requests.html"
            else:
                file_name = f"{file_name_base}_requests.html"

            file_path = os.path.join(output_dir, file_name)

            with open(file_path, "w", encoding="utf-8") as file:
                file.write(html_content)

            print(f"Downloaded: {url} to {file_path}")
            return True

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} (Attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                # Back off a little longer after each failed attempt.
                delay = random.uniform(min_delay, max_delay) + (attempt * 5)
                time.sleep(delay)
            else:
                return False

    return False


def is_capture_page(html_content):
    """
    Detects a CAPTCHA / human-verification page in downloaded HTML.

    Args:
        html_content (str): HTML text to inspect.

    Returns:
        bool: True if any of the verification markers is present, False otherwise.
    """
    verify_text = "Please verify you're a human"
    parsed = BeautifulSoup(html_content, "html.parser")

    # Marker 1: the container element with id "px-captcha".
    if parsed.find("div", {"id": "px-captcha"}):
        return True

    # Marker 2: an <h1> whose text holds the verification prompt.
    def is_verify_heading(text):
        if not text:
            return False
        return verify_text in text

    if parsed.find("h1", string=is_verify_heading):
        return True

    # Marker 3: the prompt anywhere in the raw markup.
    return verify_text in html_content


if __name__ == "__main__":
    base_url = "https://www.walmart.com/reviews/product/604342441?ratings=5"
    output_folder = "Walmart_Airpods_Reviews_Star_5"  # Same folder as the first batch
    os.makedirs(output_folder, exist_ok=True)

    # Second batch of 5-star pages: 503 through num_pages.
    num_pages = 1000
    for page_number in range(503, num_pages + 1):
        page_url = f"{base_url}&page={page_number}"
        downloaded = download_walmart_review_page(page_url, output_dir=output_folder)
        print(
            f"Page {page_number} Downloaded Successfully (Requests)"
            if downloaded
            else f"Failed to download page {page_number} (Requests)"
        )

Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=5&page=503 to Walmart_Airpods_Reviews_Star_5/Reviews_page_503_requests.html
Page 503 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=5&page=504 to Walmart_Airpods_Reviews_Star_5/Reviews_page_504_requests.html
Page 504 Downloaded Successfully (Requests)
Capture page detected at: https://www.walmart.com/reviews/product/604342441?ratings=5&page=505 (Attempt 1/5)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=5&page=505 to Walmart_Airpods_Reviews_Star_5/Reviews_page_505_requests.html
Page 505 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=5&page=506 to Walmart_Airpods_Reviews_Star_5/Reviews_page_506_requests.html
Page 506 Downloaded Successfully (Requests)
Downloaded: https://www.walmart.com/reviews/product/604342441?ratings=5&page=507 to Walmart_Airpods_Reviews_Star_5/Reviews_page_507

##      

---
# **Step-2 : Extract the reviews from the downloaded HTML pages**
---

## **Star Rating : (1 - 3)**

In [None]:
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

# Root folder of the downloaded 1-3 star review pages; it is searched recursively.
main_directory = '/content/drive/My Drive/Walmart_AirPods_Reviews_Star(1-3)'

# One dict per review, collected from every HTML file found below main_directory
all_reviews_data = []

for root, dirs, files in os.walk(main_directory):
    for file_name in files:
        # Only the saved HTML pages are of interest
        if not file_name.endswith('.html'):
            continue

        file_path = os.path.join(root, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # The reviews are embedded as JSON in the <script id="__NEXT_DATA__"> tag
        soup = BeautifulSoup(html_content, "html.parser")
        script_tag = soup.find("script", id="__NEXT_DATA__", type="application/json")
        if not script_tag:
            print(f"No script tag with id='__NEXT_DATA__' found in file {file_path}.")
            continue

        try:
            # .string is None for an empty tag; "" makes json.loads raise
            # JSONDecodeError, which is reported like any other broken payload.
            json_data = json.loads(script_tag.string or "")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file_path}: {e}")
            continue

        # Walk down to the reviews section. A missing key or a JSON null at any
        # level yields "no reviews" instead of an exception that would abort
        # the processing of all remaining files.
        node = json_data
        for key in ("props", "pageProps", "initialData", "data", "reviews"):
            node = node.get(key) if isinstance(node, dict) else None
        reviews = node.get("customerReviews") if isinstance(node, dict) else None
        if not isinstance(reviews, list):
            reviews = []

        for review in reviews:
            if not isinstance(review, dict):
                continue

            # Missing or null text fields become ""; review text additionally
            # has every run of whitespace collapsed to a single space.
            review_text = " ".join((review.get("reviewText") or "").split())
            review_title = (review.get("reviewTitle") or "").strip()
            rating = review.get("rating")
            user_name = (review.get("userNickname") or "").strip()
            review_date = review.get("reviewSubmissionTime", "")

            all_reviews_data.append({
                "Review_Title": review_title,
                "User_Name": user_name,
                "Rating": rating,
                "Review_Date": review_date,
                "Review_Text": review_text
            })

# Create DataFrame with all collected reviews
df1 = pd.DataFrame(all_reviews_data, columns=["Review_Title", "User_Name", "Rating", "Review_Date", "Review_Text"])
df1

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,,WalmartCustomer,1,10/7/2024,The AirPods were delivered to the wrong addres...
1,,WalmartCustomer,1,10/18/2024,"I never recieved my item, it was stolen and by..."
2,,patti,1,12/11/2024,The product is fine . It's the service that wa...
3,,Brittany,1,12/9/2024,I was not at all pleased with my purchase of t...
4,Defective Airpods,Tia,1,11/21/2024,I bought these as a back up in case I lost or ...
...,...,...,...,...,...
3376,,K,3,11/29/2019,I received these put it didnt come with a bloc...
3377,,Ken,3,1/6/2020,They are not the last way model they are first...
3378,,Sara,3,6/22/2020,"They broke already, only one ear bud working"
3379,I love it,Carol,3,9/27/2019,AirPod


##     

## **Star Rating - 4**

In [None]:
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

# Folder holding the downloaded 4-star review pages (only its direct entries are read)
directory = '/content/drive/MyDrive/Walmart_AirPods_Reviews_Star_4'

# One dict per review, collected from every HTML file in the directory
all_reviews_data = []

for file_name in os.listdir(directory):
    # Only the saved HTML pages are of interest
    if not file_name.endswith('.html'):
        continue

    file_path = os.path.join(directory, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # The reviews are embedded as JSON in the <script id="__NEXT_DATA__"> tag
    soup = BeautifulSoup(html_content, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if not script_tag:
        print(f"No script tag with id='__NEXT_DATA__' found in file {file_path}.")
        continue

    try:
        # .string is None for an empty tag; "" makes json.loads raise
        # JSONDecodeError, which is reported like any other broken payload.
        json_data = json.loads(script_tag.string or "")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
        continue

    # Walk down to the reviews section. A missing key or a JSON null at any
    # level yields "no reviews" instead of an exception that would abort the
    # processing of all remaining files.
    node = json_data
    for key in ("props", "pageProps", "initialData", "data", "reviews"):
        node = node.get(key) if isinstance(node, dict) else None
    reviews = node.get("customerReviews") if isinstance(node, dict) else None
    if not isinstance(reviews, list):
        reviews = []

    for review in reviews:
        if not isinstance(review, dict):
            continue

        # Missing or null text fields become ""; review text additionally has
        # every run of whitespace collapsed to a single space.
        review_text = " ".join((review.get("reviewText") or "").split())
        review_title = (review.get("reviewTitle") or "").strip()
        rating = review.get("rating")
        user_name = (review.get("userNickname") or "").strip()
        review_date = review.get("reviewSubmissionTime", "")

        all_reviews_data.append({
            "Review_Title": review_title,
            "User_Name": user_name,
            "Rating": rating,
            "Review_Date": review_date,
            "Review_Text": review_text
        })

# Create DataFrame with all collected reviews
df2 = pd.DataFrame(all_reviews_data, columns=["Review_Title", "User_Name", "Rating", "Review_Date", "Review_Text"])
df2

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,,Phairby,4,12/6/2024,"I received the package same day, shortly after..."
1,,Aden,4,12/12/2024,For the price and for how old these head phone...
2,Great for Personal Use,Christen,4,11/20/2024,These airpods are great for everyday use. They...
3,2nd Gen Airpods,Eugenia,4,12/31/2024,I like the good sound quality. They are much b...
4,Good quality bad ear canal shape.,Rachel,4,8/25/2024,The sound is awesome when I can keep them in m...
...,...,...,...,...,...
3463,Good,mzkitty76,4,6/3/2020,My husband bought these for himself.
3464,,Rogelio,4,3/3/2022,good quality and autentic
3465,,Carey,4,12/28/2019,"They sound y, but come out of the ear sometimes"
3466,Love them,kylar,4,5/19/2020,"Really nice AirPods, sounds amazing , can’t go..."


###      

## **Star Rating - 5**

### **First 502 pages**

In [None]:
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

# Folder holding the saved HTML pages of 5-star reviews (pages 1-502).
directory = '/content/drive/MyDrive/Walmart_AirPods_Reviews_Star_5(1-502)'

# One dict per review, accumulated across every HTML file in the folder.
all_reviews_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the resulting DataFrame reproducible between runs.
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith('.html'):
        continue

    file_path = os.path.join(directory, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML and locate the <script id="__NEXT_DATA__"> JSON payload.
    soup = BeautifulSoup(html_content, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if not script_tag:
        print(f"No script tag with id='__NEXT_DATA__' found in file {file_path}.")
        continue

    # `.string` is None for an empty tag; falling back to "" turns that case
    # into a reported JSONDecodeError instead of an uncaught TypeError.
    try:
        json_data = json.loads(script_tag.string or "")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
        continue

    # Walk props -> pageProps -> initialData -> data -> reviews. A plain
    # .get(key, {}) chain breaks when a key exists but holds JSON null, so
    # every step checks that it is still looking at a dict.
    node = json_data
    for key in ("props", "pageProps", "initialData", "data", "reviews"):
        node = node.get(key) if isinstance(node, dict) else None
    reviews = (node.get("customerReviews") if isinstance(node, dict) else None) or []

    for review in reviews:
        if not isinstance(review, dict):
            continue  # ignore null / malformed entries

        # Missing or null fields become ""; split()/join collapses all
        # internal whitespace runs (including newlines) to single spaces.
        review_text = " ".join((review.get("reviewText") or "").split())
        review_title = (review.get("reviewTitle") or "").strip()
        user_name = (review.get("userNickname") or "").strip()
        rating = review.get("rating")
        review_date = review.get("reviewSubmissionTime", "")

        all_reviews_data.append({
            "Review_Title": review_title,
            "User_Name": user_name,
            "Rating": rating,
            "Review_Date": review_date,
            "Review_Text": review_text,
        })

# Create DataFrame with all collected reviews (explicit column order).
df_502 = pd.DataFrame(all_reviews_data, columns=["Review_Title", "User_Name", "Rating", "Review_Date", "Review_Text"])
df_502

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,Father's Day gift,jeni,5,7/5/2024,My husband always uses his gift cards to buy t...
1,A-M-A-Z-I-N-G,Zynnique,5,1/3/2024,These AirPods are just 💋! The sound quality is...
2,Apple AirPods are a fantastic upgrade,Natalie,5,12/20/2024,The Apple AirPods with Charging Case (2nd Gene...
3,Older IS better this time!,McMere,5,12/27/2024,Love these 2nd generation Apple AirPods! Tried...
4,,,5,12/19/2024,My wife is really happy with the effort. She r...
...,...,...,...,...,...
5015,Great sound quality,jenr134,5,10/29/2024,The sound quality is great and has lasted me s...
5016,Goood quality,saraa908,5,10/21/2024,They are so comfortable in my ear and the soun...
5017,Quality,dianel101,5,10/24/2024,This AirPods has sound quality specially when ...
5018,Easy to use,nhipham96,5,10/16/2024,I really like it because it's very helpful for...


###      

### **Remaining Pages (503 - 1000)**

In [None]:
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

# Folder holding the saved HTML pages of 5-star reviews (pages 503-1000).
directory = '/content/drive/MyDrive/Walmart_AirPods_Reviews_Star_5(503-1000)'

# One dict per review, accumulated across every HTML file in the folder.
all_reviews_data = []

# Sorted listing: os.listdir() gives no ordering guarantee, and an unsorted
# walk would make the DataFrame's row order vary from run to run.
for file_name in sorted(os.listdir(directory)):
    if not file_name.endswith('.html'):
        continue

    file_path = os.path.join(directory, file_name)
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML and locate the <script id="__NEXT_DATA__"> JSON payload.
    soup = BeautifulSoup(html_content, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if not script_tag:
        print(f"No script tag with id='__NEXT_DATA__' found in file {file_path}.")
        continue

    # An empty tag has `.string` set to None; substituting "" makes
    # json.loads raise the JSONDecodeError handled below rather than an
    # uncaught TypeError that would abort the whole loop.
    try:
        json_data = json.loads(script_tag.string or "")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
        continue

    # Descend props -> pageProps -> initialData -> data -> reviews, tolerating
    # keys that are absent or explicitly null at any level.
    node = json_data
    for key in ("props", "pageProps", "initialData", "data", "reviews"):
        node = node.get(key) if isinstance(node, dict) else None
    reviews = (node.get("customerReviews") if isinstance(node, dict) else None) or []

    for review in reviews:
        if not isinstance(review, dict):
            continue  # ignore null / malformed entries

        # Missing or null fields become ""; split()/join collapses all
        # internal whitespace runs (including newlines) to single spaces.
        review_text = " ".join((review.get("reviewText") or "").split())
        review_title = (review.get("reviewTitle") or "").strip()
        user_name = (review.get("userNickname") or "").strip()
        rating = review.get("rating")
        review_date = review.get("reviewSubmissionTime", "")

        all_reviews_data.append({
            "Review_Title": review_title,
            "User_Name": user_name,
            "Rating": rating,
            "Review_Date": review_date,
            "Review_Text": review_text,
        })

# Create DataFrame with all collected reviews (explicit column order).
# NOTE: the name says "502" although the folder covers pages 503-1000; it is
# kept because the concatenation cell below refers to `df_502_1000`.
df_502_1000 = pd.DataFrame(all_reviews_data, columns=["Review_Title", "User_Name", "Rating", "Review_Date", "Review_Text"])
df_502_1000

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,Sound Investment,bradb38,5,10/17/2024,I love the sound and durability of my AirPods....
1,Great Product,shaunr23,5,10/31/2024,I bought these when they came out and haven't ...
2,Great buds,toxickitsunekun,5,10/22/2024,Perfect fit to wear comfortablely and easy to ...
3,Great fit,alexandrag256,5,11/12/2024,I absolutely love my AirPods I can't live with...
4,"Great to have, especially for traveling!",kamaj,5,11/7/2024,The sound quality was amazing! I had them for ...
...,...,...,...,...,...
4975,super comfortable earbuds with amazing sound q...,sagee10,5,1/31/2024,These AirPods were my go to AirPods for the lo...
4976,The sound is fantastic,brendab150,5,3/7/2024,"I left my Apple AirPods, and the charging case..."
4977,Apple AirPods,lum33,5,3/27/2024,I love the Apple AirPods. I have tried other e...
4978,Love the noise suppression,kylerk4,5,3/11/2024,I love the sound suppression that the AirPods ...


In [None]:
# Stack both 5-star batches row-wise (the default axis) and renumber the
# index from 0 so the two source indexes do not overlap.
five_star_parts = [df_502, df_502_1000]
df3 = pd.concat(five_star_parts, ignore_index=True)
df3

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,Father's Day gift,jeni,5,7/5/2024,My husband always uses his gift cards to buy t...
1,A-M-A-Z-I-N-G,Zynnique,5,1/3/2024,These AirPods are just 💋! The sound quality is...
2,Apple AirPods are a fantastic upgrade,Natalie,5,12/20/2024,The Apple AirPods with Charging Case (2nd Gene...
3,Older IS better this time!,McMere,5,12/27/2024,Love these 2nd generation Apple AirPods! Tried...
4,,,5,12/19/2024,My wife is really happy with the effort. She r...
...,...,...,...,...,...
9995,super comfortable earbuds with amazing sound q...,sagee10,5,1/31/2024,These AirPods were my go to AirPods for the lo...
9996,The sound is fantastic,brendab150,5,3/7/2024,"I left my Apple AirPods, and the charging case..."
9997,Apple AirPods,lum33,5,3/27/2024,I love the Apple AirPods. I have tried other e...
9998,Love the noise suppression,kylerk4,5,3/11/2024,I love the sound suppression that the AirPods ...


##        

---
# **Merging Review DataFrames (df1, df2, df3)**
---

In [None]:
# Stack df1, df2 and df3 row-wise (the default axis) into a single frame,
# discarding the per-frame indexes in favour of one continuous 0..n-1 index.
df = pd.concat((df1, df2, df3), ignore_index=True)
df

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,,WalmartCustomer,1,10/7/2024,The AirPods were delivered to the wrong addres...
1,,WalmartCustomer,1,10/18/2024,"I never recieved my item, it was stolen and by..."
2,,patti,1,12/11/2024,The product is fine . It's the service that wa...
3,,Brittany,1,12/9/2024,I was not at all pleased with my purchase of t...
4,Defective Airpods,Tia,1,11/21/2024,I bought these as a back up in case I lost or ...
...,...,...,...,...,...
16844,super comfortable earbuds with amazing sound q...,sagee10,5,1/31/2024,These AirPods were my go to AirPods for the lo...
16845,The sound is fantastic,brendab150,5,3/7/2024,"I left my Apple AirPods, and the charging case..."
16846,Apple AirPods,lum33,5,3/27/2024,I love the Apple AirPods. I have tried other e...
16847,Love the noise suppression,kylerk4,5,3/11/2024,I love the sound suppression that the AirPods ...


In [None]:
# Parse the '%m/%d/%Y' date strings into datetimes so the sort below is
# chronological rather than lexicographic.
df = df.assign(Review_Date=pd.to_datetime(df['Review_Date'], format='%m/%d/%Y'))

# Newest reviews first, then renumber the rows to match the new order.
df = df.sort_values('Review_Date', ascending=False).reset_index(drop=True)
df

Unnamed: 0,Review_Title,User_Name,Rating,Review_Date,Review_Text
0,Don't work,Lilly,1,2025-01-04,They don't work
1,,Janelle,5,2025-01-04,Love these so easy to use great sound
2,,Nicole,5,2025-01-04,For my daughter and she absolutely loves them....
3,,MONICA,5,2025-01-04,My daughter love them
4,Lasted less than 6 months,cynthia,3,2025-01-04,2nd pair of apple AirPods I've owned. The left...
...,...,...,...,...,...
16844,,,5,2019-04-05,This is the refreshed version for 2019. This s...
16845,,Clarified,5,2019-04-05,These are the latest. The latest gen comes in ...
16846,,HolyGraill,3,2019-04-04,The sound is okay and the connectivity and hey...
16847,,HolyGraill,3,2019-04-04,The sound is okay and the connectivity and hey...


In [None]:
# Write the merged, date-sorted reviews to disk; index=False leaves the
# positional row index out of the file.
df.to_csv(path_or_buf='Walmart_AirPods_All_Reviews.csv', index=False)