In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

reviews_url = 'https://www.amazon.com/Intel-i5-13600KF-Desktop-Processor-P-cores/dp/B0BCF5CZ16/ref=sr_1_9?crid=3T1HK1JCBJB5C&dib=eyJ2IjoiMSJ9.NjW4u06ZDPSjk1XiQIiwF1JoDVwT_YkiUR3BxeGPgmBDeBl6fw4BW453cTwMK80msJOcdkHwy4B6tgER9yWaRmHKkU7-OOa_GEpI8_c0Uzw3fZqop7LXCx16KufctVWUML-UmQ8wxL2Z-31ElMck27OPcSUbTq54Rtnlb24Xlbx9exPyr5tao1NOMkua4AhZvxSXGSoBEhjzhwZYxCOpF6bTxQd5DEKavO6Cw9nczsk.td8xttJx7tNUeAja1WRsX32w3ua0PvRqZqLRID52cMI&dib_tag=se&keywords=intel&qid=1720506772&sprefix=in%2Caps%2C376&sr=8-9'

len_page = 2

def reviewsHtml(url, len_page):
    soups = []
    for page_no in range(1, len_page + 1):
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }
        response = requests.get(url, headers=headers, verify=False)
        soup = BeautifulSoup(response.text, 'lxml')
        soups.append(soup)
    return soups

def getReviews(html_data):
    data_dicts = []
    boxes = html_data.select('div[data-hook="review"]')
    for box in boxes:
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        data_dict = {
            'Name': name,
            'Stars': stars,
            'Title': title,
            'Date': date,
            'Description': description
        }
        data_dicts.append(data_dict)
    return data_dicts

html_datas = reviewsHtml(reviews_url, len_page)

reviews = []
for html_data in html_datas:
    review = getReviews(html_data)
    reviews += review

df_reviews = pd.DataFrame(reviews)

print(df_reviews)

# Save data
df_reviews.to_csv('reviews.csv', index=False)




                Name Stars                                              Title  \
0             YALE70   5.0  5.0 out of 5 stars\nThe performance bargain of...   
1           Vladimir   5.0           5.0 out of 5 stars\nWhat an amazing CPU!   
2      Evan Herriges   5.0        5.0 out of 5 stars\nAmazingly underated cpu   
3    Amazon Customer   5.0          5.0 out of 5 stars\nComing from i5 12400f   
4   Clinton Peterson   5.0             5.0 out of 5 stars\nGreat performance!   
5      D. Villanueva   5.0  5.0 out of 5 stars\nPerfect processor for thos...   
6            monkous   4.0      4.0 out of 5 stars\nswapped from amd to intel   
7      Connor Loomis   5.0               5.0 out of 5 stars\nGreat for gaming   
8        Fernando M.   N/A                                          Excelente   
9            Joselma   N/A                                 Rápido e eficiente   
10               Sam   N/A                                      Great Upgrade   
11            Carlos   N/A  

In [3]:
import pandas as pd

input_jsonl = r"D:\Electronics.jsonl"
output_csv = 'intel_processor_reviews2.csv'


filtered_data = pd.DataFrame(columns=["user_id", "asin", "rating", "helpful_vote", "verified_purchase", "text"])


chunksize = 10000  
max_reviews = 1500 
total_filtered = 0

with open(input_jsonl, 'r', encoding='utf-8') as file:
    chunk = []
    for i, line in enumerate(file):
        chunk.append(line)
        if (i + 1) % chunksize == 0:
            df_chunk = pd.read_json(''.join(chunk), lines=True)
            filtered_chunk = df_chunk[df_chunk['text'].str.contains('Intel', case=False, na=False)]
            filtered_chunk = filtered_chunk[["user_id", "asin", "rating", "helpful_vote", "verified_purchase", "text"]]
            filtered_data = pd.concat([filtered_data, filtered_chunk], ignore_index=True)
            total_filtered += len(filtered_chunk)
            chunk = []
            if total_filtered >= max_reviews:
                break

    if chunk and total_filtered < max_reviews:
        df_chunk = pd.read_json(''.join(chunk), lines=True)
        filtered_chunk = df_chunk[df_chunk['text'].str.contains('Intel', case=False, na=False)]
        filtered_chunk = filtered_chunk[["user_id", "asin", "rating", "helpful_vote", "verified_purchase", "text"]]
        filtered_data = pd.concat([filtered_data, filtered_chunk], ignore_index=True)
        total_filtered += len(filtered_chunk)


filtered_data = filtered_data.head(max_reviews)

# Saving the filtered data to a new CSV file
filtered_data.to_csv(output_csv, index=False)

print(f"Filtered data containing 'Intel' has been saved to '{output_csv}'.")


  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''.join(chunk), lines=True)
  df_chunk = pd.read_json(''

Filtered data containing 'Intel' has been saved to 'intel_processor_reviews2.csv'.
