In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

In [None]:
# Location of the source product CSV, relative to this notebook's directory.
# Plain literal: there is nothing to interpolate, so no f-string prefix.
DATASET_PATH = "../../Datasets/amazon-products-dataset/Amazon-Products.csv"

In [None]:
class AmazonProductExtractor:
    """Download one product page and pull out its descriptive text sections.

    The page is fetched and parsed once, when the object is constructed; the
    parsed tree is kept on ``self.soup`` and read by ``extract_description``.
    """

    # ids of the <div> elements read by extract_description, in the order
    # their text appears in the returned list.
    _SECTION_IDS = (
        "productDescription",
        "feature-bullets",
        "productOverview_feature_div",
        "productDetails_techSpec_section_1",
    )

    def __init__(self, url, timeout=10):
        """Fetch and parse the page at ``url``.

        Args:
            url: Address of the product page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely on a stalled
                connection.

        Raises:
            Exception: If the response status code is not 200.
            requests.exceptions.RequestException: If the request itself fails
                (including exceeding ``timeout``).
        """
        self.url = url
        self.timeout = timeout
        self.soup = self._get_soup()

    def _get_soup(self):
        """Return the page at ``self.url`` parsed with the "html.parser" backend.

        Raises:
            Exception: If the response status code is not 200.
        """
        # Browser-like headers; presumably needed so the request is not
        # rejected as coming from an automated client.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }
        response = requests.get(self.url, headers=headers, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(
                f"Failed to fetch page, status code: {response.status_code}"
            )
        return BeautifulSoup(response.text, "html.parser")

    def _section_text(self, element_id):
        """Return the stripped text of the <div> with this id, or None if absent."""
        tag = self.soup.find("div", id=element_id)
        return tag.get_text(strip=True) if tag else None

    def extract_description(self):
        """Return the text of the four known description sections.

        Returns:
            A list of four entries, in the order of ``_SECTION_IDS``: product
            description, feature bullets, product overview and technical
            specification. Each entry is the section's text, or ``None`` when
            the section is not on the page. Missing sections are kept as
            ``None`` rather than being converted to the string "None", so they
            cannot be mistaken for real text.
        """
        return [self._section_text(element_id) for element_id in self._SECTION_IDS]

In [None]:
# Load the raw product table from the CSV at DATASET_PATH.
df = pd.read_csv(DATASET_PATH)

# Seed the output table with the two columns carried over unchanged, plus an
# empty "details" placeholder that is overwritten once the pages are scraped.
# Only the last expression of a cell is rendered, so the frame preview is kept
# for `new_df` alone.
new_df = df[["Unnamed: 0", "name"]].assign(details=None)
new_df.head()

In [None]:
def get_description(row):
    """Scrape the description sections for one product row.

    Args:
        row: Mapping-like row providing the product page address under
            "link" and the product name under "name".

    Returns:
        The list produced by ``AmazonProductExtractor.extract_description``,
        or ``[None, None, None, None]`` if anything goes wrong while fetching
        or parsing the page.
    """
    link, name = row["link"], row["name"]
    try:
        description = AmazonProductExtractor(link).extract_description()
        print(f"Description of {name}: {description}")
    except Exception as e:
        # Best effort: report the failure and keep going so a single bad page
        # does not abort the whole run.
        print(f"Failed to extract description for {name}: {e}")
        description = [None] * 4
    return description
    

# Run get_description once per row of `df` (axis=1 hands each row to the
# function) and store the resulting lists in the "details" column.
scraped_details = df.apply(get_description, axis=1)
new_df["details"] = scraped_details
new_df.head()

In [None]:
# Persist the scraped table. DataFrame.to_pickle has no `index` parameter
# (unlike to_csv), so passing index=False raises TypeError; the frame is
# pickled as-is.
new_df.to_pickle("Amazon-Products-Details.pkl")