In [None]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Start a Chrome session and open the category listing that will be scraped
url = "https://www.jarir.com/fiction-literature.html"  # The main product listing page
driver = webdriver.Chrome()  # Called with no arguments; no driver path is supplied here
driver.get(url)

# Scroll down to load all products.
# A single PAGE_DOWN only moves one viewport, so the document height usually
# has not changed yet on the first check and the loop would stop before any
# lazily loaded products appear. Jumping to the bottom of the document on each
# pass means the height stops growing only once nothing more is being appended.
SCROLL_PAUSE_SECONDS = 2  # Wait for more products to load after each scroll
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_SECONDS)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        # Height unchanged after scrolling to the bottom: no more content loaded
        break
    last_height = new_height

# Get all product links from the main page
soup = BeautifulSoup(driver.page_source, 'html.parser')
# urljoin resolves relative, root-relative and already-absolute hrefs against
# the site root; plain string concatenation would produce
# "https://www.jarir.com//..." for root-relative hrefs and a doubled host for
# absolute ones. The [href] filter skips anchors that have no target instead
# of raising KeyError on them.
product_links = [
    urljoin("https://www.jarir.com/", link["href"])
    for link in soup.select("a.product-tile__link[href]")
]

# Placeholder stored for any field that cannot be located on a product page
NOT_FOUND = "Not found"
# Upper bound on how many product pages are visited
MAX_PRODUCTS = 200


def _select_text(soup, selector, default=NOT_FOUND):
    """Return the stripped text of the first element matching *selector*.

    Returns *default* when no element matches.
    """
    node = soup.select_one(selector)
    return node.text.strip() if node is not None else default


def _detail_value(soup, label, default=NOT_FOUND):
    """Return the text of the <span> directly following the bold *label*.

    Looks inside ``div.ebook-details``. Uses ``:-soup-contains``, the
    supported spelling of the deprecated ``:contains`` pseudo-class.
    """
    selector = f"div.ebook-details b:-soup-contains('{label}') + span"
    return _select_text(soup, selector, default)


def _detail_trailing_text(soup, label, default=NOT_FOUND):
    """Return the bare text node that follows the bold *label*.

    Used for values that are not wrapped in a <span>. Returns *default* when
    the label or the following text node is missing.
    """
    label_node = soup.select_one(f"div.ebook-details b:-soup-contains('{label}')")
    if label_node is None:
        return default
    # ``string=`` replaces the deprecated ``text=`` keyword of the find methods
    value = label_node.find_next_sibling(string=True)
    return value.strip() if value is not None else default


def _attribute(tag, name, default=NOT_FOUND):
    """Return attribute *name* of *tag*, or *default* if the tag or attribute is absent."""
    if tag is None:
        return default
    # .get avoids the KeyError that tag[name] raises for a missing attribute
    return tag.get(name, default)


# Initialize an empty list to store product data
product_data = []

# Visit each product link and extract details.
# The loop runs inside try/finally so the browser is closed even when a page
# load raises (for example a read timeout); otherwise the Chrome session would
# be left running. The exception itself still propagates.
try:
    for product_url in product_links[:MAX_PRODUCTS]:
        driver.get(product_url)
        time.sleep(1)  # Allow time for the product page to load

        product_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract the second image URL with the specified class
        images = product_soup.select("img.image--contain")
        img_url = _attribute(images[1], "src") if len(images) > 1 else NOT_FOUND

        # Append data to the list (key order defines the DataFrame column order)
        product_data.append(
            {
                "url": product_url,
                "title": _select_text(product_soup, "h2.product-title__title"),
                "brand": _select_text(product_soup, "p.product-title__brand"),
                "author": _detail_value(product_soup, "المؤلف:"),
                "cover": _detail_value(product_soup, "الصيغة:"),
                "publication_date": _detail_trailing_text(product_soup, "تاريخ النشر:"),
                "category": _detail_value(product_soup, "تصنيف الكتاب:"),
                "reviews": _attribute(
                    product_soup.select_one("meta[itemprop='reviewCount']"),
                    "content",
                ),
                "num_pages": _detail_value(product_soup, "عدد الصفحات:"),
                "price": _select_text(
                    product_soup,
                    "div.price-box--ebook div.price span.ar-number",
                ),
                "img_url": img_url,
            }
        )
finally:
    # Close the driver
    driver.quit()

# Build the results table from the scraped records and tag every row with
# the same language label
df = pd.DataFrame(product_data).assign(Language="انجليزي")
# Keep the frame as the final expression so it is displayed
df
# Optionally, save the DataFrame to a CSV file
#df.to_csv("product_data.csv", index=False)


ReadTimeoutError: HTTPConnectionPool(host='localhost', port=51004): Read timed out. (read timeout=120)

In [4]:
# Remove the column-width limit so long cell values are shown in full
pd.options.display.max_colwidth = None
# Count how many scraped products fall under each "cover" value
df["cover"].value_counts()

cover
Paperback              180
Not found               12
Hardcover               11
Flexibound               1
Book with CD or DVD      1
DVD-ROM                  1
Name: count, dtype: int64

In [5]:
# Write the scraped table to disk without the DataFrame index column
output_csv = "Computer_tech.csv"
df.to_csv(output_csv, index=False)
