In [None]:
!pip install selenium

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import math
import re
import time
import pandas as pd

In [3]:
# Profile page of the seller whose inventory is scraped
url = "https://www.discogs.com/seller/teckel45/profile"

# The seller name is the path segment that immediately follows 'seller'
url_parts = url.split(sep='/')
seller_index = url_parts.index('seller')
seller_name = url_parts[1 + seller_index]

In [7]:
# Create the Chrome WebDriver with an ad-blocking extension loaded
chrome_options = Options()
chrome_options.add_extension('uBlock_Origin_extension_folder/1.54.0_0.crx')

# Start the browser and open the seller page
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)

# Reject cookies.
# Wait until the button is actually clickable (visible and enabled), not merely
# present in the DOM: clicking a button that is present but not yet interactable
# raises an exception that is not handled here.
try:
    reject_cookie = WebDriverWait(driver, 3).until(
        EC.element_to_be_clickable((By.ID, 'onetrust-reject-all-handler'))
    )
    reject_cookie.click()
except TimeoutException:
    # No clickable banner showed up within the timeout: carry on without it
    print('No cookies banner')

# Maximize the window
driver.maximize_window()
                      

In [5]:
# Number of listings requested per page (also used in the page URL)
ITEMS_PER_PAGE = 250

# Work out how many pages to scrape: the last number found in the pagination
# summary is used as the total number of items for sale.
raw_pagination = driver.find_element(By.XPATH,'//*[@id="pjax_container"]/nav[1]/form/strong').text
match = re.search(r'\d{1,3}(?:,\d{3})*(?=\D*$)', raw_pagination)
if match is None:
    # Fail with an explicit message instead of an AttributeError on `None.group()`
    raise ValueError(f'Could not read the number of items for sale from: {raw_pagination!r}')
nb_for_sale = int(match.group().replace(',', ''))
max_page = math.ceil(nb_for_sale / ITEMS_PER_PAGE)


def _read_listing(description_element, price_element):
    """Return [release_url, media_condition, sleeve_condition, price] for one listing.

    `description_element` is an element with class 'item_description' and
    `price_element` the element with class 'price' found at the same position
    on the page. The sleeve condition is None when the listing has none.
    """
    media_condition = description_element.find_element(
        By.XPATH,
        './/p[@class="item_condition"]/span[contains(@class, "condition-label-mobile")]/following-sibling::span'
    ).text

    try:
        sleeve_condition = description_element.find_element(By.CLASS_NAME, 'item_sleeve_condition').text
    except NoSuchElementException:
        sleeve_condition = None

    # URL behind the 'view release' link
    release_url = description_element.find_element(By.CLASS_NAME, 'item_release_link').get_attribute('href')

    price = price_element.get_attribute('data-pricevalue')

    return [release_url, media_condition, sleeve_condition, price]


items_list = []

for page in range(max_page):

    url_page = f'{url}?sort=listed%2Cdesc&limit={ITEMS_PER_PAGE}&page={page+1}'
    driver.get(url_page)

    # Wait once per page for the listings and the prices to be present
    WebDriverWait(driver,3).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'item_description'))
    )
    WebDriverWait(driver,3).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'price'))
    )

    # Query the page once instead of once per item, then pair the i-th
    # description with the i-th price (zip stops at the shorter list).
    description_elements = driver.find_elements(By.CLASS_NAME, 'item_description')[:ITEMS_PER_PAGE]
    price_elements = driver.find_elements(By.CLASS_NAME, 'price')[:ITEMS_PER_PAGE]

    for item_description_element, item_price in zip(description_elements, price_elements):
        items_list.append(_read_listing(item_description_element, item_price))

# The browser session is deliberately left open here: the release-detail cell
# further down calls `driver.get(...)` on this same `driver` and quits it when done.

# Create a dataframe
df_items = pd.DataFrame(items_list, columns = ['release_url','media_condition','sleeve_condition','price'])

# Store and display the number of duplicates
nb_of_duplicates = df_items.duplicated().sum()
print(f'{nb_of_duplicates} duplicate has been removed')

# Remove the duplicates
df_items.drop_duplicates(inplace=True)

# Export the dataframe
try:
    df_items.to_csv(f'{seller_name}.csv')
except OSError as error:
    # Report why the file could not be written instead of hiding the cause
    print(f"Something went wrong: {error}")
else:
    print(f'{seller_name}.csv has been created')


1 duplicate has been removed
teckel45.csv has been created


In [8]:
# Record the start time
start_time = time.time()

# List of new columns to retrieve
new_columns = ['img_url', 'format_detail', 'nb_for_sale', 'current_lowest_price', 'have',
              'want', 'avg_rating', 'ratings', 'last_sold_date', 'low_price', 'median_price', 'high_price']

# Initialize columns with None: any field that cannot be read below keeps this value
for column in new_columns:
    df_items[column] = None

# XPath of every statistic that is read as plain text from the "Statistics" block
stats_root = '//*[@id="release-stats"]/div/div'
statistics_xpaths = {
    'have': f'{stats_root}/ul[1]/li[1]/a',
    'want': f'{stats_root}/ul[1]/li[2]/a',
    'avg_rating': f'{stats_root}/ul[1]/li[3]/span[2]',
    'ratings': f'{stats_root}/ul[1]/li[4]/a',
    'low_price': f'{stats_root}/ul[2]/li[2]/span[2]',
    'median_price': f'{stats_root}/ul[2]/li[3]/span[2]',
    'high_price': f'{stats_root}/ul[2]/li[4]/span[2]',
}


def _block_loaded(by, locator):
    """Wait up to 3 seconds for an element to be present; return False on timeout."""
    try:
        WebDriverWait(driver,3).until(EC.presence_of_element_located((by, locator)))
    except TimeoutException:
        return False
    return True


def _text_or_none(by, locator):
    """Return the text of the first matching element, or None when there is none."""
    try:
        return driver.find_element(by, locator).text
    except NoSuchElementException:
        return None


# Loop through each URL in the 'release_url' column
for index, release_url in df_items['release_url'].items():

    # Open the release page
    driver.get(release_url)

    # First block in the top right corner: cover image and format
    if _block_loaded(By.CLASS_NAME, 'body_32Bo9'):
        try:
            df_items.at[index, 'img_url'] = driver.find_element(By.XPATH, './/div[@class="image_3rzgk bezel_2NSgk"]/picture/img').get_attribute('src')
        except NoSuchElementException:
            df_items.at[index, 'img_url'] = None
        df_items.at[index, 'format_detail'] = _text_or_none(By.CLASS_NAME, 'format_item_3SAJn')

    # "For Sale on Discogs" block
    if _block_loaded(By.CLASS_NAME, 'forsale_QoVFl'):
        df_items.at[index, 'nb_for_sale'] = _text_or_none(By.XPATH, './/span[@class="forsale_QoVFl"]/a')
        df_items.at[index, 'current_lowest_price'] = _text_or_none(By.CLASS_NAME, 'price_2Wkos')

    # "Statistics" block.
    # On timeout only the current row is left at None. Assigning None to the
    # whole column here would erase the statistics of every row scraped so far.
    if _block_loaded(By.ID, 'release-stats'):
        for column, xpath in statistics_xpaths.items():
            # A missing statistic yields None instead of aborting the whole run
            df_items.at[index, column] = _text_or_none(By.XPATH, xpath)

        # The last sold date is read from a <time> inside a link when present,
        # otherwise from the plain span (e.g. 'Never')
        last_sold_date = _text_or_none(By.XPATH, f'{stats_root}/ul[2]/li[1]/a/time')
        if last_sold_date is None:
            last_sold_date = _text_or_none(By.XPATH, f'{stats_root}/ul[2]/li[1]/span[2]')
        df_items.at[index, 'last_sold_date'] = last_sold_date

driver.quit()

print("--- %s seconds ---" % (time.time() - start_time))

--- 1299.3862941265106 seconds ---


In [10]:
# Preview the first five enriched rows
df_items.head(n=5)

Unnamed: 0,release_url,media_condition,sleeve_condition,price,img_url,format_detail,nb_for_sale,current_lowest_price,have,want,avg_rating,ratings,last_sold_date,low_price,median_price,high_price
0,https://www.discogs.com/release/2540837-Bernar...,Very Good Plus (VG+),No Cover,5.0,https://i.discogs.com/NxJghaw9KbmkXzDZf4Yf14Ke...,"Vinyl, LP, Album",62 For Sale,€1.91,589,29,3.82 / 5,56,"Oct 17, 2023",€2.00,€5.00,€30.00
1,https://www.discogs.com/release/2215375-Soldat...,Very Good Plus (VG+),Very Good Plus (VG+),5.0,https://i.discogs.com/C33WYzZxSkjaqz3hq0IcJOBK...,"Vinyl, LP, Album, Stereo",22 For Sale,€10.00,639,99,4.12 / 5,49,"Dec 1, 2023",€5.00,€14.45,€32.20
2,https://www.discogs.com/release/6292482-Odeurs...,Good Plus (G+),Very Good Plus (VG+),15.0,https://i.discogs.com/6gwY4g28OR22Z-52h809RCjs...,"Vinyl, LP, Album, Test Pressing, White Label",1 For Sale,€8.00,3,17,3 / 5,2,Never,--,--,--
3,https://www.discogs.com/release/2168529-Edith-...,Very Good Plus (VG+),Very Good Plus (VG+),15.0,https://i.discogs.com/uGP70kkzl1XtA9sWrnhlk7V2...,"3 x Vinyl, LP",37 For Sale,€6.00,280,17,4.41 / 5,17,"Dec 13, 2021",€5.00,€14.99,€35.00
4,https://www.discogs.com/release/590446-Gilbert...,Very Good Plus (VG+),Very Good Plus (VG+),8.0,https://i.discogs.com/F7QxhIBNmi5eniXau6lZtoya...,"Vinyl, LP, Album",39 For Sale,€1.50,1056,75,3.69 / 5,59,"Dec 17, 2023",€2.00,€5.00,€10.90


In [11]:
# Export the enriched dataframe without the index column.
# NOTE(review): the file name is hard-coded and is not derived from `seller_name`.
df_items.to_csv(path_or_buf='teckel_45_full.csv', index=False)

In [1]:
# NOTE(review): looks like a leftover kernel smoke test; it is unrelated to the scraping
print("coucou")

coucou
