In [17]:
# Vinted Data Scraper

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json

# Function to scrape Vinted listings
def scrape_vinted(base_url, pages=5, timeout=10):
    """
    Scrape listing cards from paginated Vinted search-result pages.

    Each page is requested as ``base_url`` plus a ``page`` query parameter.
    Every ``div.catalog-item`` element found in the returned HTML yields one
    row; cards that lack a title, price or link element are skipped.

    Parameters:
        base_url (str): URL of the Vinted search page. It may or may not
            already contain a query string.
        pages (int): Number of pages to request, starting at page 1.
        timeout (float): Seconds to wait for each HTTP response.

    Returns:
        pd.DataFrame: One row per listing with the columns ``title``,
            ``price``, ``link``, ``timestamp`` (``time.time()`` when the card
            was parsed) and ``request_url``. The columns are present even
            when nothing was scraped.
    """
    columns = ['title', 'price', 'link', 'timestamp', 'request_url']
    listings = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    # Start the query string with '?' when base_url does not have one yet.
    separator = '&' if '?' in base_url else '?'

    for page in range(1, pages + 1):
        url = f"{base_url}{separator}page={page}"

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # A network failure on one page should not discard earlier pages.
            print(f"Failed to fetch page {page}. Error: {exc}")
            response = None

        if response is not None and response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            for item in soup.find_all('div', class_='catalog-item'):
                title_tag = item.find('h3', class_='catalog-item__title')
                price_tag = item.find('span', class_='catalog-item__price')
                link_tag = item.find('a', class_='catalog-item__title-link')

                # Explicit checks instead of catching AttributeError: a missing
                # link tag would otherwise raise TypeError on subscripting None.
                if title_tag is None or price_tag is None or link_tag is None:
                    continue
                href = link_tag.get('href')
                if not href:
                    continue

                listings.append({
                    'title': title_tag.text.strip(),
                    'price': price_tag.text.strip(),
                    'link': f"https://www.vinted.fr{href}",
                    'timestamp': time.time(),
                    'request_url': url
                })
        elif response is not None:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")

        time.sleep(1)  # Pause to avoid being rate-limited

    return pd.DataFrame(listings, columns=columns)

# Example usage
if __name__ == "__main__":
    # Example search query; scrape_vinted appends the page number to it.
    BASE_URL = "https://www.vinted.fr/vetements?search_text=jeans"
    data = scrape_vinted(BASE_URL, pages=3)

    # Make an empty result visible instead of reporting success silently.
    if data.empty:
        print("Warning: no listings were scraped; 'vinted_listings.csv' will contain no rows.")

    # Save data to CSV
    data.to_csv('vinted_listings.csv', index=False)

    print("Scraping complete. Data saved to 'vinted_listings.csv'.")


Scraping complete. Data saved to 'vinted_listings.csv'.


In [18]:
import requests
import pandas as pd
import time

# Base API URL
BASE_URL = "https://www.vinted.com/api/v2/catalog/items"

# Function to fetch data from the API
def fetch_vinted_data(categories, pages=5, timeout=10):
    """
    Fetch catalog items from the Vinted API for several categories.

    For every category and every page number from 1 to ``pages`` one GET
    request is sent to ``BASE_URL``. Each entry of the ``"items"`` list in the
    JSON response becomes one row. A failed request is reported on stdout and
    skipped; the crawl then continues with the next page.

    Parameters:
        categories (iterable): Catalog IDs, sent as the ``catalog_ids``
            query parameter.
        pages (int): Number of pages to request per category.
        timeout (float): Seconds to wait for each HTTP response.

    Returns:
        pd.DataFrame: Columns ``Name``, ``Size``, ``Price``, ``Category`` and
            ``URL``. The columns are present even when no item was fetched.
    """
    columns = ["Name", "Size", "Price", "Category", "URL"]
    data = []
    for category in categories:
        for page in range(1, pages + 1):
            print(f"Fetching page {page} for category: {category}")

            params = {
                # Without these three keys every iteration would send the
                # same request regardless of category and page.
                "catalog_ids": category,
                "page": page,
                "per_page": 24,  # Adjust if necessary
                # NOTE(review): hard-coded value, presumably captured from a
                # browser session -- confirm it is still needed.
                'search_id': '19409578490',
                'time': str(int(time.time())),
            }

            try:
                response = requests.get(BASE_URL, params=params, timeout=timeout)
                response.raise_for_status()  # Raise error for HTTP issues
                items = response.json().get("items", [])
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a response body that is not valid JSON.
                print(f"Error fetching data for category {category}, page {page}: {e}")
                items = []

            for item in items:
                data.append({
                    "Name": item.get("title"),
                    "Size": item.get("size_title"),
                    "Price": item.get("price"),
                    "Category": category,
                    "URL": item.get("url"),
                })

            # Sleep after failures too, so errors do not turn into a request burst.
            time.sleep(1)

    return pd.DataFrame(data, columns=columns)

# Example category IDs (replace with actual IDs from Vinted API documentation or inspection)
categories = [2012, 2013]  # Replace with the actual category IDs for women and men
pages_to_scrape = 10

# Fetch and save data
dataset = fetch_vinted_data(categories, pages=pages_to_scrape)

# fetch_vinted_data only logs request errors, so an empty frame is the only
# sign here that every request failed.
if dataset.empty:
    print("Warning: no items were fetched; vinted_api_dataset.csv will contain no rows.")

dataset.to_csv("vinted_api_dataset.csv", index=False)
print("Data saved to vinted_api_dataset.csv")


Fetching page 1 for category: 2012
Error fetching data for category 2012, page 1: 401 Client Error: Unauthorized for url: https://www.vinted.com/api/v2/catalog/items?catalog_ids=2012&page=1&per_page=24
Fetching page 2 for category: 2012
Error fetching data for category 2012, page 2: 401 Client Error: Unauthorized for url: https://www.vinted.com/api/v2/catalog/items?catalog_ids=2012&page=2&per_page=24
Fetching page 3 for category: 2012
Error fetching data for category 2012, page 3: 401 Client Error: Unauthorized for url: https://www.vinted.com/api/v2/catalog/items?catalog_ids=2012&page=3&per_page=24
Fetching page 4 for category: 2012
Error fetching data for category 2012, page 4: 401 Client Error: Unauthorized for url: https://www.vinted.com/api/v2/catalog/items?catalog_ids=2012&page=4&per_page=24
Fetching page 5 for category: 2012
Error fetching data for category 2012, page 5: 401 Client Error: Unauthorized for url: https://www.vinted.com/api/v2/catalog/items?catalog_ids=2012&page=5&pe

In [None]:
# Proxy endpoints in host:port form, one per line.
ip_addresses = [
    "51.159.159.73:80",
    "51.158.169.52:29976",
    "5.189.130.42:23055",
    "188.165.49.152:80",
]

In [None]:
# Send one item request through a fixed proxy from the list. The entries are
# bare host:port strings, so the scheme Requests expects is added here.
proxy = {scheme: f"http://{ip_addresses[2]}" for scheme in ("http", "https")}
response = requests.get(
    f'https://www.vinted.fr/api/v2/items/{id_pants[1]}',
    cookies=cookies,
    headers=headers,
    proxies=proxy,
    timeout=10,  # do not hang forever on an unresponsive proxy
)
# Show the HTTP status first so it stays visible if the body is not valid JSON.
print(response)
data = response.json()

In [None]:
ip_addresses = ["51.158.169.52:29976","5.189.130.42:23055","188.165.49.152:80"]
for i in range(len_pants):
    # Pick a random proxy for each item and log which one was used.
    proxy_index = random.randint(0, len(ip_addresses) - 1)
    print(proxy_index)
    # The entries are bare host:port strings; add the scheme Requests expects.
    proxy = {scheme: f"http://{ip_addresses[proxy_index]}" for scheme in ("http", "https")}
    # The original passed a stray positional `s` (taken as `params`); the
    # other item-request cells send `cookies=cookies`, so it is restored here.
    response = requests.get(
        f'https://www.vinted.fr/api/v2/items/{id_pants[i]}',
        cookies=cookies,
        headers=headers,
        proxies=proxy,
        timeout=10,  # do not hang forever on an unresponsive proxy
    )
    data = response.json()
    # TODO: verify response to fix proxy error (max retries exceeded)
    item_details = data['item']
    size.append(item_details['size'])
    view_count.append(item_details['view_count'])
    favourite_count.append(item_details['favourite_count'])
    active_bid_count.append(item_details['active_bid_count'])
    price_numeric.append(item_details['price_numeric'])
    item_closing_action.append(item_details['item_closing_action'])
    brand_id.append(item_details['brand_id'])
    created_at_ts.append(item_details['created_at_ts'])
    updated_at_ts.append(item_details['updated_at_ts'])

    print(response)

In [None]:
# NOTE(review): this cell repeats the loop of the preceding cell; running both
# appends every item's fields to the accumulator lists twice.
ip_addresses = ["51.158.169.52:29976","5.189.130.42:23055","188.165.49.152:80"]
for i in range(len_pants):
    # Pick a random proxy for each item and log which one was used.
    proxy_index = random.randint(0, len(ip_addresses) - 1)
    print(proxy_index)
    # The entries are bare host:port strings; add the scheme Requests expects.
    proxy = {scheme: f"http://{ip_addresses[proxy_index]}" for scheme in ("http", "https")}
    # The original passed a stray positional `s` (taken as `params`); the
    # other item-request cells send `cookies=cookies`, so it is restored here.
    response = requests.get(
        f'https://www.vinted.fr/api/v2/items/{id_pants[i]}',
        cookies=cookies,
        headers=headers,
        proxies=proxy,
        timeout=10,  # do not hang forever on an unresponsive proxy
    )
    data = response.json()
    # TODO: verify response to fix proxy error (max retries exceeded)
    item_details = data['item']
    size.append(item_details['size'])
    view_count.append(item_details['view_count'])
    favourite_count.append(item_details['favourite_count'])
    active_bid_count.append(item_details['active_bid_count'])
    price_numeric.append(item_details['price_numeric'])
    item_closing_action.append(item_details['item_closing_action'])
    brand_id.append(item_details['brand_id'])
    created_at_ts.append(item_details['created_at_ts'])
    updated_at_ts.append(item_details['updated_at_ts'])

    print(response)

In [None]:
ip_addresses = [ "51.159.159.73:80","51.254.78.223:80","152.228.154.2:80","188.165.49.152:80"]

for i in range(len_pants):
    response = None
    # Try proxies until one request gets through. A proxy that raises a
    # request-level error (connection failure, proxy error, timeout) is
    # removed from the pool for the rest of the run.
    while response is None:
        if not ip_addresses:
            # randint(0, -1) would raise an unrelated ValueError; fail clearly.
            raise RuntimeError(f"No working proxy left while fetching item {id_pants[i]}")
        proxy_index = random.randint(0, len(ip_addresses) - 1)
        print(proxy_index)
        # The entries are bare host:port strings; add the scheme Requests expects.
        proxy = {scheme: f"http://{ip_addresses[proxy_index]}" for scheme in ("http", "https")}
        try:
            response = requests.get(
                f'https://www.vinted.fr/api/v2/items/{id_pants[i]}',
                params=params,
                cookies=cookies,
                headers=headers,
                proxies=proxy,
                timeout=10,  # do not hang forever on an unresponsive proxy
            )
        except requests.RequestException:
            # Catch only request failures (not KeyboardInterrupt and the like),
            # remove the used proxy and try a new one.
            ip_addresses.pop(proxy_index)

    data = response.json()
    item_details = data['item']
    size.append(item_details['size'])
    view_count.append(item_details['view_count'])
    favourite_count.append(item_details['favourite_count'])
    active_bid_count.append(item_details['active_bid_count'])
    price_numeric.append(item_details['price_numeric'])
    item_closing_action.append(item_details['item_closing_action'])
    brand_id.append(item_details['brand_id'])
    created_at_ts.append(item_details['created_at_ts'])
    updated_at_ts.append(item_details['updated_at_ts'])

    print(response)