In [1]:
# import libraries
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException
import time
import datetime
from tqdm import tqdm
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

# constants
from confidential.conf_constants import *

In [3]:
# setting up chrome options
chrome_options = Options()
# browsing in incognito mode
chrome_options.add_argument("--incognito")
# setting window size to 1920x1080
chrome_options.add_argument("--window-size=1920x1080")

In [4]:
# create instance
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=
"/Users/marthab/Desktop/Python_Labs/vinted_crawler/chromedriver")

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# set url to Women > Jacquemus, filter: newest first
url = "https://www.vinted.de/vetements?brand_id[]=168278&catalog[]=1904&order=newest_first"
driver.get(url)

# wait 2 sec for page to fully load
time.sleep(2)

In [6]:
# don't accept cookies
try:
# privatsphäre-einstellung alle ablehnen
    reject_all_button = driver.find_element_by_css_selector("#onetrust-reject-all-handler")
    ActionChains(driver).click(reject_all_button).perform()
#NoSuchElementException thrown if not present
except NoSuchElementException:
    print("Not asked for privacy settings")

# choose country Germany
try:
    germany_button = driver.find_element_by_xpath("/html/body/div[13]/div/div/div/div[3]/div[3]/div[2]/div/h2/span")
    ActionChains(driver).click(germany_button).perform()
except NoSuchElementException:
    print("Not asked for country")

# wait 2 sec for page to fully load
time.sleep(2)

In [7]:
# make sure you are on the right page
if 'brand_id[]=168278&catalog[]=1904' not in driver.current_url:
    url = "https://www.vinted.de/vetements?brand_id[]=168278&catalog[]=1904&order=newest_first"
    driver.get(url)
    time.sleep(2)

In [8]:
# get url of all products on first page 
products = driver.find_elements_by_css_selector("a.ItemBox_overlay__1kNfX")
product_urls = [prod.get_attribute("href") for prod in products]

In [9]:
#scraping individual product details
product_details=[]
today = datetime.datetime.now().date()
yesterday = today - datetime.timedelta(days=1)

for product_url in tqdm(product_urls):
    driver.get(product_url)
    #time.sleep(3)

    # get upload_date and stop for loop if item was uploaded more than 1 day ago
    upload_date_full = driver.find_element_by_xpath("//div[@class='details-list__item-value']/time").get_attribute("datetime")
    upload_date = datetime.datetime.strptime(upload_date_full, "%Y-%m-%dT%H:%M:%S%z").date()
    if upload_date != yesterday and upload_date != today:
        break

    # get product id from url
    product_id = product_url.split('/')[-1]

    # get product subcatgory from url
    product_subcat = product_url.split('/')[-2]

    # get product catgory from url
    product_cat = product_url.split('/')[-3]

    # get brand
    brand = driver.find_element_by_xpath("//a[@itemprop='url']/span[@itemprop='name']").text

    # get price
    price = driver.find_element_by_xpath('/html/body/div[5]/div/section/div/div[2]/main/aside/div[1]/div[1]/div[1]/div[1]/span/div').text

    # get size 
    try:
        size_check = driver.find_element_by_xpath("//div[@class='details-list__item u-position-relative']/div[contains(text(), 'Größe')]")
        size = driver.find_element_by_xpath('/html/body/div[5]/div/section/div/div[2]/main/aside/div[1]/div[1]/div[2]/div[2]/div[2]').text
    except NoSuchElementException:
        size = "-"

    # get item condition
    condition = driver.find_element_by_xpath("//div[@itemprop='itemCondition']").text

    # get colour
    colour = driver.find_element_by_xpath("//div[@itemprop='color']").text


    infos ={
        'Upload_date':upload_date,
        'Product_Id':product_id,
        'Category':product_cat,
        'Sub_Category':product_subcat,
        'Brand':brand,
        'Price':price,
        'Size':size,
        'Condition':condition,
        'URL':product_url}

    product_details.append(infos)

    

 50%|█████     | 11/22 [00:32<00:32,  2.95s/it]


In [10]:
product_details_df = pd.DataFrame(product_details)
product_details_df

Unnamed: 0,Upload_date,Product_Id,Category,Sub_Category,Brand,Price,Size,Condition,URL
0,2021-08-20,1266371452-handtasche-riviera-jacquemus,taschen,handtaschen,JACQUEMUS,"190,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
1,2021-08-20,1266125235-neu-mit-etikett-jacquemus-shirt,tops-and-t-shirts,shirts,JACQUEMUS,"80,00 €",S / 36 / 8,"NEU, MIT ETIKETT",https://www.vinted.de/damen/kleidung/tops-and-...
2,2021-08-20,1266124136-jacquemus-handtasche,taschen,handtaschen,JACQUEMUS,"170,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
3,2021-08-20,1266053457-jacquemus-tasche,taschen,handtaschen,JACQUEMUS,"150,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
4,2021-08-20,1265934342-jacquemus-handtasche,taschen,handtaschen,JACQUEMUS,"170,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
5,2021-08-20,1265708928-schwarze-jacquemus-le-petit-chiquit...,taschen,handtaschen,JACQUEMUS,"135,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
6,2021-08-19,1264941733-jacquemus-fischerhut-neu,hute-and-mutzen,sonstiges,JACQUEMUS,"125,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/accessoires/hute-a...
7,2021-08-19,1264854208-jaquemus-le-gandjo-bag,taschen,handtaschen,JACQUEMUS,"210,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...
8,2021-08-19,1264055260-jacquemus-le-carre,taschen-and-rucksacke,umhangetaschen,JACQUEMUS,"500,00 €",-,SEHR GUT,https://www.vinted.de/herren/accessoires/tasch...
9,2021-08-19,1263716785-jacquemus-ciquito-tasche,taschen,handtaschen,JACQUEMUS,"330,00 €",-,"NEU, MIT ETIKETT",https://www.vinted.de/damen/taschen/handtasche...


In [13]:
product_details_df.to_csv('data/2008_product_details.csv')

In [2]:
product_details_df = pd.read_csv('data/2008_product_details.csv', index_col=0)

In [3]:
# import google credentials
scope = [SPSH_URL,
         GAPI_URL]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    GKEY_PATH, scope)
gc = gspread.authorize(credentials)

In [5]:
spreadsheet_key = SPSH_KEY
wks_name = 'crawler_output'
d2g.upload(product_details_df, spreadsheet_key, wks_name, credentials=credentials, col_names=True, row_names=True)

<Worksheet 'crawler_output' id:0>