In [1]:
import os
import time

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
class eBaySpider(scrapy.Spider):
    """Spider that walks eBay Kleinanzeigen car search results and scrapes each ad.

    ``parse`` handles a search-results page: it requests every ad linked from
    the top-ads and the regular result list, then follows the pagination link.
    ``parse_listing`` handles a single ad page and yields one flat dict holding
    the ad's title, price and detail attributes.
    """
    name = "eBaySpider"
    # NOTE(review): the crawl starts on results page 2 ("seite:2") -- confirm
    # that skipping page 1 is intentional.
    start_urls = ['https://www.ebay-kleinanzeigen.de/s-autos/anzeige:angebote/seite:2/c216']

    # Number of leading <li> entries of each result list that are followed per page.
    TOP_AD_SLOTS = 2
    REGULAR_AD_SLOTS = 27

    # German attribute label as shown on the ad page -> key used in the scraped item.
    DETAIL_LABELS = {"Besch\u00e4digtes Fahrzeug:" : "notRepairedDamage",
                     "Marke:" : "brand",
                     "Modell:" : "model",
                     "Kilometerstand:" : "kilometer",
                     'Erstzulassungsjahr:' : "yearOfRegistration",
                     "Fahrzeugtyp:" : 'vehicleType',
                     "Kraftstoffart:" : 'fuelType',
                     "Leistung (PS):" : "powerPS",
                     "Getriebe:" : "gearbox",
                     "T\u00dcV Jahr:" : "someYearThing",
                     "T\u00dcV Monat:": "monthThing",
                     "Umweltplakette:" : "environmentalThing",
                     "Schadstoffklasse:" : "emissionThing",
                     "Au\u00dfenfarbe:": "color",
                     "Material Innenausstattung:" : "interiorMaterial",
                     "Anzahl Türen:" : "numberOfDoors",
                     "Ausstattung:" : 'domesticEquipment',
                     "Ort:" : "place",
                     "Erstellungsdatum:" : 'creationDate',
                     "Anzeigennummer:" : "referenceNumber",
                     "Erstzulassungsmonat:": "monthOfRegistration",
                    }

    def parse(self, response):
        """Request every ad linked from a search-results page, then the next page.

        Yields:
            scrapy.Request: one request per ad link (handled by
            ``parse_listing``) and, if a pagination link is present, one
            request for the next results page (handled by ``parse`` again).
        """
        # One query per result list; ``position() <= n`` selects the same
        # entries, in the same order, as probing li[1] .. li[n] one by one.
        link_xpath = '//*[@id="{}"]/li[position() <= {}]/article/div[2]/h2//a/@href'
        top_links = response.xpath(
            link_xpath.format('srchrslt-adtable-topads', self.TOP_AD_SLOTS)).extract()
        regular_links = response.xpath(
            link_xpath.format('srchrslt-adtable', self.REGULAR_AD_SLOTS)).extract()

        for href in top_links + regular_links:
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_listing)

        next_page_url = response.xpath('//*[@id="srchrslt-pagination"]/div/div[3]/a/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(url=response.urljoin(next_page_url), callback=self.parse)

    def parse_listing(self, response):
        """Scrape a single ad page into one flat item.

        The item holds ``name`` and ``price`` plus one entry for every value
        of ``DETAIL_LABELS``; attributes the ad does not list stay ``''``.

        Yields:
            dict: the scraped item. Nothing is yielded when the page has no
            ``viewad-main`` container.
        """
        if not response.xpath('//*[@id="viewad-main"]'):
            # Without the main container there is nothing to scrape. Bail out
            # explicitly instead of failing on an unassigned local later on.
            self.logger.warning('No ad content found on %s', response.url)
            return

        item = {
            'name': ''.join(response.xpath("//*[@id='viewad-title']/text()").extract()).strip(),
            'price': ''.join(response.xpath("//*[@id='viewad-price']/text()").extract()).strip(),
        }
        # Pre-fill every known field so that all items share the same keys,
        # no matter how many attributes a particular ad lists.
        item.update(dict.fromkeys(self.DETAIL_LABELS.values(), ''))

        # Walk the <dt> labels that are actually present (searched across the
        # whole document, as before) instead of assuming a fixed count of 21.
        for term in response.xpath('//dl/dt'):
            label = ''.join(term.xpath('./text()').extract()).strip()
            field = self.DETAIL_LABELS.get(label)
            if field is None:
                # Unknown or empty label: skip it rather than losing the
                # whole ad to a KeyError.
                continue
            # Pair the label with its own <dd>, not with the n-th <dd> of the
            # document, so several <dl> blocks cannot get mixed up.
            value_node = term.xpath('./following-sibling::dd[1]')
            item[field] = (
                ''.join(value_node.xpath('./span/text()').extract()).strip()
                + ''.join(value_node.xpath('./span/a/text()').extract()).strip()
            )

        yield item

In [3]:
# Scrapy appends to an existing local feed file, so a leftover file from an
# earlier run would leave 'used_cars.json' holding two JSON documents that
# pd.read_json cannot parse. Start from a clean file instead.
if os.path.exists('used_cars.json'):
    os.remove('used_cars.json')

# perf_counter is monotonic, so the elapsed time is not skewed by clock changes.
start = time.perf_counter()
# Instantiate our crawler.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'used_cars.json',
    'ROBOTSTXT_OBEY': True,
    # NOTE: with logging disabled, errors raised inside the spider callbacks
    # are not shown in the notebook output.
    'LOG_ENABLED': False,
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True
})

# Start the crawler with our spider.
process.crawl(eBaySpider)
# Blocks until the crawl has finished. The Twisted reactor cannot be
# restarted, so re-running this cell requires a kernel restart.
process.start()
print(time.perf_counter() - start)

52.41045880317688


In [4]:
# Load the items written by the crawl (feed file 'used_cars.json') into a DataFrame.
used_cars = pd.read_json('used_cars.json')

In [5]:
# Display the scraped listings to see which fields were captured
# inconsistently, so that the scraping can be corrected.
used_cars

Unnamed: 0,brand,color,creationDate,domesticEquipment,emissionThing,environmentalThing,fuelType,gearbox,interiorMaterial,kilometer,...,name,notRepairedDamage,numberOfDoors,place,powerPS,price,referenceNumber,someYearThing,vehicleType,yearOfRegistration
0,Porsche,Weiß,,,Euro6,4 (Grün),Benzin,Automatik,Vollleder,35.110,...,Porsche 991 Carrera 4S Cabriolet SportChrono S...,Nein,2/3,",",420,Preis: 119.500 €,,2020,Cabrio,2017
1,Volkswagen,Blau,,,Euro2,4 (Grün),Benzin,Manuell,Stoff,258.305,...,Volkswagen Golf 1.6,Nein,4/5,,75,Preis: 1.000 €,,2020,Limousine,1997
2,Mini,Silber,,,Euro4,4 (Grün),Benzin,Manuell,Vollleder,150.000,...,Mini Cabrio Sonderausstattung mit TÜV Leder s...,Nein,2/3,,90,Preis: 4.450 € VB,,2020,Cabrio,2006
3,Volkswagen,Braun,,,Euro5,4 (Grün),Diesel,Manuell,Alcantara,105.000,...,Volkswagen Passat CC 2.0 TDI BlueMotion Techno...,Nein,4/5,,140,Preis: 12.490 €,,2020,Coupé,2011
4,Smart,Orange,,,Euro6,4 (Grün),Benzin,Manuell,Stoff,13.364,...,Smart fortwo Coupe Passion/Cool&Audio/JBL/LED/...,Nein,2/3,",",71,Preis: 10.980 €,,2021,Kleinwagen,2018
5,Smart,Blau,,,Euro6,4 (Grün),Benzin,Automatik,Vollleder,9.154,...,Smart fortwo Coupe Prime/twinamic/Cool&Audio/L...,Nein,2/3,",",90,Preis: 12.960 €,,2021,Kleinwagen,2018
6,BMW,Schwarz,,,Euro4,4 (Grün),Benzin,Manuell,Teilleder,178.000,...,BMW 525xi M Paket XENON NAVI,Nein,4/5,,218,Preis: 10.000 €,,2019,Limousine,2007
7,Nissan,Braun,,,Euro4,4 (Grün),Benzin,Manuell,Stoff,58.900,...,Nissan Micra 1.2 edition 25 Jahre 1 Jahr Garan...,Nein,4/5,",",65,Preis: 4.780 €,,2021,Limousine,2009
8,Toyota,Grau,,,Euro4,4 (Grün),Benzin,Manuell,Stoff,182.000,...,"Toyota Yaris Basis, Neue Ganzjahresreifen, Tüv...",Nein,2/3,,69,Preis: 2.001 € VB,,2020,Kleinwagen,2009
9,Opel,Grau,,,Euro4,4 (Grün),Benzin,Automatik,Stoff,149.568,...,Opel Meriva 1.8 16V Cosmo mit Gasanlage,Nein,4/5,,125,Preis: 3.100 €,,2019,Van/Bus,2004
