In [1]:
# Building a more efficient scrapy scraper

In [2]:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider
from twisted.internet import defer

class MySpider(Spider):
    name = 'my_spider'
    custom_settings = {'LOG_LEVEL': 'INFO'}

    def __init__(self, summoner_name, region, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [f"https://u.gg/lol/profile/{region}/{summoner_name}/champion-stats"]
        self.data = {}
        self.columns = ['Rank', 'Champion', 'Win Rate', 'Wins/Loses', 'Unnamed', 'Kills', 'Deaths', 'Assists', 'LP',
                        'Max Kills', 'Max Deaths', 'CS', 'Damage', 'Gold']

    def parse(self, response):
        try:
            row = {}
            selectors = [
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(1) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:nth-child(2)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > span:nth-child(3)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(3)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(5)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(5) > span:nth-child(1) > span:nth-child(2)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(6) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(7) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(8) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(9) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(10) > span:nth-child(1)::text",
            ]
            for col, selector in zip(self.columns, selectors):
                item = response.css(selector).get()
                row[col] = item.strip() if item else 'N/A'
            self.data.update(row)

        except Exception as e:
            self.log(f"Error: {e}")

    def closed(self, reason):
        reactor.stop()

def run_spider(summoner_name, region):
    configure_logging({'LOG_LEVEL': 'INFO'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider, summoner_name=summoner_name, region=region)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    print("Runner crawlers:", runner.crawlers)  # Debugging line


    # At this point, the spider instance is stored in the runner's crawlers attribute
    # Retrieve the spider data from there
    spider = list(runner.crawlers)[0].spider  # Convert set to list and then get the first element
    return spider.data

if __name__ == "__main__":
    summoner_name = "leaguify"
    region = "euw1"
    result = run_spider(summoner_name, region)
    print(result)  # This is just for testing; remove it when you integrate this code elsewhere

2023-09-25 12:22:40 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-09-25 12:22:40 [scrapy.extensions.telnet] INFO: Telnet Password: 85cec191b1b9035b
2023-09-25 12:22:41 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-09-25 12:22:41 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
2023-09-25 12:22:41 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloade

IndexError: list index out of range

In [1]:
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider

class MySpider(Spider):
    name = 'my_spider'
    custom_settings = {'LOG_LEVEL': 'INFO'}

    def __init__(self, summoner_name, region, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [f"https://u.gg/lol/profile/{region}/{summoner_name}/champion-stats"]
        self.data = {}
        self.columns = ['Rank', 'Champion', 'Win Rate', 'Wins/Loses', 'Unnamed', 'Kills', 'Deaths', 'Assists', 'LP',
                        'Max Kills', 'Max Deaths', 'CS', 'Damage', 'Gold']

    def parse(self, response):
        try:
            row = {}
            selectors = [
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(1) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:nth-child(2)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > span:nth-child(3)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(3)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2) > strong:nth-child(5)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(5) > span:nth-child(1) > span:nth-child(2)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(6) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(7) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(8) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(9) > span:nth-child(1)::text",
                "div.rt-tr-group:nth-child(1) > div:nth-child(1) > div:nth-child(10) > span:nth-child(1)::text",
            ]
            for col, selector in zip(self.columns, selectors):
                item = response.css(selector).get()
                row[col] = item.strip() if item else 'N/A'
            self.data.update(row)
        except Exception as e:
            self.log(f"Error: {e}")

    def closed(self, reason):
        reactor.stop()

@defer.inlineCallbacks
def run_spider(summoner_name, region):
    runner = CrawlerRunner()
    yield runner.crawl(MySpider, summoner_name=summoner_name, region=region)
    
    # Debugging line to check the contents of runner.crawlers
    print("Runner crawlers:", runner.crawlers)
    
    if runner.crawlers:
        spider = list(runner.crawlers)[0].spider  # Convert set to list and then get the first element
        print(spider.data)  # This is just for testing; remove it when you integrate this code elsewhere
    else:
        print("No crawlers were run.")
    reactor.stop()

if __name__ == "__main__":
    configure_logging({'LOG_LEVEL': 'INFO'})
    summoner_name = "leaguify"
    region = "euw1"
    
    run_spider(summoner_name, region)
    reactor.run()

2023-09-25 12:28:09 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-09-25 12:28:09 [scrapy.extensions.telnet] INFO: Telnet Password: 8de3ade976199422
2023-09-25 12:28:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-09-25 12:28:09 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
2023-09-25 12:28:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloade

Runner crawlers: set()
No crawlers were run.
