This repository has been archived by the owner on Jun 22, 2024. It is now read-only.

Commit

Webcrawler
* initial crawler commit

* Added readme

* Bugfixes

* removed Junk

* Bugfix

* bugfixes, added player data, sample data

* [CODE] Duplicate entries removed

* [FEATURE] Added Batting and bowling crawlers to webcrawler (fixes #42,fixes #43)

Co-authored-by: Bastian Große <bastiangrosse@pop-os.localdomain>
Co-authored-by: scientes <scientes12@gmail.com>
Co-authored-by: Royston E Tauro <54945757+lucasace@users.noreply.github.com>
4 people committed Dec 17, 2020
1 parent 97f6e1a commit 6272197
Showing 15 changed files with 49,537 additions and 1 deletion.
131 changes: 130 additions & 1 deletion .gitignore
@@ -1 +1,130 @@
__pycache__/*.pyc
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.vscode/settings.json
8 changes: 8 additions & 0 deletions crawler/README.md
@@ -0,0 +1,8 @@
# Installation:

`pip3 install scrapy python-dateutil`

# Usage:

`cd crawler/cricketcrawler`

`scrapy crawl howstat`
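The spider can also be launched from Python instead of the CLI — a minimal sketch, assuming it is run from inside crawler/cricketcrawler so Scrapy can locate the project settings, and using the howstat spider name from the command above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("howstat")  # spider name, as in `scrapy crawl howstat`
process.start()           # blocks until the crawl finishes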
10 changes: 10 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/__init__.py
@@ -0,0 +1,10 @@
import os

# Create the output directory tree and empty CSV files the first time the
# crawler package is imported.
if not os.path.exists("../../data_crawler"):
    os.makedirs("../../data_crawler/ODI")
    os.mkdir("../../data_crawler/T20")
    os.mkdir("../../data_crawler/TEST")
    open("../../data_crawler/id_names.csv", "w+").close()
    open("../../data_crawler/ODI/match_ids.csv", "w+").close()
    open("../../data_crawler/T20/match_ids.csv", "w+").close()
    open("../../data_crawler/TEST/match_ids.csv", "w+").close()
8 changes: 8 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/csvexporter.py
@@ -0,0 +1,8 @@
from scrapy.exporters import CsvItemExporter


class CsvItemExporter_M(CsvItemExporter):
    """
    Works around a Scrapy bug by closing the underlying stream
    when exporting finishes, so the CSV output is flushed to disk.
    """

    def finish_exporting(self):
        self.stream.close()
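A minimal usage sketch (not part of this commit) showing the subclass as a drop-in replacement for CsvItemExporter; the import path is inferred from the file location above and the sample item values are hypothetical:

from cricketcrawler.csvexporter import CsvItemExporter_M

f = open("sample.csv", "wb")
exporter = CsvItemExporter_M(f)
exporter.start_exporting()
exporter.export_item({"name": "sample player", "score": "42"})
exporter.finish_exporting()  # the override closes (and flushes) the stream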
72 changes: 72 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/fantasy_leagues.py
@@ -0,0 +1,72 @@
"""
This modules defines all the fantasy leagues that
the project supports
"""

class Scorer:
"""
Scorer class to use different formats of the game
:py:class `t20`: A T20 dictionary which contains scoring metrics
:py:class `odi`: A ODI dictionary which contains scoring metrics
:py:class `test`: A TEST dictionary which contains scoring metrics
:param scoring_dict: A dictionary containing scoring metrics.
Each value represents a list where index 0 represents `T20`,
1 represents `ODI`, 2 represents `TEST`
"""

def __init__(self, scoring_dict):

self.t20 = {key: scoring_dict[key][0] for key in scoring_dict}
self.odi = {key: scoring_dict[key][1] for key in scoring_dict}
self.test = {key: scoring_dict[key][2] for key in scoring_dict}

def get_score(self, stats, playing_format):
"""
Function which sums the score for that platform
depending on the format of the game
:param playing_format: One of `ODI`,`T20`,`TEST`
"""
type_map = {"ODI": self.odi, "T20": self.t20, "TEST": self.test}
return sum(type_map[playing_format][key] * stats[key] for key in stats)


class Dream11:
"""Dream11 League
Supported platforms:
* ODI
* T20
* TEST
"""

name = "Dream11"

batting_dict = Scorer(
{
"runs": [1, 1, 1],
"boundaries": [1, 1, 1],
"sixes": [2, 2, 2],
"50": [8, 4, 4],
"100": [16, 8, 8],
"duck": [-2, -3, -4],
}
)

bowling_dict = Scorer(
{
"wicket": [25, 25, 16],
"4-wicket-haul": [8, 4, 4],
"5-wicket-haul": [16, 8, 8],
"Maiden": [4, 8, 0],
}
)

wk_dict = Scorer(
{
"Catch": [8, 8, 8],
"Stump": [12, 12, 12],
}
)
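For reference, a short sketch (not part of the commit) of how these scoring tables are meant to be used; the innings figures are made up:

# Hypothetical innings: 75 runs, 8 fours, 2 sixes, one fifty.
batting_stats = {"runs": 75, "boundaries": 8, "sixes": 2, "50": 1, "100": 0, "duck": 0}
print(Dream11.batting_dict.get_score(batting_stats, "ODI"))
# 75*1 + 8*1 + 2*2 + 1*4 = 91 points under the ODI weights above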
60 changes: 60 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/items.py
@@ -0,0 +1,60 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Baseitem(scrapy.Item):
    file = ""
    folder = scrapy.Field()
    unique = []

    def get_key(self):
        # Build a deduplication key from the target file name and the
        # fields listed in `unique`.
        d = dict(self)
        key = self.file
        key += "".join([d[i] for i in self.unique])
        return key


class MatchidItem(Baseitem):
    file = "match_ids"
    name = scrapy.Field()
    matchid = scrapy.Field()
    date = scrapy.Field()
    unique = ["name", "matchid"]


class PlayerItem(Baseitem):
    file = "id_names"
    longname = scrapy.Field()
    name = scrapy.Field()
    gametype = scrapy.Field()
    retired = scrapy.Field()
    unique = ["name", "gametype"]


class BowlingItem(Baseitem):
    file = "bowling"
    matchid = scrapy.Field()
    overs = scrapy.Field()
    score = scrapy.Field()
    economy = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()


class BattingItem(Baseitem):
    file = "batting"
    matchid = scrapy.Field()
    score = scrapy.Field()
    strike_rate = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()


class wicketkeepingItem(Baseitem):
    file = "wicketkeeping"
    matchid = scrapy.Field()
    score = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()
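A small illustration (not from the commit) of how `get_key` can serve as a deduplication key, assuming the `return key` added above and using made-up field values:

item = MatchidItem(name="sample player", matchid="1234", date="17/12/2020", folder="ODI")
print(item.get_key())  # -> "match_idssample player1234"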
37 changes: 37 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/pipelines.py
@@ -0,0 +1,37 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class CricketcrawlerPipeline:
    def open_spider(self, spider):
        # One CSV exporter (and its file handle) per output file.
        self.name_to_exporter = {}
        self.seen = list()

    def close_spider(self, spider):
        for exporter, fp in self.name_to_exporter.values():
            exporter.finish_exporting()
            fp.close()

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        # `folder` only selects the output directory; drop it before export.
        del item["folder"]
        item2 = dict(item)
        exporter.export_item(item2)
        return item

    def _exporter_for_item(self, item):
        name = f'../../data_crawler/{item["folder"]}/{item.file}.csv'
        if name not in self.name_to_exporter:
            f = open(name, "wb")
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.name_to_exporter[name] = exporter, f
        return self.name_to_exporter[name][0]
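As the template comment notes, the pipeline still has to be enabled in the project's settings.py (not shown in this commit); a sketch, assuming the default project layout:

ITEM_PIPELINES = {
    "cricketcrawler.pipelines.CricketcrawlerPipeline": 300,
}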