This repository has been archived by the owner on Jun 22, 2024. It is now read-only.

Commit

Webcrawler
* initial crawler commit

* Added readme

* Bugfixes

* removed Junk

* Bugfix

* bugfixes, added player data, sample data

* [CODE] Duplicate entries removed

* [FEATURE] Added Batting and bowling crawlers to webcrawler (fixes #42,fixes #43)

Co-authored-by: Bastian Große <bastiangrosse@pop-os.localdomain>
Co-authored-by: scientes <scientes12@gmail.com>
Co-authored-by: Royston E Tauro <54945757+lucasace@users.noreply.github.com>
4 people committed Dec 17, 2020
1 parent 97f6e1a commit 6272197
Showing 15 changed files with 49,537 additions and 1 deletion.
131 changes: 130 additions & 1 deletion .gitignore
@@ -1 +1,130 @@
__pycache__/*.pyc
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.vscode/settings.json
8 changes: 8 additions & 0 deletions crawler/README.md
@@ -0,0 +1,8 @@
# Installation:

`pip3 install scrapy python-dateutil`

# Usage:

`cd crawler/cricketcrawler`

`scrapy crawl howstat`
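The spider can also be launched from Python instead of the CLI — a minimal sketch, assuming it is run from inside crawler/cricketcrawler so Scrapy can locate the project settings, and using the howstat spider name from the command above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("howstat")  # spider name, as in `scrapy crawl howstat`
process.start()           # blocks until the crawl finishes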
10 changes: 10 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/__init__.py
@@ -0,0 +1,10 @@
import os

# Create the output directory tree and empty CSV files the first time the
# crawler package is imported.
if not os.path.exists("../../data_crawler"):
    os.makedirs("../../data_crawler/ODI")
    os.mkdir("../../data_crawler/T20")
    os.mkdir("../../data_crawler/TEST")
    open("../../data_crawler/id_names.csv", "w+").close()
    open("../../data_crawler/ODI/match_ids.csv", "w+").close()
    open("../../data_crawler/T20/match_ids.csv", "w+").close()
    open("../../data_crawler/TEST/match_ids.csv", "w+").close()
8 changes: 8 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/csvexporter.py
@@ -0,0 +1,8 @@
from scrapy.exporters import CsvItemExporter


class CsvItemExporter_M(CsvItemExporter):
    """
    Works around a Scrapy bug by closing the underlying stream
    when exporting finishes, so the CSV output is flushed to disk.
    """

    def finish_exporting(self):
        self.stream.close()
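A minimal usage sketch (not part of this commit) showing the subclass as a drop-in replacement for CsvItemExporter; the import path is inferred from the file location above and the sample item values are hypothetical:

from cricketcrawler.csvexporter import CsvItemExporter_M

f = open("sample.csv", "wb")
exporter = CsvItemExporter_M(f)
exporter.start_exporting()
exporter.export_item({"name": "sample player", "score": "42"})
exporter.finish_exporting()  # the override closes (and flushes) the stream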
72 changes: 72 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/fantasy_leagues.py
@@ -0,0 +1,72 @@
"""
This modules defines all the fantasy leagues that
the project supports
"""

class Scorer:
"""
Scorer class to use different formats of the game
:py:class `t20`: A T20 dictionary which contains scoring metrics
:py:class `odi`: A ODI dictionary which contains scoring metrics
:py:class `test`: A TEST dictionary which contains scoring metrics
:param scoring_dict: A dictionary containing scoring metrics.
Each value represents a list where index 0 represents `T20`,
1 represents `ODI`, 2 represents `TEST`
"""

def __init__(self, scoring_dict):

self.t20 = {key: scoring_dict[key][0] for key in scoring_dict}
self.odi = {key: scoring_dict[key][1] for key in scoring_dict}
self.test = {key: scoring_dict[key][2] for key in scoring_dict}

def get_score(self, stats, playing_format):
"""
Function which sums the score for that platform
depending on the format of the game
:param playing_format: One of `ODI`,`T20`,`TEST`
"""
type_map = {"ODI": self.odi, "T20": self.t20, "TEST": self.test}
return sum(type_map[playing_format][key] * stats[key] for key in stats)


class Dream11:
"""Dream11 League
Supported platforms:
* ODI
* T20
* TEST
"""

name = "Dream11"

batting_dict = Scorer(
{
"runs": [1, 1, 1],
"boundaries": [1, 1, 1],
"sixes": [2, 2, 2],
"50": [8, 4, 4],
"100": [16, 8, 8],
"duck": [-2, -3, -4],
}
)

bowling_dict = Scorer(
{
"wicket": [25, 25, 16],
"4-wicket-haul": [8, 4, 4],
"5-wicket-haul": [16, 8, 8],
"Maiden": [4, 8, 0],
}
)

wk_dict = Scorer(
{
"Catch": [8, 8, 8],
"Stump": [12, 12, 12],
}
)
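For reference, a short sketch (not part of the commit) of how these scoring tables are meant to be used; the innings figures are made up:

# Hypothetical innings: 75 runs, 8 fours, 2 sixes, one fifty.
batting_stats = {"runs": 75, "boundaries": 8, "sixes": 2, "50": 1, "100": 0, "duck": 0}
print(Dream11.batting_dict.get_score(batting_stats, "ODI"))
# 75*1 + 8*1 + 2*2 + 1*4 = 91 points under the ODI weights above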
60 changes: 60 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/items.py
@@ -0,0 +1,60 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Baseitem(scrapy.Item):
    file = ""
    folder = scrapy.Field()
    unique = []

    def get_key(self):
        # Build a deduplication key from the target file name and the
        # fields listed in `unique`.
        d = dict(self)
        key = self.file
        key += "".join([d[i] for i in self.unique])
        return key


class MatchidItem(Baseitem):
    file = "match_ids"
    name = scrapy.Field()
    matchid = scrapy.Field()
    date = scrapy.Field()
    unique = ["name", "matchid"]


class PlayerItem(Baseitem):
    file = "id_names"
    longname = scrapy.Field()
    name = scrapy.Field()
    gametype = scrapy.Field()
    retired = scrapy.Field()
    unique = ["name", "gametype"]


class BowlingItem(Baseitem):
    file = "bowling"
    matchid = scrapy.Field()
    overs = scrapy.Field()
    score = scrapy.Field()
    economy = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()


class BattingItem(Baseitem):
    file = "batting"
    matchid = scrapy.Field()
    score = scrapy.Field()
    strike_rate = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()


class wicketkeepingItem(Baseitem):
    file = "wicketkeeping"
    matchid = scrapy.Field()
    score = scrapy.Field()
    name = scrapy.Field()
    folder = scrapy.Field()
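A small illustration (not from the commit) of how `get_key` can serve as a deduplication key, assuming the `return key` added above and using made-up field values:

item = MatchidItem(name="sample player", matchid="1234", date="17/12/2020", folder="ODI")
print(item.get_key())  # -> "match_idssample player1234"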
37 changes: 37 additions & 0 deletions crawler/cricketcrawler/cricketcrawler/pipelines.py
@@ -0,0 +1,37 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class CricketcrawlerPipeline:
    def open_spider(self, spider):
        # One CSV exporter (and its file handle) per output file.
        self.name_to_exporter = {}
        self.seen = list()

    def close_spider(self, spider):
        for exporter, fp in self.name_to_exporter.values():
            exporter.finish_exporting()
            fp.close()

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        # `folder` only selects the output directory; drop it before export.
        del item["folder"]
        item2 = dict(item)
        exporter.export_item(item2)
        return item

    def _exporter_for_item(self, item):
        name = f'../../data_crawler/{item["folder"]}/{item.file}.csv'
        if name not in self.name_to_exporter:
            f = open(name, "wb")
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.name_to_exporter[name] = exporter, f
        return self.name_to_exporter[name][0]
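As the template comment notes, the pipeline still has to be enabled in the project's settings.py (not shown in this commit); a sketch, assuming the default project layout:

ITEM_PIPELINES = {
    "cricketcrawler.pipelines.CricketcrawlerPipeline": 300,
}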