In [None]:
#hide
#default_exp ingest
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#export
import re
import datetime

import sc2reader
import pymongo
import sc2reaper.init_ingest as ings

# Python std_lib dependencies 
from typing import Generator, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path
from datetime import datetime

# Import dependencies from this package
from sc2trainingg.load_config import *

# The `ingest` Module

> This module contains the tools needed to extract game-play data from a set of StarCraft II replays. This process entails extracting various features that describe each player's performance during a 1v1 online match and organising all the information in a set of collections that compose a MongoDB database.



`ingest` uses three modules to carry out its principal functions. 

1. It uses [MongoDB](https://www.mongodb.com/) and [pymongo](https://pymongo.readthedocs.io/en/stable/) to configure a document-based database where it stores the information of the replays. 
2. It uses [sc2reaper (Gonzalez Duque & Arbelaez Echeverri, 2019)](https://github.com/miguelgondu/sc2reaper) to extract and load various default collections of information to the database (i.e. actions, players, replays, scores, and states). 
3. It uses [sc2reader (Leung, 2020)](https://pypi.org/project/sc2reader/#history) to extract information about the players' username so that the replays can be grouped by players later. In this case, `ingest` stores this extra information in an additional collection (i.e. replays_info) which extends the database where the previous collections exist. 

This module organises the gameplay data into the following collections: 
* actions
* players
* replays = summary of the replays that have been processed.
* states
* scores
* replays_info = summary of the replays that have been processed that includes usernames.

This database is built using two functions defined in this module; `build_replay_info` and `build_reaper_collections` (see below).

## Requirements for Proper Functioning
Before anything else, this module looks for the local `config.json` file and loads its information into a `Config_settings` object using the `load_configurations` function from the `load_config` module.

With this information at hand, the module configures the MongoDB client it needs to store the replay data.

In [None]:
#exports
# Definition of the initial data needed to function.
# NOTE(review): the project directory below is a hard-coded absolute path to one
# developer's machine; it will likely not exist anywhere else. Consider making
# it configurable (e.g. environment variable or a path relative to the package).
CONFIG_PATH = Path("/Users/david/Documents/phdcode/sc2trainingg") / "config.json"
CONFIG = load_configurations(CONFIG_PATH)

# Define the client and database used to work with MongoDB.
# CONFIG.port_address and CONFIG.port_number are passed as MongoClient's first
# two positional arguments; CONFIG.db_name selects the database.
DB_Client = pymongo.MongoClient(CONFIG.port_address, CONFIG.port_number)
DB = DB_Client[CONFIG.db_name]
# Collection used as the default target of not_replay_duplicate and
# build_replay_info.
replays_info = DB['replays_info']


# Define the default replay source, read through sc2reader.
# Fail at import time if the configured replay path does not exist.
assert Path(CONFIG.replay_path).exists(), "Invalid replay path"
# Used as the default `rp_gen` argument of build_replay_info.
# NOTE(review): this appears to be a one-shot generator created once at import,
# so it can only be iterated a single time — confirm.
REPLAY_GEN = sc2reader.load_replays(CONFIG.replay_path)


### Information Management
Following a functional programming approach in the development of this project, two frozen dataclasses are defined to ensure immutability while the information is being processed: `Replay_data` and `Player_data`. 


In [None]:
#export
@dataclass(frozen=True)
class Player_data:
    """
    Frozen (read-only) record that summarises one player's attributes 
    in a single match.

    *Attributes:*
        - name (str):
            User name of the player.
        - number (int):
            Player number within the match; a 1v1 match has a Player 1 
            and a Player 2.
        - race (str):
            Game race (Protoss, Terran or Zerg) the player used in this match.
        - result (str):
            Outcome of the match for this player: 'Win' for the winner, 
            'Loss' for the loser.
    """
    name: str
    number: int
    race: str
    result: str

In [None]:
show_doc(Player_data, title_level=4)

<h4 id="Player_data" class="doc_header"><code>class</code> <code>Player_data</code><a href="" class="source_link" style="float:right">[source]</a></h4>

> <code>Player_data</code>(**`name`**:`str`, **`number`**:`int`, **`race`**:`str`, **`result`**:`str`)

Immutable dataclass that contains Information that describes a 
player's attributes in a match.

*Attributes:*
    - name (str): 
        The player's user name.
    - number (int): 
        Player number in the match. In a 1v1, match there would be a Player 1 and 2.
    - race (str): 
        The game race (Protoss, Terran, Zerg) with which the player played this match.
    - result (str): 
        Variable descriving whether the player was the matches winner ('Win') or loser ('Loss').

In [None]:
#export
@dataclass(frozen=True)
class Replay_data:
    """
    Frozen (read-only) record that summarises the main attributes of 
    one match.

    *Attributes:*
        - replay_name (str):
            Path of the replay file as it was located when uploaded.
        - replay_id (str):
            File name of the SC2Replay file.
        - date_time (datetime):
            Date and time at which the match was played and recorded.
        - match_type (str):
            Team configuration of the match (e.g. '1v1', '2v2').
        - game_release (str):
            Version and patch number of the game release in which the 
            match was played.
        - map_name (str):
            Name of the map the match was played on.
        - category (str):
            Whether the match was a 'Ladder' match or another type of match.
        - winner (str):
            User name of the match's winner.
        - players (Tuple[Player_data, ...]):
            One Player_data summary per player in the match (see the 
            Player_data class).
    """
    replay_name: str
    replay_id: str
    date_time: datetime
    match_type: str
    game_release: str
    map_name: str
    category: str
    winner: str
    players: Tuple[Player_data, ...]

In [None]:
show_doc(Replay_data, title_level=4)

<h4 id="Replay_data" class="doc_header"><code>class</code> <code>Replay_data</code><a href="" class="source_link" style="float:right">[source]</a></h4>

> <code>Replay_data</code>(**`replay_name`**:`str`, **`replay_id`**:`str`, **`date_time`**:`datetime`, **`match_type`**:`str`, **`game_release`**:`str`, **`map_name`**:`str`, **`category`**:`str`, **`winner`**:`str`, **`players`**:`Tuple`\[[`Player_data`](/sc2trainingg/ingest.html#Player_data), `Ellipsis`\])

Immutable dataclass that contains information summarising a 
match's main attributes.

*Attributes:*
    - replay_name (str):
        Absolute path of where the Replay was stored when uploaded.
    - replay_id (str):
        Name of the SC2Replay file.
    - date_time (datetime):
        Date and time when the match was played and recorded.
    - match_type (str):
        Descrives the team configuration of the match (eg '1v1', '2v2').
    - game_release (str):
        Version and patch number for the game release where the match
        played.
    - map_name (str):
        Name of the match's map.
    - category (str):
        Descrives if the match was 'Ladder' or other type of match.
    - winner (str):
        User name of the match's winner
    - players (Tuple[Player_data, ...]):
        Summarised information of the match's players (see Player_data 
        class).



## Module's Functions

As explained above, the module uses the functions `build_replay_info` and `build_reaper_collections` to construct a document-based database. Of these functions, the former is a custom function, inspired by `sc2reaper`'s information extraction and organisation processes, that selects summary information from the replay file using `sc2reader` and organises it inside the `replays_info` collection. The latter uses `sc2reaper` to create the rest of the data collections in the database.

### Auxiliary Functions
Apart from `build_replay_info` and `build_reaper_collections`, this module defines a number of auxiliary functions that are used within those primary functions. 

Here is a brief summary of those functions. 

In [None]:
#export
"""
These auxiliar functions exists to assist the module's main functions.
"""
# Functions that format data according to the dataclasses
def extend_player_info(participant: sc2reader.objects.Participant) -> Player_data:
    '''
    Summarise a sc2reader Participant object as an immutable 
    Player_data instance.

    *Args:*
        - participant (sc2reader.objects.Participant):
            Participant object holding the data of one player in a replay.

    *Returns:*
        - Player_data:
            The participant's name, player id, played race and match result.
    '''
    return Player_data(
        name=participant.name,
        number=participant.pid,
        race=participant.play_race,
        result=participant.result,
    )

In [None]:
show_doc(extend_player_info, title_level=4)

<h4 id="extend_player_info" class="doc_header"><code>extend_player_info</code><a href="__main__.py#L6" class="source_link" style="float:right">[source]</a></h4>

> <code>extend_player_info</code>(**`participant`**:`Participant`)

Extracts the players' data from a Participant Object, into a 
Player_data instance.

*Args:*
    - participant (sc2reader.objects.Participant):
        Participant object containing all data related to a SC2Player

*Returns:*
    - Player_data:
        Summary of a player's attributes on a match.

In [None]:
#export
def get_replay_info(replay: sc2reader.resources.Replay) -> Replay_data:
    '''
    Build a Replay_data dataclass instance with a replay's general 
    information.

    *Args:*
        - replay (sc2reader.resources.Replay):
            Replay object to be analysed.

    *Returns:*
        - Replay_data
            Summary of a match's main descriptive information. 

    *Raises:*
        - AttributeError:
            If `replay.winner` has no `players` attribute.
            NOTE(review): presumably sc2reader leaves `winner` as None when
            no winner was recorded — confirm and decide how to store it.
    '''
    # Take the last path component, splitting on both '\\' and '/'. The
    # previous pattern only excluded backslashes, so for '/'-separated paths
    # the "file name" was the whole path, and a file whose extension was not
    # spelled exactly '.SC2Replay' made the lookup fail on a None match.
    file_name = re.split(r'[\\/]', replay.filename)[-1]

    # Collect information about the match in a document.
    return Replay_data(
        replay_name=replay.filename,
        replay_id=file_name,
        date_time=replay.start_time,
        match_type=replay.type,
        game_release=replay.release_string,
        map_name=replay.map_name,
        category=replay.category,
        # The winner is recorded as the name of the winning team's first player.
        winner=replay.winner.players[0].name,
        players=tuple(extend_player_info(player) for player in replay.players),
    )

In [None]:
show_doc(get_replay_info, title_level=4)

<h4 id="get_replay_info" class="doc_header"><code>get_replay_info</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_replay_info</code>(**`replay`**:`Replay`)

Replay_data dataclass instance with a replay's general 
information.

*Args:*
    - replay (sc2reader.resources.Replay):
        Replay object to be analysed.

*Returns:*
    - Replay_data
        Summary of a matches main descriptive information. 

In [None]:
#export
# Functions that build the Collections within the database
def not_replay_duplicate(replay: sc2reader.resources.Replay, collection_: pymongo.collection.Collection = replays_info) -> bool:
    '''
    Check whether a replay is still missing from a collection, and 
    print a message reporting the outcome.

    *Args:*
        - replay (sc2reader.resources.Replay):
            The replay being checked.
        - collection_ (pymongo.collection.Collection):
            The collection in which the existence check is performed. 
            Documents are matched on their 'replay_name' field against 
            the replay's file name.

    *Returns:*
        - bool: 
            True if the replay is not in the collection, False if it is.
    '''
    # limit=1: one matching document is enough to know the replay is stored.
    matches = collection_.count_documents({'replay_name': replay.filename}, limit=1)

    if matches:
        print(f'{replay.filename} already exists in the replay_info collection.')
        return False

    print(f'New replay found: {Path(replay.filename).name} \n adding to replay_info collection.')
    return True

In [None]:
show_doc(not_replay_duplicate, title_level=4)

<h4 id="not_replay_duplicate" class="doc_header"><code>not_replay_duplicate</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>not_replay_duplicate</code>(**`replay`**:`Replay`, **`collection_`**:`Collection`=*`Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'TEST_library'), 'replays_info.func')`*)

Verify that the replay does not exist in a collection.

*Args:*
    - replay (sc2reader.resources.Replay):
        The replay being cheked
    - collection_ (pymongo.collection.Collection):
        The collection where the existance check is being performed.

*Returns:*
    - bool: 
        True if the replay is not in the collection, False if it is.

In [None]:
#export
def get_replays_data_set(rp_gen: Generator, collection_: pymongo.collection.Collection) -> Generator:
    '''
    Build a lazy generator of Replay_data instances describing the 
    replays produced by a sc2reader replay generator that are not 
    yet stored in the given collection.

    *Args:*
        - rp_gen (Generator): 
            a sc2reader.resources.Replay generator that yields
            the replays found in the CONFIG.replay_path.
        - collection_ (pymongo.collection.Collection): 
            the database collection that could contain the replays

    *Returns:*
        - Generator: 
            Yields one Replay_data instance per replay that is not 
            already in the collection.
    '''
    # Stage 1: keep only the replays that are not in the collection yet.
    unseen_replays = (
        candidate for candidate in rp_gen
        if not_replay_duplicate(candidate, collection_)
    )
    # Stage 2: summarise each remaining replay. Both stages are lazy, so each
    # replay is checked and summarised only when the caller pulls it.
    return (get_replay_info(unseen) for unseen in unseen_replays)

In [None]:
show_doc(get_replays_data_set, title_level=4)

<h4 id="get_replays_data_set" class="doc_header"><code>get_replays_data_set</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_replays_data_set</code>(**`rp_gen`**:`Generator`, **`collection_`**:`Collection`)

Build a generator that can yield a group of Replay_data 
instances that represent a the descriptive information of a set
of replays that are found by an sc2reader replay generator
and that have not been already added to a specific collection.

*Args:*
    - rp_gen (Generator): 
        a sc2reader.resources.Replay generator that yields
        the replays found in the CONFIG.replay_path.
    - collection_ (pymongo.collection.Collection): 
        the database collection that could contain the replays

Returns:
    - Generator: 
        Yields Replay_data instances.

### Main Functions `build_replay_info` and `build_reaper_collections`

In [None]:
#export
def build_replay_info(
    rp_gen: Generator = REPLAY_GEN,
    db_collection:pymongo.collection.Collection = replays_info
    ) -> bool:
    '''
    Triggers the search for new replays at CONFIG.replay_path. Adds the 
    descriptive information of the replays to a data collection within 
    a MongoDB database, if they are not in the collection already.

    *Args:*
        - rp_gen (Generator = REPLAY_GEN):
            sc2reader.resources.Replay generator that yields the replays found 
            in the CONFIG.replay_path. When left at its default (or passed as 
            None) a fresh generator is created on every call, so repeated 
            calls rescan CONFIG.replay_path.
        - db_collection (pymongo.collection.Collection = replays_info): 
            the collection where the function adds the new documents.
            
    *Returns:*
        - bool:
            True if new replays were found and added to the collection, 
            False otherwise.
    '''
    # REPLAY_GEN is created once at import and can be iterated only once.
    # Reusing it as the default made every call after the first see an
    # exhausted generator and wrongly report that there were no new replays.
    if rp_gen is None or rp_gen is REPLAY_GEN:
        rp_gen = sc2reader.load_replays(CONFIG.replay_path)

    replays_data_set = [
        asdict(replay_data)
        for replay_data in get_replays_data_set(rp_gen, db_collection)
        if replay_data is not None
    ]

    # insert_many rejects an empty list, so only insert when something is new.
    if not replays_data_set:
        print(f'No new replays at {CONFIG.replay_path}')
        return False

    db_collection.insert_many(replays_data_set)
    return True


In [None]:
show_doc(build_replay_info, title_level=4)

<h4 id="build_replay_info" class="doc_header"><code>build_replay_info</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>build_replay_info</code>(**`rp_gen`**:`Generator`=*`load_all`*, **`db_collection`**:`Collection`=*`Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'TEST_library'), 'replays_info.func')`*)

Triggers the search for new replays at CONFIG.replay_path. Adds the 
information description of the replays to the a data collection within 
a MongoDB data base, if they are not in the database already.

*Args:*
    - rp_gen (Generator = REPLAY_GEN):
        sc2reader.resources.Replay generator that yields the replays found 
        in the CONFIG.replay_path. 
    - db_collection (pymongo.collection.Collection = replays_info): 
        the database where the function adds the new documents.
        
*Returns:*
    - bool:
        True if new replays were found and added to the replay_info collection, False otherwise.

In [None]:
#export
def build_reaper_collections() -> bool:
    """
    Runs sc2reaper's ingest function over CONFIG.replay_path. Make sure
    you install the package from https://github.com/miguelgondu/sc2reaper
    in your environment before running.

    *Returns:*
        - bool:
            True if the ingest call completed, i.e. new replays were found 
            and added to the collections defined by sc2reaper; False if the 
            call raised a ValueError, which is taken to mean that no new 
            replays were found.

    *Raises:*
        - ImportError:
            If sc2reaper is not installed in the environment.
    """
    try:
        # NOTE(review): the second argument is presumably the number of
        # worker processes — confirm against sc2reaper.
        ings.ingest(CONFIG.replay_path, 4)
    except ValueError:
        # If ings.ingest raises a ValueError, assume reaper did not find new replays.
        print("No new .SC2Replays found by sc2reaper")
        return False
    except ImportError as missing_dependency:
        print("This program needs sc2reaper to be installed before running.")
        print("Check install instructions at https://github.com/miguelgondu/sc2reaper")
        raise missing_dependency
    else:
        return True

In [None]:
show_doc(build_reaper_collections, title_level=4)

<h4 id="build_reaper_collections" class="doc_header"><code>build_reaper_collections</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>build_reaper_collections</code>()

Calls the ingest function on the sc2reaper package. Make sure
you install the package from https://github.com/miguelgondu/sc2reaper
in your environment before running.

*Returns:*
    - bool:
        True if new replays were found and added to the multiple collections defined by sc2reaper, False otherwise.

*Raises:*
    - ImportError:
        If sc2reaper is not installed in the environment.

## Execution Example


In [None]:
test_eq(build_replay_info(), True)

New replay found: Jagannatha LE.SC2Replay 
 adding to replay_info collection.


In [None]:
#hide
import sys
# Keep only the program name in argv.
# NOTE(review): presumably this drops the Jupyter kernel's command-line
# arguments so that the sc2reaper ingest call in the next cell does not try
# to parse them — confirm.
sys.argv = sys.argv[:1]

In [None]:
test_eq(build_reaper_collections(), True)

Found 0 replays in the database already.
Processing replay Jagannatha LE.SC2Replay
last frame recorded: 13224
last frame recorded: 13224
Successfully filled all collections of replay Jagannatha LE.SC2Replay


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_load_config.ipynb.
Converted 02_ingest.ipynb.
Converted index.ipynb.
