In [None]:
# default_exp ingest.ingest

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#exporti

import re
import pymongo
import json
import sc2reader
import errno
import jsonschema
import os

import pandas as pd
import numpy as np

from typing import *
from pathlib import Path
from pprint import pprint
from jsonschema import validate
from dataclasses import dataclass, astuple, asdict, field

from sc2reader.engine.plugins import APMTracker

from sc_training.ingest import *
sc2reader.engine.register_plugin(APMTracker())
sc2reader.engine.register_plugin(CtrlGroupTracker())

In [None]:
#hide 

# Locate the test replays: prefer a `test_replays` folder in the current
# working directory, otherwise fall back to the one two levels up.
if Path('test_replays').exists():
    test_data_path = Path.cwd() / 'test_replays'
else:
    test_data_path = Path('../../test_replays')

# Sub-folder holding the batch of replays used by the examples below.
test_batch_path = test_data_path / "TestProfilerBatch"

# Fail early if the test data is not where the notebook expects it.
assert test_data_path.exists()
assert test_data_path.is_dir()

# 9 - The main ingest module 

## Introduction

This module defines the `inventory_replays` function following the design proposed in the Ingestion Sequence Diagram (see <<2 - Data Ingestion and Clustering Process>>). It exports these functions into the ingest module of the ingest sub-package.

### Exportable Members
- `inventory_replays`

#### Exportable Helper functions
- `load_configurations`
- `set_up_db`


## Inventorying and Storing the `replays` collection

To inventory the replays in a replay batch I can use the `get_replay_info` defined in the `summarise_rpl` module (see Section 1.1).

In the following code, I load a batch of replays and I print the summary of the first two replays using the `get_replay_info` function.

In [None]:
replay_batch = sc2reader.load_replays(str(test_batch_path))

for i, rpl in enumerate(replay_batch):
    # Build the summary once and reuse it for both prints instead of
    # extracting the same information from the replay twice.
    rpl_summary = get_replay_info(rpl)
    print(rpl_summary)
    print(type(rpl_summary))
    if i == 1:
        # Two replays are enough for the illustration.
        break

File path:                   c:\Users\david\Documents\phdcode\sc_training\test_replays\TestProfilerBatch\16-Bit LE (2).SC2Replay 
File name:                   16-Bit LE (2).SC2Replay 
Date (datetime.datetime):    2021-06-04 03:49:10 
Duration (seconds):          707 
Game type:                   1v1 
Game release:                5.0.7.84643 
Map:                         16-Bit LE 
Game category:               Private 
winner:                      1 
players:                     [(1, 'HDEspino', 'Terran', 'Win'), (2, 'A.I. 1 (Harder)', 'Zerg', 'Loss')] 

<class 'sc_training.ingest.summarise_rpl.Replay_data'>
File path:                   c:\Users\david\Documents\phdcode\sc_training\test_replays\TestProfilerBatch\16-Bit LE (3).SC2Replay 
File name:                   16-Bit LE (3).SC2Replay 
Date (datetime.datetime):    2021-06-08 02:04:37 
Duration (seconds):          384 
Game type:                   1v1 
Game release:                5.0.7.84643 
Map:                         16-Bit LE 
G

### Storing inventory in Database

I can store the `Replay_data` objects returned by `get_replay_info` in a document-based database. Storing this inventory in a database means that other processes could access it to navigate the information built by the ingest process.

`sc_training` uses a `config.json` file to set up a local MongoDB client; users should create this file and store it in the working project's data folder. Using this client and the information in the config file, the solution also creates the database. The following code loads this file and defines the loading procedure for when this module is imported. 

I will divide this loading process into three steps. 

#### 1 - Handle Config file
I locate and load the `config` file. This location process allows users to create a custom `config.json` in a data directory in their projects. That file can be used to customise the name of the database, the port address and number for the MongoDB client, and the location of the replays the user wants to process. Once the file is located, I also define a procedure to ensure it contains the necessary information to connect with the MongoDB client and access the appropriate database.

Apart from this option, the code also defines how the module can default to a `config` file stored in the library's data folder if the user fails to provide this file. This default file allows the system to attempt to function based on the assumption that the users are using a Windows computer and have a traditional installation of StarCraft II and MongoDB. Of course, this default set-up would not be expected to work in all cases, but it provides an option and a sample of the `config` file.

In [None]:
#exporti

# This section defines the helper functions I use in the 
# config file loading and checking procedure
# To check that the config file contains all necessary information for 
# the solution's proper working I use the following jsonschema   
Config_schema = {
    "type": "object",
    "properties": {
        "DB_NAME": {"type": "string"},
        "PORT_ADDRESS": {"type": "string"},
        # The port is stored in an `int` field and handed to the MongoDB
        # client, so fractional values are rejected at validation time.
        "PORT_NUMBER": {"type": "integer"},
        "REPLAY_PATH": {"type": "string"},
    },
    # `properties` alone only constrains keys that are present; listing the
    # keys as required makes a config file that omits any of them fail
    # validation instead of failing later with a KeyError.
    "required": ["DB_NAME", "PORT_ADDRESS", "PORT_NUMBER", "REPLAY_PATH"],
}


def validate_config_file(file: Dict[str, Any], schema: Dict[str, Any]) -> bool:
    """Check the parsed content of a config file against a JSON schema.

    *Args*
        - file: Dict[str, Any]
            The already-parsed JSON content of the config file (not the
            path to it); this is the instance handed to `jsonschema`.
        - schema: Dict[str, Any]
            JSON schema the content must conform to.

    *Returns*
        - bool
            Always `True` when validation succeeds.

    *Errors*
        - jsonschema.exceptions.ValidationError
            If the content does not conform to the schema.
        - jsonschema.exceptions.SchemaError
            If the schema itself is invalid.
    """
    try:
        validate(instance=file, schema=schema)
    except jsonschema.exceptions.ValidationError as err:
        print(err)
        print("config.json does not conform to the required specifications")
        # Re-raise the active exception so callers still see the failure.
        raise
    except jsonschema.exceptions.SchemaError as err:
        print(err)
        print("The Config_schema is invalid")
        raise

    return True

# Using the validate_config_file function I can check that the config 
# file exists and has the proper information for sc_training to work.
 
def open_config_file(config_file: Path) -> dict[str, Any]:
    """Load and validate the content of a config file.

    The file is opened once inside a `with` block (so the handle is always
    closed), parsed as JSON, and the parsed content is validated against
    `Config_schema` before being returned.

    *Args*
        - config_file: Path
            Location of the `config.json` file to load.

    *Returns*
        - dict[str, Any]
            The parsed content of the config file.

    *Errors*
        - FileNotFoundError
            If `config_file` does not exist. The exception carries the
            offending path in its `filename` attribute.
        - jsonschema.exceptions.ValidationError
            If the content does not conform to `Config_schema`.
    """
    try:
        if not config_file.exists():
            raise FileNotFoundError(errno.ENOENT,
                                    os.strerror(errno.ENOENT),
                                    str(config_file))

        with config_file.open('r') as cf:
            config_data = json.load(cf)

    except FileNotFoundError:
        print('config.json not found')
        raise

    # Validate the data that was just read rather than parsing the file a
    # second time.
    validate_config_file(config_data, Config_schema)
    return config_data


The `load_configurations` function uses various internal helper functions to locate, open, verify and load the information from a project's `config.json` file. It stores this information as a `Config_settings` object. 

In [None]:
#export
@dataclass
class Config_settings:
    """Container for the values read from the project's config file.

    *Attributes*
        - port_address: str
            Address of the MongoDB client the program connects to.
        - port_number: int
            Port number of the client at that address.
        - db_name: str
            Name of the project's database.
        - replay_path: str
            Path to the replays that must be analysed and stored in the
            database.
    """
    port_address: str
    port_number: int
    db_name: str
    replay_path: str

    def __str__(self):
        # One line per field: the label is left-aligned in 15 columns and
        # the value right-aligned in 40, in field-declaration order.
        labels = ("Port Address: ", "Port Number: ",
                  "DB Name: ", "Replays file: ")
        lines = []
        for label, value in zip(labels, astuple(self)):
            lines.append(f'{label:<15}{value:>40}\n')
        return ''.join(lines)


In [None]:
show_doc(Config_settings, title_level=4)

<h4 id="Config_settings" class="doc_header"><code>class</code> <code>Config_settings</code><a href="" class="source_link" style="float:right">[source]</a></h4>

> <code>Config_settings</code>(**`port_address`**:`str`, **`port_number`**:`int`, **`db_name`**:`str`, **`replay_path`**:`str`)

This type of object stores the data extracted from the config file.

*Attributes*
    - port_address: str
        Address of the MongoDB Client that the program will connect to.
    - port_number: int
        Port number of the client located in the address above
    - db_name: str
        Name of the project's data base
    - replay_path: str
        Path to the replays that must be analysed and stored in the 
        database

In [None]:
#export
def load_configurations() -> Config_settings:
    """Loads the project's configuration information.

    This function locates, verifies and extracts the project's configuration
    data. This data tells sc_training where to find the replays it needs to
    inventory and process, how to connect to the MongoDB client it will use
    to store this data in a database, and the name of the database it 
    should use.

    The file `data/config.json` under the current working directory is used
    when it exists; otherwise the function falls back to the
    `data/config.json` located three directory levels above this module's
    file.

    *Args*
        - None

    *Returns*
        - Config_settings

    *Errors*
        - FileNotFoundError
            If there is no valid config file
        - jsonschema.exceptions.ValidationError
            If the config file does not contain the necessary data or does
            not conform to the proper schema necessary to work.
    """
    project_config = Path.cwd() / 'data' / 'config.json'
    if project_config.exists():
        config_file = project_config
    else:
        # Walk up from this module's file with `parents` instead of joining
        # '..' onto the file path: traversing "through" a regular file is
        # not portable across operating systems.
        config_file = (Path(__file__).resolve().parents[2]
                       / 'data' / 'config.json')

    config_dict = open_config_file(config_file)
    # Keywords keep each value tied to the right field regardless of the
    # dataclass' field order.
    return Config_settings(
        port_address=config_dict['PORT_ADDRESS'],
        port_number=config_dict['PORT_NUMBER'],
        db_name=config_dict['DB_NAME'],
        replay_path=config_dict['REPLAY_PATH'],
    )

The following sample code shows the use of `load_configurations` to set up a test database.

In [None]:
# Read the project's settings and use them to reach the MongoDB client
# and the database named in the config file.
db_settings = load_configurations()
mongo_client = pymongo.MongoClient(host=db_settings.port_address,
                                   port=db_settings.port_number)
worcking_bd = mongo_client[db_settings.db_name]

In [None]:
# Each heading is followed by its value on the next line; the client is
# followed by an extra blank line.
print('config.json content:', db_settings, sep='\n')
print('Local Mongo DB Client', mongo_client, sep='\n', end='\n\n')
print('Database: ', worcking_bd, sep='\n')


config.json content:
Port Address:                                 localhost
Port Number:                                      27017
DB Name:                                   TEST_library
Replays file:          .\test_replays\TestProfilerBatch

Local Mongo DB Client
MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

Database: 
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'TEST_library')


#### 2 - Index replay batch data in the "replays" collection
Once the database is set, I store the main descriptive data of each replay into the `replays` collection of the database. This information is crucial to be able to iterate through the batch and also to have an index that indicates which players played which matches. It can also be used to review other information about the match; for instance, whether the match was a ranked match, or which race each player played.

To extract this data I use the `get_replay_info` function from the `summarise_rpl` module (see <<3 - Summarising Replays>>).

To collect this data I loop through the replays in a file inserting the return values of the functions from the ingest sub-package's different modules into collections within the database. 

> Note: in the example, I set up a new database called `sample_db` that I will use in this notebook for illustration purposes. 

In [None]:
sample_db = mongo_client['sample_db']
collection = sample_db['replays']

replay_batch = sc2reader.load_replays(db_settings.replay_path)

# Tallies of the replays inserted in this run and of those skipped because
# a matching document was already stored.
count_add = 0
count_existed = 0
for rpl in replay_batch:
    # `limit=1` stops counting at the first matching document.
    already_stored = collection.count_documents(
        {'replay_name': rpl.filename}, limit=1)
    if already_stored:
        count_existed += 1
        continue
    collection.insert_one(asdict(get_replay_info(rpl)))
    count_add += 1

print(f"{count_add} added to replays")
print(f"{count_existed} already existed in replays")

0 added to replays
153 already existed in replays


Once the loop finishes, the collection has been created in the database. The following snippet shows that this collection is now composed of several documents containing the indexing data for each replay. Additionally, I added a conditional statement within the loop to check if the collection already contains a replay. In that case, the loop skips the replay, so no repeated documents are inserted into the database, and counts it in the "already existed" total printed at the end.

In [None]:
# Report how many documents the `replays` collection holds.
n_documents = collection.estimated_document_count()
print(f'The collection now contains: {n_documents} documents')

The collection now contains: 153 documents


#### 3 - Building the `indicators` collection

Once a replay's descriptive data is stored in the `replays` collection, I also need to extract and store the performance indicators for each player in the match in the `indicators` collection.

In the following code, I illustrate how I can use all the functions defined in the `ingest` sub-package to build the indicators of two players in a sample replay. Each player's indicators are stored in a single flat dictionary, i.e. a dictionary that has no nested data structures as values.  

In [None]:
#exporti
# The following helper function helps eliminate multiple levels of 
# nesting in a dictionary
def flatten_indicators(nested_value: dict) -> dict[str, Any]:
    """Remove one level of nesting from a dictionary of indicators.

    The type of the first value decides how the whole mapping is treated:

    - dict values: each inner key is prefixed with its outer key, i.e.
      `{'a': {'x': 1}}` becomes `{'a_x': 1}`.
    - tuple values: the first two elements of each tuple are stored as
      strings under `first_<key>` and `second_<key>`.
    - anything else: the mapping is already flat and a shallow copy of it
      is returned.

    An empty mapping yields an empty dictionary.

    *Args*
        - nested_value: dict
            Mapping whose values are all dicts, all tuples, or plain values.

    *Returns*
        - dict[str, Any]
            A new dictionary with no nested dicts or tuples as values.
    """
    # Nothing to flatten; also avoids indexing into an empty value list.
    if not nested_value:
        return {}

    first_value = next(iter(nested_value.values()))

    if isinstance(first_value, dict):
        return {f'{outer_key}_{inner_key}': value
                for outer_key, inner in nested_value.items()
                for inner_key, value in inner.items()}

    if isinstance(first_value, tuple):
        # zip() stops at the shorter input, so only the first two elements
        # of each tuple are kept.
        positions = ('first', 'second')
        return {f'{position}_{key}': str(item)
                for key, items in nested_value.items()
                for position, item in zip(positions, items)}

    # Already flat: hand back a copy so callers always receive a dict.
    return dict(nested_value)


In [None]:
#
# First I locate the sample file.
# pathlib picks the right separator for the platform; a hard-coded backslash
# in the literal would only work on Windows and is not a valid string escape.
sample_replay = [sc2reader.load_replay(
    str(Path('test_replays') / 'Jagannatha LE.SC2Replay'))]

# I create two lists of all the functions from the ingest sub-package that I
# use to collect the player's performance indicators.
# I use two lists, because I need to make sure that all functions in a 
# list have the same caller structure.
simple_functions = [get_player_macro_econ_stats,
                    get_expan_times,
                    get_expan_counts,
                    calc_attack_ratio,
                    calc_ctrlg_ratio,
                    count_max_active_groups,
                    calc_get_ctrl_grp_ratio,
                    calc_select_ratio,
                    list_player_upgrades,
                    calc_spe_abil_ratios]

double_functions = [count_composition,
                    count_started]

# I loop through the replays in sample_replay, and through the players
# in each replay, storing the indicators of each player as a 
# dictionary in the indicators_list. 
indicators_list = []
for rpl in sample_replay:
    print(f'Processing: {rpl.filename}')
    for pid, player in rpl.player.items():
        print(f'Processing player: pid:{pid} {player}')
        
        # Declare the dict that will contain all of the player's performance
        # indicators
        rpl_indicators = {}
        
        # Run through all functions that need the caller arguments rpl
        # (for the replay being analysed) and pid (for the player id of
        # the player being parsed) and that return a flat dict. 
        for func in simple_functions:
            rpl_indicators.update(func(rpl, pid))

        # Run through all functions that need the caller arguments rpl
        # (for the replay being analysed), pid (for the player id of
        # the player being parsed) and a flag to focus on extracting data 
        # from the player's buildings or troops. Their output is flattened
        # before it is merged into the player's indicators.
        for func in double_functions:
            for flag in [True, False]:
                rpl_indicators.update(flatten_indicators(func(rpl, pid, flag)))

        # I run this last function apart, because I need to flatten the
        # output so that the resulting dictionary has no nested levels. 
        v = get_prefered_spec_abil(rpl, pid)
        rpl_indicators.update(flatten_indicators(v))
                
        indicators_list.append(rpl_indicators)


Processing: test_replays\Jagannatha LE.SC2Replay
Processing player: pid:1 Player 1 - HDEspino (Protoss)
Processing player: pid:2 Player 2 - MxChrisxM (Terran)


As a result of the code above, `indicators_list` now contains two dictionaries that store the indicators for each player. In the `inventory_replays` function, I store this data in the database's `indicators` collection instead of a list. In the previous code I used a list to illustrate how the process returns the following results:  

In [None]:
print('Number of players evaluated: ',len(indicators_list))

# Report the owner and the size of the first two sets of indicators.
for position, ordinal in enumerate(('First', 'Second')):
    player_indicators = indicators_list[position]
    print(f'{ordinal} set of indicators belongs to:',
          player_indicators['player_username'])
    print(f'{ordinal} set of indicators contains: ',
          len(player_indicators), 'indicators.')

Number of players evaluated:  2
First set of indicators belongs to: HDEspino
First set of indicators contains:  388 indicators.
Second set of indicators belongs to: MxChrisxM
Second set of indicators contains:  377 indicators.


> Tip: In some cases, each player's indicators lists would contain a different number of indicators. This difference exists because the list stores separate counts for the players' units and buildings according to their race during the match. Remember that each one of the game races has a different number of available units. 

## Exportable functions 
This section defines `inventory_replays`. This function reads the directory of `.SC2Replay` files specified in the `config.json` file, extracts the data from these files, and stores it in the `replays` and `indicators` collections of the project's database (also defined in the `config.json` file) following the logic explained above. 

Internally, the function uses three helper functions:

- `set_up_db`: connects to the MongoDB client and loads the working database.
- `verify_replays_path`: makes sure that the path passed by the user is valid.
- `build_indicators`: runs through the indicator extraction loop and stores the results in the `indicators` collection.

Of these, I export the `set_up_db`, given that it can be useful in other modules (see for example <<10 - Player Profiler>>)

In [None]:
#export
def set_up_db() -> pymongo.database.Database:
    """Return the database named in the project's config.json file.

    The configuration is loaded with `load_configurations`, a MongoDB client
    is created for the configured address and port, and the configured
    database is looked up on that client.

    *Returns*
        - pymongo.database.Database
            Python object that allows the user to interact with the
            database specified in the project's config file. 
    """
    settings = load_configurations()
    client = pymongo.MongoClient(settings.port_address,
                                 settings.port_number)
    return client[settings.db_name]

The following is a sample run of the `set_up_db` function to illustrate how it can be used to connect to a mongo database.

In [None]:
# Connect using the project's configuration and show the type of the
# object that set_up_db returns.
worcking_bd = set_up_db()
print(type(worcking_bd))

<class 'pymongo.database.Database'>


In [None]:
#exporti
# Helper function that verifies the path where the replays should
# be located according to the config file.
def verify_replays_path(rpl_path: Any) -> Path:
    """Return `rpl_path` as a `Path` after checking that it is usable.

    *Args*
        - rpl_path: str | Path
            Location where the replays are expected to be.

    *Returns*
        - Path
            The same location as a `Path` object.

    *Errors*
        - TypeError
            If `rpl_path` is neither a `str` nor a `Path`.
        - ValueError
            If the location does not exist.
    """
    if isinstance(rpl_path, str):
        path = Path(rpl_path)
    elif isinstance(rpl_path, Path):
        path = rpl_path
    else:
        # Report the type of the argument actually received.
        string = 'replay_path must be of type str or Path, not '
        raise TypeError(string + str(type(rpl_path)))

    if not path.exists():
        raise ValueError(f'{path} is not valid location')

    return path

In [None]:
#exporti
# Helper function that extracts the indicators for each match's players and
# stores them in the indicators collection.
def build_indicators(rpl: sc2reader.resources.Replay,
                    working_db: pymongo.database.Database) -> None:
    """Extract every player's indicators from a replay and store them.

    For each player id in `rpl.player`, the results of all the indicator
    functions are merged into a single flat dictionary, which is inserted
    as one document into the `indicators` collection of `working_db`.
    """
    # Extractors called as func(rpl, pid); their results are merged as-is.
    per_player_extractors = (get_player_macro_econ_stats,
                             get_expan_times,
                             get_expan_counts,
                             calc_attack_ratio,
                             calc_ctrlg_ratio,
                             count_max_active_groups,
                             calc_get_ctrl_grp_ratio,
                             calc_select_ratio,
                             list_player_upgrades,
                             calc_spe_abil_ratios,
                             calc_apms)

    # Extractors called as func(rpl, pid, flag), once per flag value; their
    # results are flattened before being merged.
    flagged_extractors = (count_composition,
                          count_started)

    indi_collect = working_db['indicators']

    for pid in rpl.player.keys():
        collected = {}

        for extractor in per_player_extractors:
            collected.update(extractor(rpl, pid))

        collected.update(flatten_indicators(get_prefered_spec_abil(rpl, pid)))

        for extractor in flagged_extractors:
            for flag in (True, False):
                collected.update(
                    flatten_indicators(extractor(rpl, pid, flag)))

        # Convert numpy 64-bit scalars to plain Python floats before the
        # document is inserted; every other value is stored unchanged.
        document = {}
        for key, value in collected.items():
            if isinstance(value, (np.int64, np.float64)):
                document[key] = float(value)
            else:
                document[key] = value

        indi_collect.insert_one(document)

In [None]:
#export
def inventory_replays() -> None:
    """Build the `replays` and `indicators` collections of the database
    specified in the config.json file.

    The replays are read from the replay path stored in the project's
    configuration and processed as follows:

    - replays whose type is not "1v1" are ignored;
    - replays for which the `replays` collection already has a document
      with the same `replay_name` are skipped;
    - every other replay gets its summary stored in `replays` and its
      players' indicators stored in `indicators`.

    Collections:
    - `replays`
        Stores the metadata of the replays, can be used for indexing and
        for finding the other replays.
    - `indicators`
        Stores the indicators for each performance of every player.

    A summary of the processed, loaded, ignored and previously stored
    files is printed at the end.

    *Args:*
        - None

    *Return:*
        - None

    """
    project_config = load_configurations()
    working_db = set_up_db()
    rpls_collect = working_db['replays']
    path = verify_replays_path(project_config.replay_path)

    replays = sc2reader.load_replays(str(path))

    tally = {'processed': 0, 'loaded': 0, 'ignored': 0, 'previous': 0}

    print(f'Inventorying replays at: {path} in database {working_db.name}')

    for rpl in replays:
        tally['processed'] += 1

        # Only one-versus-one matches are inventoried.
        if rpl.type != "1v1":
            tally['ignored'] += 1
            continue

        # `limit=1` stops counting at the first matching document.
        already_stored = rpls_collect.count_documents(
            {'replay_name': rpl.filename}, limit=1)
        if already_stored:
            tally['previous'] += 1
            continue

        rpls_collect.insert_one(asdict(get_replay_info(rpl)))
        build_indicators(rpl, working_db)
        tally['loaded'] += 1

    print('Load complete.')
    print(f"{tally['processed']} files processed")
    print(f"{tally['loaded']} files loaded")
    print(f"{tally['ignored']} files ignored")
    print(f"{tally['previous']} files alredy existed")
        

After running the function once, I get the following sample results:

In [None]:
# 
# Drop the test database first so the run below starts from an empty state.
# NOTE(review): the database name is hard-coded; it matches the "DB Name"
# shown in the config.json output earlier in this notebook. Confirm it still
# matches the config before relying on the counts printed below.
mongo_client.drop_database('TEST_library')
inventory_replays()

Inventorying replays at: test_replays\TestProfilerBatch in database TEST_library
Load complete.
153 files processed
149 files loaded
4 files ignored
0 files alredy existed


In [None]:
# Names of the collections now present in the working database.
collections = list(worcking_bd.list_collection_names())
collections

['replays', 'indicators']

In [None]:
# Show how many documents each collection holds.
for col in collections:
    n_records = worcking_bd[col].estimated_document_count()
    print(f'{col} has {n_records} records.')

replays has 149 records.
indicators has 298 records.


In [None]:
#hide
# Run nbdev's notebook-to-script export; the output below lists the
# notebooks that were converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_Comp_model.ipynb.
Converted 01_01_ingest_and_clustering.ipynb.
Converted 01_summarise_rpl.ipynb.
Converted 02_handle_tracker_events.ipynb.
Converted 03_macro_econ_parser.ipynb.
Converted 04_build_parser.ipynb.
Converted 05_handle_command_events.ipynb.
Converted 06_selection_parser.ipynb.
Converted 07_ingest.ipynb.
Converted 08_profiler.ipynb.
Converted index.ipynb.
