In [None]:
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE.txt file in the root directory of this source tree.

import argparse
import random
from itertools import chain
from pathlib import Path

import attr
import pkg_resources

from torchbiggraph.config import add_to_sys_path, ConfigFileLoader
from torchbiggraph.converters.import_from_tsv import convert_input_data
from torchbiggraph.converters.utils import download_url, extract_gzip
from torchbiggraph.eval import do_eval
from torchbiggraph.train import train
from torchbiggraph.util import (
    set_logging_verbosity,
    setup_logging,
    SubprocessInitializer,
)


# Names of the two split files that random_split_file() writes next to the
# input file.
TRAIN_FILENAME = "train.txt"
TEST_FILENAME = "test.txt"
# Both split files, train first then test. main() passes the resulting paths
# to convert_input_data() alongside config.edge_paths, which it also unpacks as
# (train, test) -- presumably the two lists are matched by position; verify
# against the converter.
FILENAMES = [
    TRAIN_FILENAME,
    TEST_FILENAME,
]
# Fraction of the shuffled lines written to the train file; the remainder goes
# to the test file.
TRAIN_FRACTION = 0.75

# Config file loaded by main(). This is a path relative to the current working
# directory; nothing in this file looks up an installed copy or offers a
# command-line override, so edit this constant to use a different config.
DEFAULT_CONFIG = './config/config_50_epochs_logistic.py'


def random_split_file(fpath: Path, num_header_lines: int = 4) -> None:
    """Shuffle the lines of ``fpath`` and split them into train/test files.

    The output files are created in the same directory as ``fpath`` and are
    named ``TRAIN_FILENAME`` and ``TEST_FILENAME``. The first
    ``int(len(lines) * TRAIN_FRACTION)`` shuffled lines go to the train file
    and the rest to the test file. If both output files already exist, the
    function returns without reading the input or rewriting anything.

    The whole input file is read into memory before shuffling.

    Args:
        fpath: Path of the input text file, one record per line.
        num_header_lines: Number of leading lines to discard before
            shuffling. Defaults to 4, the value that used to be hard-coded
            (the input is assumed to start with that many comment lines).
            Pass 0 for a file that has no header.

    Raises:
        ValueError: If ``num_header_lines`` is negative.
    """
    # A negative count would silently slice from the end of the file instead
    # of skipping a header, so reject it up front.
    if num_header_lines < 0:
        raise ValueError(
            f"num_header_lines must be non-negative, got {num_header_lines}"
        )

    train_file = fpath.parent / TRAIN_FILENAME
    test_file = fpath.parent / TEST_FILENAME

    if train_file.exists() and test_file.exists():
        print("Found some files that indicate that the input data "
              "has already been shuffled and split, not doing it again.")
        print(f"These files are: {train_file} and {test_file}")
        return

    print('Shuffling and splitting train/test file. This may take a while.')

    print(f"Reading data from file: {fpath}")
    with fpath.open("rt") as in_tf:
        lines = in_tf.readlines()

    # Drop the header lines so they do not end up among the records.
    lines = lines[num_header_lines:]
    print('Shuffling data')
    random.shuffle(lines)
    split_len = int(len(lines) * TRAIN_FRACTION)

    print('Splitting to train and test files')
    with train_file.open("wt") as out_tf_train:
        out_tf_train.writelines(lines[:split_len])

    with test_file.open("wt") as out_tf_test:
        out_tf_test.writelines(lines[split_len:])


def main():
    """Run the whole example: split, convert, train, then evaluate.

    Steps, in order:
      1. Shuffle and split ``./data/statml/individual_links.txt`` into the
         train/test files (skipped by ``random_split_file`` if they exist).
      2. Load ``DEFAULT_CONFIG`` and set up logging for this process and for
         subprocesses.
      3. Run ``convert_input_data`` on the two split files, using column 0 as
         the left-hand side and column 1 as the right-hand side of each edge.
      4. Train on the first path in ``config.edge_paths`` and evaluate on the
         second.

    The input data must already be on disk; nothing is downloaded here.
    """
    setup_logging()

    # Locations of the raw input on disk.
    data_dir = Path('./data/statml/')
    raw_edges_file = Path('./data/statml/individual_links.txt')

    # Produce the train/test split files next to the raw input.
    random_split_file(raw_edges_file)

    config_loader = ConfigFileLoader()
    config = config_loader.load_config(DEFAULT_CONFIG, overrides=None)
    set_logging_verbosity(config.verbose)

    # Initialisation steps registered for the subprocesses used by
    # train() and do_eval().
    subprocess_init = SubprocessInitializer()
    subprocess_init.register(setup_logging, config.verbose)
    subprocess_init.register(add_to_sys_path, config_loader.config_dir.name)

    split_file_paths = [data_dir / filename for filename in FILENAMES]
    # The config must list exactly two edge paths: train first, then test.
    train_edges_path, test_edges_path = config.edge_paths

    convert_input_data(
        config.entities,
        config.relations,
        config.entity_path,
        config.edge_paths,
        split_file_paths,
        lhs_col=0,
        rhs_col=1,
        rel_col=None,
        dynamic_relations=config.dynamic_relations,
    )

    # Train and evaluate on separate copies of the config, each restricted
    # to its own edge path.
    train(
        attr.evolve(config, edge_paths=[train_edges_path]),
        subprocess_init=subprocess_init,
    )
    do_eval(
        attr.evolve(config, edge_paths=[test_edges_path]),
        subprocess_init=subprocess_init,
    )


# Run the pipeline only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Found some files that indicate that the input data has already been shuffled and split, not doing it again.
These files are: data/statml/train.txt and data/statml/test.txt
Using the 1 relation types given in the config
Searching for the entities in the edge files...
Entity type user_id:
- Found 4867134 entities
- Removing the ones with fewer than 1 occurrences...
- Left with 4867134 entities
- Shuffling them...
Preparing counts and dictionaries for entities and relation types:
- Writing count of entity type user_id and partition 0
Preparing edge path data/train_partitioned, out of the edges found in data/statml/train.txt
- Edges will be partitioned in 1 x 1 buckets.
- Processed 100000 edges so far...
- Processed 200000 edges so far...
- Processed 300000 edges so far...
- Processed 400000 edges so far...
- Processed 500000 edges so far...
- Processed 600000 edges so far...
- Processed 700000 edges so far...
- Processed 800000 edges so far...
- Processed 900000 edges so far...
- Processe