In [11]:
import pandas as pd
import os

In [12]:
def get_unique_question_ids(dataframe):
    """Return the distinct values found in the ``original_question_id`` column.

    Args:
        dataframe: DataFrame exposing an ``original_question_id`` column.

    Returns:
        The result of ``Series.unique()`` on that column (each id once).
    """
    question_id_column = dataframe.original_question_id
    return question_id_column.unique()

In [13]:
def group_by_id(dataframe, orgq_ids):
    """Map each requested question id to the rows of ``dataframe`` carrying it.

    Args:
        dataframe: DataFrame with an ``original_question_id`` column.
        orgq_ids: Iterable of question ids to look up.

    Returns:
        dict keyed by the ids of ``orgq_ids`` in iteration order. Each value is
        the sub-DataFrame whose ``original_question_id`` equals the key, with
        the original index and row order kept. An id that matches no row maps
        to an empty DataFrame with the same columns.
    """
    # A single groupby pass replaces one full boolean-mask scan per id.
    rows_by_id = {
        qid: rows
        for qid, rows in dataframe.groupby('original_question_id', sort=False)
    }

    grouped_dict = dict()
    for qid in orgq_ids:
        if qid in rows_by_id:
            grouped_dict[qid] = rows_by_id[qid]
        else:
            # Fresh empty slice per miss so results never alias each other.
            grouped_dict[qid] = dataframe.iloc[0:0]

    return grouped_dict

In [14]:
def split_grouped(group_dict, group_size=50):
    """Partition per-question DataFrames into irrelevant-only and mixed lists.

    A group is treated as irrelevant-only when exactly ``group_size`` of its
    rows have ``relevance == 'Irrelevant'``; every other group is mixed.

    Args:
        group_dict: Mapping of question id to a DataFrame with a ``relevance``
            column.
        group_size: Number of ``'Irrelevant'`` rows a group must contain to be
            classified as irrelevant-only. Defaults to 50, the value that was
            previously hard-coded.

    Returns:
        Tuple ``(irrelevant, mixed)`` of lists of DataFrames, each in the
        iteration order of ``group_dict``.
    """
    irrelevant = []
    mixed = []

    for qdf in group_dict.values():
        # Sum the boolean mask instead of materialising a filtered frame.
        irrelevant_rows = int((qdf.relevance == 'Irrelevant').sum())
        if irrelevant_rows == group_size:
            irrelevant.append(qdf)
        else:
            mixed.append(qdf)

    return irrelevant, mixed

In [15]:
def shrink_df(irrelevant_only, mixed, irrelevant_size=200, mixed_size=100, csv_path=None, group_size=50):
    """Build a smaller dataset from the first few groups of each kind.

    Args:
        irrelevant_only: List of per-question DataFrames classified as
            irrelevant-only.
        mixed: List of per-question DataFrames classified as mixed.
        irrelevant_size: Target row budget for irrelevant-only groups;
            ``irrelevant_size // group_size`` groups are taken.
        mixed_size: Target row budget for mixed groups;
            ``mixed_size // group_size`` groups are taken.
        csv_path: When not None, the result is written there as CSV without
            the index.
        group_size: Nominal number of rows per group used to turn the row
            budgets into group counts. Defaults to 50, the value that was
            previously hard-coded.

    Returns:
        DataFrame with the selected mixed groups followed by the selected
        irrelevant-only groups, original indices kept. An empty DataFrame is
        returned when no group is selected.

    Raises:
        ValueError: If ``group_size`` is not positive.
    """
    if group_size <= 0:
        raise ValueError('group_size must be a positive integer.')

    shrinked_irrelevant_only = irrelevant_only[:irrelevant_size // group_size]
    shrinked_mixed = mixed[:mixed_size // group_size]
    merged = shrinked_mixed + shrinked_irrelevant_only

    def _row_count(frames):
        # Report real row counts rather than assuming a fixed rows-per-group.
        return sum(len(frame) for frame in frames)

    print("Small dataset total size = {} rows".format(_row_count(merged)))
    print("Small mixed size = {} rows".format(_row_count(shrinked_mixed)))
    print("Small irrelevant only size = {} rows".format(_row_count(shrinked_irrelevant_only)))

    if merged:
        # DataFrame.append was removed in pandas 2.0; a single concat is the
        # supported equivalent and avoids repeated copying.
        shrinked_df = pd.concat(merged)
    else:
        # Nothing selected (budgets below group_size or empty inputs).
        shrinked_df = pd.DataFrame()

    if csv_path is not None:
        shrinked_df.to_csv(csv_path, index=False)

    return shrinked_df

In [16]:
def generate_smaller_datasets(filepath, name=None, small_sizes=(200, 100), xsmall_sizes=(50, 50), base_dir='/Volumes/DataDrive/stripped'):
    """Read a CSV dataset and write "small" and "xsmall" subsets of it.

    Args:
        filepath: Path of the source CSV file.
        name: Stem used for the output file names. When None, the stem of
            ``filepath`` (basename without extension) is used.
        small_sizes: ``(irrelevant_size, mixed_size)`` passed to ``shrink_df``
            for the ``<name>-small.csv`` output.
        xsmall_sizes: ``(irrelevant_size, mixed_size)`` passed to ``shrink_df``
            for the ``<name>-xsmall.csv`` output.
        base_dir: Directory in which both output files are written.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError('{} has not been found.'.format(filepath))

    # Fall back to the input file's stem for naming the outputs.
    if name is None:
        name, _extension = os.path.splitext(os.path.basename(filepath))

    print("Reading {}...".format(filepath))
    full_df = pd.read_csv(filepath)
    print("Done.")

    groups_by_qid = group_by_id(full_df, get_unique_question_ids(full_df))
    irrelevant, mixed = split_grouped(groups_by_qid)

    # Small file first, then X-Small file; both drawn from the same groups.
    variants = (
        ('{}/{}-small.csv', small_sizes),
        ('{}/{}-xsmall.csv', xsmall_sizes),
    )
    for path_template, sizes in variants:
        output_path = path_template.format(base_dir, name)
        irrelevant_size, mixed_size = sizes
        shrink_df(irrelevant, mixed, irrelevant_size=irrelevant_size, mixed_size=mixed_size, csv_path=output_path)

In [17]:
TRAIN_DATASET = '/Volumes/DataDrive/stripped/english-train.csv'
TEST_DATASET = '/Volumes/DataDrive/stripped/english-devel.csv'

# Subsets of the "train" dataset; each sizes tuple is (irrelevant_size, mixed_size).
generate_smaller_datasets(
    filepath=TRAIN_DATASET,
    name='english-train',
    small_sizes=(2000, 3000),
    xsmall_sizes=(100, 500),
)

# Subsets of the "test" (devel) dataset, with the same tuple layout.
generate_smaller_datasets(
    filepath=TEST_DATASET,
    name='english-devel',
    small_sizes=(500, 1000),
    xsmall_sizes=(50, 200),
)

Reading /Volumes/DataDrive/stripped/english-train.csv...
Done.
Small dataset total size = 5000 rows
Small mixed size = 3000 rows
Small irrelevant only size = 2000 rows
Small dataset total size = 600 rows
Small mixed size = 500 rows
Small irrelevant only size = 100 rows
Reading /Volumes/DataDrive/stripped/english-devel.csv...
Done.
Small dataset total size = 1500 rows
Small mixed size = 1000 rows
Small irrelevant only size = 500 rows
Small dataset total size = 250 rows
Small mixed size = 200 rows
Small irrelevant only size = 50 rows
