In [1]:
# Locate the top-level directory of the enclosing Git repository
import subprocess
from pathlib import Path

git_toplevel_cmd = ["git", "rev-parse", "--show-toplevel"]
# check_output raises CalledProcessError if git exits with a non-zero status
repo_root = Path(subprocess.check_output(git_toplevel_cmd, text=True).strip())

### Load CommonGen

In [2]:
import os

import pandas as pd

# Folder (relative to the repository root) that holds the source parquet files
INPUT_PATH = repo_root / "offline_datasets/original/CommonNet/"

# Read the CommonGen split, report its row count and preview the first rows
common_gen_file = os.path.join(INPUT_PATH, "commonGen.parquet")
common_gen_df = pd.read_parquet(common_gen_file)
print(f"Loaded dataset has {len(common_gen_df):,} elements")
common_gen_df.head(2)

Loaded dataset has 221,565 elements


Unnamed: 0,concepts,target,dataset
0,[skier],Two skiers traveling down a snowy mountain on ...,CommonGen
1,[skier],A skier is skiing down a mountain.,CommonGen


### Load WordNet

In [3]:
# Read the WordNet split from the same input folder, report its row count
# and preview the first rows
wordnet_file = os.path.join(INPUT_PATH, "wordNet.parquet")
wordNet_df = pd.read_parquet(wordnet_file)
print(f"Loaded dataset has {len(wordNet_df):,} elements")
wordNet_df.head(2)

Loaded dataset has 53,264 elements


Unnamed: 0,target,concepts,dataset
0,crowd or draw together,[huddle],WordNet
1,a cord that is drawn through eyelets or around...,[lace],WordNet


### Join datasets

In [4]:
# Stack both sources into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source keeps its own labels (both start at 0), so the combined frame would
# carry duplicate index labels, which makes label-based selection ambiguous
# and breaks operations that need to realign on the index.
full_df = pd.concat([common_gen_df, wordNet_df], ignore_index=True)
print(f'Full dataset has {full_df.shape[0]:,} rows')
full_df.head(2)

Full dataset has 274,829 rows


Unnamed: 0,concepts,target,dataset
0,[skier],Two skiers traveling down a snowy mountain on ...,CommonGen
1,[skier],A skier is skiing down a mountain.,CommonGen


### Shuffle before exporting

In [5]:
# Fixed seed so the shuffle is reproducible from one run to the next
SEED = 29

# Sampling the whole frame (frac=1) without replacement is a random permutation
shuffled_df = full_df.sample(random_state=SEED, frac=1)
shuffled_df.head(2)

Unnamed: 0,concepts,target,dataset
5097,"[meadow, herd]",a herd of sheep on meadow,CommonGen
148806,"[close, parent]",parents and pupils are warned the school will ...,CommonGen


### Out of curiosity: get some stats on the number of concepts 

In [6]:
# Per-row number of concepts (length of each entry in the `concepts` column)
n_concepts = shuffled_df["concepts"].apply(len)
n_concepts.describe()

count    274829.000000
mean          2.037434
std           2.209400
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max         946.000000
Name: concepts, dtype: float64

### Filtering the dataset
Remove rows with more than 5 concepts, as the network is not going to learn from those examples.


In [7]:
# Rows with more concepts than this are dropped
MAX_CONCEPTS = 5

# Build the mask directly from shuffled_df instead of reading the notebook-level
# `n_concepts` variable: that name is rebound further down to the counts of the
# filtered frame, so re-running this cell afterwards would apply a mask that
# belongs to a different frame.
keep_mask = shuffled_df.concepts.apply(len) <= MAX_CONCEPTS
filtered_df = shuffled_df[keep_mask].copy()
print(f'Dataset filtered to {filtered_df.shape[0]:,} rows (it had {shuffled_df.shape[0]:,})')
filtered_df.head(2)

Dataset filtered to 273,916 rows (it had 274,829)


Unnamed: 0,concepts,target,dataset
5097,"[meadow, herd]",a herd of sheep on meadow,CommonGen
148806,"[close, parent]",parents and pupils are warned the school will ...,CommonGen


### Check the stats again

In [8]:
# Recompute the per-row concept counts, this time on the filtered frame
n_concepts = filtered_df["concepts"].apply(len)
n_concepts.describe()

count    273916.000000
mean          2.003669
std           0.998616
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max           5.000000
Name: concepts, dtype: float64

### Export the filtered dataset to a parquet file

In [9]:
import os

# Destination folder; created together with any missing parents
OUTPUT_PATH = repo_root / "offline_datasets/original/CommonNet/"
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)

# index=False: the DataFrame index is not written to the parquet file
output_file = os.path.join(OUTPUT_PATH, "commonNet.parquet")
filtered_df.to_parquet(output_file, index=False)