In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json

In [4]:
def load_data(path):
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

In [21]:
ATIS_test_raw = load_data('/content/drive/MyDrive/NLU/IntentSlotDatasets/ATIS_old/test.json')
ATIS_tmp_train_raw = load_data('/content/drive/MyDrive/NLU/IntentSlotDatasets/ATIS_old/train.json')

print("Dimensions:")
print(f"ATIS_test: {len(ATIS_test_raw)}")
print(f"ATIS_train: {len(ATIS_tmp_train_raw)}")

Dimensions:
ATIS_test: 893
ATIS_train: 4978


In [24]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from pprint import pprint

# First we get the 10% of dataset, then we compute the percentage of these examples
# on the training set which is around 11%
portion = round(((len(ATIS_tmp_train_raw) + len(ATIS_test_raw)) * 0.10)/(len(ATIS_tmp_train_raw)),2)

intents = [x['intent'] for x in ATIS_tmp_train_raw] # We stratify on intents
count_y = Counter(intents)

Y = []
X = []
mini_Train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1: # Some intents have only one instance, we put them in training
        X.append(ATIS_tmp_train_raw[id_y])
        Y.append(y)
    else:
        mini_Train.append(ATIS_tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(X, Y, test_size=portion,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=Y)
X_train.extend(mini_Train)
ATIS_train_raw = X_train
ATIS_dev_raw = X_dev

y_test = [x['intent'] for x in ATIS_test_raw]

# Intent distribution
print('Train:')
pprint({k:round(v/len(y_train),3)*100 for k, v in sorted(Counter(y_train).items())})
print('Dev:'),
pprint({k:round(v/len(y_dev),3)*100 for k, v in sorted(Counter(y_dev).items())})
print('Test:')
pprint({k:round(v/len(y_test),3)*100 for k, v in sorted(Counter(y_test).items())})
print('='*89)
# Dataset size
print('TRAIN size:', len(ATIS_train_raw))
print('DEV size:', len(ATIS_dev_raw))
print('TEST size:', len(ATIS_test_raw))

Train:
{'abbreviation': 2.9000000000000004,
 'aircraft': 1.6,
 'airfare': 8.5,
 'airline': 3.2,
 'airline+flight_no': 0.0,
 'airport': 0.4,
 'capacity': 0.3,
 'city': 0.4,
 'distance': 0.4,
 'flight': 73.7,
 'flight+airfare': 0.4,
 'flight_no': 0.3,
 'flight_time': 1.0999999999999999,
 'ground_fare': 0.4,
 'ground_service': 5.1,
 'meal': 0.1,
 'quantity': 1.0,
 'restriction': 0.1}
Dev:
{'abbreviation': 3.0,
 'aircraft': 1.7000000000000002,
 'airfare': 8.5,
 'airline': 3.2,
 'airport': 0.3,
 'capacity': 0.3,
 'city': 0.3,
 'distance': 0.3,
 'flight': 73.7,
 'flight+airfare': 0.5,
 'flight_no': 0.2,
 'flight_time': 1.0,
 'ground_fare': 0.3,
 'ground_service': 5.2,
 'meal': 0.2,
 'quantity': 1.0,
 'restriction': 0.2}
Test:
{'abbreviation': 3.6999999999999997,
 'aircraft': 1.0,
 'airfare': 5.4,
 'airfare+flight': 0.1,
 'airline': 4.3,
 'airport': 2.0,
 'capacity': 2.4,
 'city': 0.7000000000000001,
 'day_name': 0.2,
 'distance': 1.0999999999999999,
 'flight': 70.8,
 'flight+airfare': 1.3,
 

In [26]:
# Save ATIS
with open('/content/drive/MyDrive/NLU/IntentSlotDatasets/ATIS/new_ATIS_train.json', 'w') as f:
    json.dump(ATIS_train_raw, f)

# Save dev_raw
with open('/content/drive/MyDrive/NLU/IntentSlotDatasets/ATIS/new_ATIS_dev.json', 'w') as f:
    json.dump(ATIS_dev_raw, f)

# Save test_raw
with open('/content/drive/MyDrive/NLU/IntentSlotDatasets/ATIS/new_ATIS_test_raw.json', 'w') as f:
    json.dump(ATIS_test_raw, f)


#SNIPS

In [5]:
# Load the datasets
SNIPS_test = load_data('/content/drive/MyDrive/NLU/IntentSlotDatasets/SNIPS/test.json')
SNIPS_train = load_data('/content/drive/MyDrive/NLU/IntentSlotDatasets/SNIPS/train.json')
SNIPS_valid = load_data('/content/drive/MyDrive/NLU/IntentSlotDatasets/SNIPS/valid.json')

print("Dimensions:")
print(f"SNIPS_test: {len(SNIPS_test)}")
print(f"SNIPS_train: {len(SNIPS_train)}")
print(f"SNIPS_train: {len(SNIPS_valid)}")

Dimensions:
SNIPS_test: 700
SNIPS_train: 13084
SNIPS_train: 700
