In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [2]:
# Read the training and testing splits from their JSON files in one go.
df_train, df_test = (
    pd.read_json('datas/training_set.json'),
    pd.read_json('datas/testing_set.json'),
)

# Report (rows, columns) for each split, one per line.
print(f"Train shape : {df_train.shape}", f"Test shape : {df_test.shape}", sep="\n")

Train shape : (6035, 2)
Test shape : (1065, 2)


In [3]:
# Separate the text column ('sentence') from the target column ('intent')
# for each split.
df_x_train, df_y_train = df_train['sentence'], df_train['intent']
df_x_test, df_y_test = df_test['sentence'], df_test['intent']

# Sanity check: features and labels of a split should have the same length.
print(
    f"Train data shape : {df_x_train.shape}",
    f"Train labels shape : {df_y_train.shape}",
    f"Test data shape : {df_x_test.shape}",
    f"Test labels shape : {df_y_test.shape}",
    sep="\n",
)

Train data shape : (6035,)
Train labels shape : (6035,)
Test data shape : (1065,)
Test labels shape : (1065,)


In [4]:
# Number of training rows per intent, most frequent first, to expose the
# class imbalance.
df_y_train.value_counts(ascending=False)

irrelevant           3852
purchase              613
find-restaurant       469
find-around-me        383
find-hotel            316
find-train            143
find-flight           142
provide-showtimes     117
Name: intent, dtype: int64

In [5]:
# Turn the 1-D column of sentences into an (n_samples, 1) array, the 2-D
# feature layout passed to the samplers' fit_resample below.
# np.asarray accepts both the original Series and an already-reshaped ndarray,
# so re-running this cell no longer raises AttributeError on `.to_numpy()`.
df_x_train = np.asarray(df_x_train).reshape(-1, 1)

In [6]:
# Distinct intent names present in the training labels.
labels = np.unique(df_y_train)

# Target row count for every intent except 'irrelevant', which is left out
# of the oversampling targets.
oversampling_dict = {
    intent: 2000
    for intent in labels
    if intent != 'irrelevant'
}
print(oversampling_dict)

{'find-around-me': 2000, 'find-flight': 2000, 'find-hotel': 2000, 'find-restaurant': 2000, 'find-train': 2000, 'provide-showtimes': 2000, 'purchase': 2000}


In [7]:
# Fixed seed so both resamplers draw the same rows on every
# Restart Kernel -> Run All (they are random and were previously unseeded).
RANDOM_STATE = 42

# Oversample each intent listed in oversampling_dict up to its target count.
oversampler = RandomOverSampler(
    sampling_strategy=oversampling_dict,
    random_state=RANDOM_STATE,
)
# Undersample only the majority class ('majority' strategy).
undersampler = RandomUnderSampler(
    sampling_strategy='majority',
    random_state=RANDOM_STATE,
)

In [8]:
# Run the oversampler on the training features and labels.
X_over, Y_over = oversampler.fit_resample(df_x_train, df_y_train)

# Shapes after oversampling, then the resulting per-intent row counts.
print(X_over.shape, Y_over.shape, sep="\n")
Y_over.value_counts()

(17852, 1)
(17852,)


irrelevant           3852
provide-showtimes    2000
find-flight          2000
find-restaurant      2000
find-hotel           2000
find-train           2000
find-around-me       2000
purchase             2000
Name: intent, dtype: int64

In [9]:
# Run the undersampler (configured with sampling_strategy='majority') on the
# oversampled data; X and Y hold the final resampled features and labels.
X, Y = undersampler.fit_resample(X_over, Y_over)

In [10]:
# Final shapes of the resampled features and labels, followed by the
# per-intent row counts to confirm the classes are balanced.
print(X.shape, Y.shape, sep="\n")
Y.value_counts()

(16000, 1)
(16000,)


provide-showtimes    2000
find-flight          2000
find-restaurant      2000
find-hotel           2000
irrelevant           2000
find-train           2000
find-around-me       2000
purchase             2000
Name: intent, dtype: int64