In [31]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from IPython.display import display

In [32]:
# --- Configuration ----------------------------------------------------------
test_portion = 0.20   # fraction of rows reserved for the test split
random_seed = 0       # seed handed to `random` and to every shuffle below

random.seed(random_seed)

# --- Load the existing train / test files -----------------------------------
old_train_df = pd.read_csv('mari_train.csv')
old_test_df = pd.read_csv('mari_test.csv')

# Pool both files into one frame. Row labels are carried over unchanged
# (no ignore_index), so index labels may repeat across the two sources.
merged_df = pd.concat([old_train_df, old_test_df])

# A row is dropped as a duplicate only when all five of these columns match
# an earlier row.
undup_df = merged_df.drop_duplicates(
    subset = ['texts', 'texts_deepcut', 'action', 'object', 'destination']
)

print('Got rid of %d duplicated rows' % (old_train_df.shape[0] + old_test_df.shape[0] - undup_df.shape[0]))
# Interpolate with `%`: handing the count to print() as a second argument
# would emit the literal "%d" placeholder followed by the number.
print('%d rows remaining' % undup_df.shape[0])

# Reproducible row shuffle of the de-duplicated data.
undup_df = shuffle(undup_df, random_state = random_seed)

Got rid of 2746 duplicated rows
%d rows remaining 13429


In [33]:
def count_multi_intent_rows(df):
    """Return how many rows of ``df`` repeat a text already seen in ``df``.

    The result is the total row count minus the number of distinct values in
    the ``texts`` column, so 0 means every text occurs exactly once. A missing
    value counts as one distinct value.
    """
    total_rows = len(df)
    distinct_texts = df['texts'].nunique(dropna=False)
    return total_rows - distinct_texts

# Split each destination label on its own so that train and test end up with
# a similar label distribution.
train_ls, test_ls = [], []
for label in undup_df['destination'].unique():
    # Select this label's rows once; the subset feeds both the report and the split.
    label_df = undup_df[undup_df.destination == label]
    print('Label: %s' % (label))
    print('Number of Sample: %d' % label_df.shape[0])
    print('----')
    label_train_df, label_test_df = train_test_split(
        label_df, test_size=test_portion, random_state=random_seed
    )
    train_ls.append(label_train_df)
    test_ls.append(label_test_df)

# Stitch the per-label pieces back into one frame per split.
train_df = pd.concat(train_ls)
test_df = pd.concat(test_ls)

# Shuffle after concatenation so rows are no longer grouped by label.
train_df = shuffle(train_df, random_state = random_seed)
test_df = shuffle(test_df, random_state = random_seed)

print('Train Set Size', len(train_df))
print('Test Set Size',len(test_df))

# Rows per split whose text repeats within that split, measured before the move below.
# NOTE(review): neither value is read again in the visible cells — confirm
# whether they were meant to be reported.
train_multi = count_multi_intent_rows(train_df)
test_multi = count_multi_intent_rows(test_df)

# Test rows whose text also occurs in train are moved to train, so that no
# text is shared between the two splits afterwards.
movels = set(train_df['texts'].values).intersection(set(test_df['texts'].values))
in_both_splits = test_df['texts'].isin(movels)   # evaluated once, used for both sides
train_df = pd.concat([train_df, test_df[in_both_splits]])
test_df = test_df[~in_both_splits]

print('post-moving:')
print('Train Set Size', len(train_df))
print('Test Set Size',len(test_df))

# Preview the first rows of the final training split.
display(train_df.head(3))

Label: billing and payment
Number of Sample: 5069
----
Label: other queries
Number of Sample: 2352
----
Label: true money
Number of Sample: 248
----
Label: internet
Number of Sample: 2067
----
Label: promotions
Number of Sample: 3005
----
Label: international dialing
Number of Sample: 457
----
Label: lost and stolen
Number of Sample: 231
----
Train Set Size 10740
Test Set Size 2689
post-moving:
Train Set Size 10762
Test Set Size 2667


Unnamed: 0,texts,texts_deepcut,action,object,destination
5580,จะสอบถามเรื่องการย้ายเครือข่าย จาก AIS ย้ายมาท...,จะ สอบถาม เรื่อง การ ย้าย เครือข่าย จาก AIS ย้...,buy,package,promotions
10716,สมัครไวไฟไปแล้ว ทำไมมันเล่นติดๆขัดๆอ่ะคะพี่,สมัคร ไวไฟ ไป แล้ว ทำไม มัน เล่น ติด ๆ ขัด ๆ อ...,enquire,internet,internet
3489,ค่ะเมื่อเช้าสมัครแพ็คเกจโนเล็จ บ. ตอนนี้ยังดูไ...,ค่ะ เมื่อ เช้า สมัคร แพ็คเกจโนเล็จ บ. ตอน นี้ ...,enquire,nontruemove,other queries


In [34]:
# Number of distinct texts per destination label, one table per split.
count_train = (
    train_df.groupby('destination')['texts']
    .nunique()
    .to_frame('count')
    .reset_index()
)
count_test = (
    test_df.groupby('destination')['texts']
    .nunique()
    .to_frame('count')
    .reset_index()
)

for heading, label_counts in (
    ('Label Count in train_df', count_train),
    ('Label Count in test_df', count_test),
):
    print(heading)
    display(label_counts)

# Write both splits to disk; to_csv is called with its defaults, so the row
# index is written as the first column.
for split_df, csv_name in (
    (train_df, 'mari-train-undup-balanced-label.csv'),
    (test_df, 'mari-test-undup-balanced-label.csv'),
):
    split_df.to_csv(csv_name)

Label Count in train_df


Unnamed: 0,destination,count
0,billing and payment,4043
1,international dialing,364
2,internet,1653
3,lost and stolen,184
4,other queries,1881
5,promotions,2380
6,true money,198


Label Count in test_df


Unnamed: 0,destination,count
0,billing and payment,1005
1,international dialing,88
2,internet,414
3,lost and stolen,47
4,other queries,470
5,promotions,588
6,true money,50
