In [115]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from IPython.display import display

In [116]:
# --- Configuration -----------------------------------------------------------
test_portion = 0.20  # fraction of rows held out as test data (used as test_size downstream)
random_seed = 0      # seed for Python's `random` module and for the shuffle below
#################
random.seed(random_seed)

# --- Load the previously published train / test splits -----------------------
old_train_df = pd.read_csv('mari_train.csv')
old_test_df = pd.read_csv('mari_test.csv')
rows_before = len(old_train_df) + len(old_test_df)

# Pool both splits so duplicates that straddle train and test are caught too.
merged_df = pd.concat([old_train_df, old_test_df])

# A row is a duplicate when both its 'texts' and 'destination' values repeat;
# the first occurrence is kept.
undup_df = merged_df.drop_duplicates(subset=['texts', 'destination'])

print('Got rid of %d duplicated rows' % (rows_before - len(undup_df)))

# Randomise the row order reproducibly before splitting.
undup_df = shuffle(undup_df, random_state=random_seed)

Got rid of 2810 duplicated rows


In [117]:
# Split each label separately so that every label contributes the same
# test fraction (test_portion) to the test set.
train_ls = []
test_ls = []
for label in undup_df['destination'].unique():
    # All rows that carry the current label.
    label_df = undup_df[undup_df.destination == label]

    print('Label: %s' % (label))
    print('Number of Sample: %d' % label_df.shape[0])
    print('----')

    label_train, label_test = train_test_split(
        label_df,
        test_size=test_portion,
        random_state=random_seed,
    )
    train_ls.append(label_train)
    test_ls.append(label_test)

# Stitch the per-label pieces back together into the final splits.
train_df = pd.concat(train_ls)
test_df = pd.concat(test_ls)

print('Train Set Size', len(train_df))
print('Test Set Size',len(test_df))
display(train_df[:3])

Label: true money
Number of Sample: 248
----
Label: promotions
Number of Sample: 2968
----
Label: lost and stolen
Number of Sample: 231
----
Label: billing and payment
Number of Sample: 5048
----
Label: internet
Number of Sample: 2067
----
Label: other queries
Number of Sample: 2351
----
Label: international dialing
Number of Sample: 452
----
Train Set Size 10688
Test Set Size 2677


Unnamed: 0,texts,texts_deepcut,action,object,destination
9825,พี่สอบถามนิดนึงพอดีว่าพี่เติมเงินผิดอะ จริงๆต้...,พี่ สอบถาม นิดนึง พอดี ว่า พี่ เติม เงิน ผิด อ...,enquire,truemoney,true money
298,เติมเงินผิดระบบจะเติมไว้โทรแต่เติมไปทรูมันนี่ท...,เติม เงิน ผิด ระบบ จะ เติม ไว้ โทร แต่ เติม ไป...,enquire,truemoney,true money
5617,จะสอบถามเรื่องบัตรทรูมันนี่ มันโอนเข้าระบบทรูม...,จะ สอบถาม เรื่อง บัตร ทรู มัน นี่ มัน โอน เข้า...,enquire,truemoney,true money


In [118]:
# Sanity check: compare the per-label counts of the two splits.
# Note that nunique() counts *distinct* 'texts' values per label (missing
# values are not counted), not raw rows.
print('Label Count in train_df')
count_train = train_df.groupby('destination')['texts'].nunique().to_frame('count').reset_index()
display(count_train)

print('Label Count in test_df')
count_test = test_df.groupby('destination')['texts'].nunique().to_frame('count').reset_index()
display(count_test)

# Persist the de-duplicated training split.
# to_csv is a DataFrame method; pandas has no module-level `pd.to_csv`, so the
# previous `pd.to_csv(...)` call raised AttributeError and wrote nothing.
train_df.to_csv('mari-train-undup-balanced label.csv')
# NOTE(review): test_df is not written to disk in this cell; confirm whether
# it is saved elsewhere or needs its own to_csv call.

Unnamed: 0,destination,count
0,billing and payment,4038
1,international dialing,361
2,internet,1653
3,lost and stolen,184
4,other queries,1880
5,promotions,2374
6,true money,198


Unnamed: 0,destination,count
0,billing and payment,1010
1,international dialing,91
2,internet,414
3,lost and stolen,47
4,other queries,471
5,promotions,594
6,true money,50
