In [1]:
import sys

# Location of the TextPair checkout, relative to the notebook's working directory.
TEXTPAIR_PATH = '../../../TextPair/'

# Make the `textpair` package importable. The membership guard keeps the cell
# idempotent: re-running it does not pile up duplicate entries on sys.path.
if TEXTPAIR_PATH not in sys.path:
    sys.path.append(TEXTPAIR_PATH)

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from textpair.single.common import TextNormalizer

%matplotlib inline

In [3]:
# --- Input locations: raw ATEC/CCKS train/dev/test splits ---
DATA_PATH = '../data/'
TRAIN_DEV_TEST_PATH = os.path.join(DATA_PATH, 'train_dev_test')
TRAIN_DEV_TEST_ATEC_CCKS_PATH = os.path.join(TRAIN_DEV_TEST_PATH, 'ATEC_CCKS')
ATEC_CCKS_TRAIN_CSV = os.path.join(TRAIN_DEV_TEST_ATEC_CCKS_PATH, 'train.csv')
ATEC_CCKS_DEV_CSV = os.path.join(TRAIN_DEV_TEST_ATEC_CCKS_PATH, 'dev.csv')
ATEC_CCKS_TEST_CSV = os.path.join(TRAIN_DEV_TEST_ATEC_CCKS_PATH, 'test.csv')

# --- Output locations: normalized copies of the same splits ---
PROCESSED_PATH = os.path.join(TRAIN_DEV_TEST_ATEC_CCKS_PATH, 'processed')
# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(PROCESSED_PATH, exist_ok=True)

PROCESSED_TRAIN_CSV = os.path.join(PROCESSED_PATH, 'train.csv')
PROCESSED_DEV_CSV = os.path.join(PROCESSED_PATH, 'dev.csv')
# NOTE(review): 'text.csv' looks like a typo for 'test.csv' (the raw split is
# 'test.csv'). Left unchanged because other notebooks may already read this
# filename — confirm and rename in both places if so.
PROCESSED_TEST_CSV = os.path.join(PROCESSED_PATH, 'text.csv')

# Column names of the split files and the field separator used to read/write them.
COLUMNS = ['text_1', 'text_2', 'label']
SEP = '\t'

In [4]:
# Read the raw train/dev/test splits, in that order, using the shared separator.
train_df, dev_df, test_df = (
    pd.read_csv(split_csv, sep=SEP)
    for split_csv in (ATEC_CCKS_TRAIN_CSV, ATEC_CCKS_DEV_CSV, ATEC_CCKS_TEST_CSV)
)

In [5]:
# Single normalizer instance, built with default arguments; preprocess() applies
# its `transform` method to every value of the two text columns.
# NOTE(review): the sample rows displayed further down suggest `transform` drops
# punctuation such as the full-width comma — confirm against TextNormalizer.
preprocessor = TextNormalizer()

In [6]:
def preprocess(df, preprocessor):
    """Return a copy of ``df`` whose text columns have been normalized.

    Args:
        df: DataFrame containing ``text_1`` and ``text_2`` columns.
        preprocessor: object exposing ``transform(value)``; it is called once
            per cell, on all of ``text_1`` first and then all of ``text_2``.

    Returns:
        A new DataFrame with the same columns as ``df``; ``text_1`` and
        ``text_2`` hold the transformed values and every other column is
        copied unchanged. ``df`` itself is not modified.
    """
    normalized = df.copy()
    for column in ('text_1', 'text_2'):
        # Read from the untouched input frame, write into the copy.
        normalized[column] = df[column].apply(preprocessor.transform)
    return normalized

In [7]:
# Normalize every split with the same preprocessor; the raw frames stay untouched.
_train_df, _dev_df, _test_df = (
    preprocess(split_df, preprocessor)
    for split_df in (train_df, dev_df, test_df)
)

In [8]:
# Show the first rows of the raw training split, for comparison with the normalized copy.
train_df.head()

Unnamed: 0,text_1,text_2,label
0,蚂蚁借呗还可以分期还款吗,借呗可以分期还款吗，每个月还一部分的那种,1
1,延期1天还款,27号是还款日，28号还，这样是逾期吗,0
2,我花呗提现额度怎么开不了,为什么花呗不你提出来,0
3,未满足银行要求,怎样满足微重银行审批,0
4,商铺可以开通花呗分期吗,我再办一张银行卡可以开通花呗吗,0


In [9]:
# Show the first rows of the training split after preprocess() was applied.
_train_df.head()

Unnamed: 0,text_1,text_2,label
0,蚂蚁借呗还可以分期还款吗,借呗可以分期还款吗每个月还一部分的那种,1
1,延期1天还款,27号是还款日28号还这样是逾期吗,0
2,我花呗提现额度怎么开不了,为什么花呗不你提出来,0
3,未满足银行要求,怎样满足微重银行审批,0
4,商铺可以开通花呗分期吗,我再办一张银行卡可以开通花呗吗,0


In [10]:
# Write each normalized split to its processed path, without the index column.
for _frame, _out_csv in (
    (_train_df, PROCESSED_TRAIN_CSV),
    (_dev_df, PROCESSED_DEV_CSV),
    (_test_df, PROCESSED_TEST_CSV),
):
    _frame.to_csv(_out_csv, sep=SEP, index=False)