In [1]:
import sys

# Directory added to the import search path; presumably where the `textpair`
# package imported later in this notebook lives (relative to the working dir).
TEXTPAIR_PATH = '../../../../TextPair/'

# Guarded so that re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if TEXTPAIR_PATH not in sys.path:
    sys.path.append(TEXTPAIR_PATH)

In [2]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from textpair.single.common import TextNormalizer

%matplotlib inline

In [3]:
# --- Raw inputs: the three LCQMC splits, read as headerless tab-separated text ---
NLP_DATASETS_PATH = '../../../../NLP_Datasets/'
LCQMC_PATH = os.path.join(NLP_DATASETS_PATH, 'LCQMC')

TRAIN_CSV = os.path.join(LCQMC_PATH, 'train.txt')
DEV_CSV = os.path.join(LCQMC_PATH, 'dev.txt')
TEST_CSV = os.path.join(LCQMC_PATH, 'test.txt')

# --- Outputs: where the normalized splits are written ---
DATA_PATH = '../../data/'
TRAIN_DEV_TEST_PATH = os.path.join(DATA_PATH, 'train_dev_test')
TRAIN_DEV_TEST_LCQMC_PATH = os.path.join(TRAIN_DEV_TEST_PATH, 'LCQMC')

PROCESSED_PATH = os.path.join(TRAIN_DEV_TEST_LCQMC_PATH, 'processed')
# exist_ok=True creates the directory tree when it is missing and is a no-op
# when it already exists, without the check-then-create race of a separate
# os.path.exists() test. It also fails loudly (FileExistsError) if the path
# exists but is a regular file, instead of deferring the error to write time.
os.makedirs(PROCESSED_PATH, exist_ok=True)

PROCESSED_TRAIN_CSV = os.path.join(PROCESSED_PATH, 'train.csv')
PROCESSED_DEV_CSV = os.path.join(PROCESSED_PATH, 'dev.csv')
PROCESSED_TEST_CSV = os.path.join(PROCESSED_PATH, 'test.csv')

# Destination for the small subset of normalized training rows.
PROCESSED_TRAIN_SAMPLES_CSV = os.path.join(PROCESSED_PATH, 'train_samples.csv')

# Column names assigned on load (the input files have no header row) and the
# field separator used for both reading and writing.
COLUMNS = ['text_1', 'text_2', 'label']
SEP = '\t'

In [4]:
# The splits are read as headerless, tab-separated plain text rather than
# quoted CSV, so CSV quoting is disabled. With the default (QUOTE_MINIMAL) a
# sentence that begins with a double quote is parsed as a quoted field and can
# swallow the following tabs/newlines, silently merging rows.
train_df = pd.read_csv(TRAIN_CSV, sep=SEP, header=None, names=COLUMNS, quoting=csv.QUOTE_NONE)
dev_df = pd.read_csv(DEV_CSV, sep=SEP, header=None, names=COLUMNS, quoting=csv.QUOTE_NONE)
test_df = pd.read_csv(TEST_CSV, sep=SEP, header=None, names=COLUMNS, quoting=csv.QUOTE_NONE)

In [5]:
# Single TextNormalizer instance, reused for both text columns of every split.
preprocessor = TextNormalizer()

In [6]:
def preprocess(df, preprocessor, text_columns=('text_1', 'text_2')):
    """Return a copy of ``df`` with its text columns passed through ``preprocessor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    preprocessor : object
        Anything exposing ``transform(value)``; it is called once per cell of
        each selected column.
    text_columns : iterable of str, optional
        Names of the columns to transform. Defaults to ``('text_1', 'text_2')``,
        which is the behaviour this function had before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns replaced by their transformed
        values; every other column is carried over unchanged.

    Raises
    ------
    KeyError
        If a name in ``text_columns`` is not a column of ``df``.
    """
    _df = df.copy()
    for column in text_columns:
        # Read from the untouched input so each column is transformed exactly
        # once, independent of what has already been written to the copy.
        _df[column] = df[column].apply(preprocessor.transform)
    return _df

In [7]:
# Normalize each split with the shared normalizer; the raw frames are left as loaded.
_train_df, _dev_df, _test_df = (
    preprocess(split_df, preprocessor)
    for split_df in (train_df, dev_df, test_df)
)

# The first 1000 normalized training rows, kept as a small separate subset.
_train_samples_df = _train_df.head(1000)

In [8]:
# First rows of the raw training split, as loaded (before normalization).
train_df.head()

Unnamed: 0,text_1,text_2,label
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1
2,大家觉得她好看吗,大家觉得跑男好看吗？,0
3,求秋色之空漫画全集,求秋色之空全集漫画,1
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0


In [9]:
# First rows of the normalized training split; in the output below the
# punctuation (e.g. '，' and '？') present in the raw rows is gone.
_train_df.head()

Unnamed: 0,text_1,text_2,label
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1
1,我手机丢了我想换个手机,我想买个新手机求推荐,1
2,大家觉得她好看吗,大家觉得跑男好看吗,0
3,求秋色之空漫画全集,求秋色之空全集漫画,1
4,晚上睡觉带着耳机听音乐有什么害处吗,孕妇可以戴耳机听音乐吗,0


In [10]:
# Write every normalized frame to its destination as tab-separated text,
# with a header row and without the DataFrame index.
outputs = (
    (_train_df, PROCESSED_TRAIN_CSV),
    (_dev_df, PROCESSED_DEV_CSV),
    (_test_df, PROCESSED_TEST_CSV),
    (_train_samples_df, PROCESSED_TRAIN_SAMPLES_CSV),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False, sep=SEP)