In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Root folder for every raw and derived dataset, relative to this notebook.
DATA_PATH = '../data/'

# ATEC corpus: two training files, read with the separator and column names below.
ATEC_PATH = os.path.join(DATA_PATH, 'ATEC')
ATEC_TRAIN_CSV, ATEC_TRAIN_ADD_CSV = (
    os.path.join(ATEC_PATH, file_name)
    for file_name in ('atec_nlp_sim_train.csv', 'atec_nlp_sim_train_add.csv')
)
ATEC_SEP = '\t'
ATEC_COL_NAMES = ['id', 'text_1', 'text_2', 'label']

# CCKS corpus: a single training text file, split on the separator below.
CCKS_PATH = os.path.join(DATA_PATH, 'CCKS_2018_3')
CCKS_TRAIN_TEXT = os.path.join(CCKS_PATH, 'task3_train.txt')
CCKS_SEP = '\t'
CCKS_COL_NAMES = ['text_1', 'text_2', 'label']

# Sizes handed to the splitter for the dev and test partitions.
ATEC_CCKS_DEV_SIZE = 10000
ATEC_CCKS_TEST_SIZE = 10000

# Output locations for the merged ATEC + CCKS train / dev / test files.
TRAIN_DEV_TEST_PATH = os.path.join(DATA_PATH, 'train_dev_test')
TRAIN_DEV_TEST_ATEC_CCKS_PATH = os.path.join(TRAIN_DEV_TEST_PATH, 'ATEC_CCKS')
ATEC_CCKS_TRAIN_CSV, ATEC_CCKS_DEV_CSV, ATEC_CCKS_TEST_CSV = (
    os.path.join(TRAIN_DEV_TEST_ATEC_CCKS_PATH, file_name)
    for file_name in ('train.csv', 'dev.csv', 'test.csv')
)

In [3]:
def read_aetc():
    """Load both ATEC training files into a single DataFrame.

    Reads ``ATEC_TRAIN_CSV`` and ``ATEC_TRAIN_ADD_CSV`` (no header row,
    separated by ``ATEC_SEP``, columns named by ``ATEC_COL_NAMES``), stacks
    them under a fresh 0..n-1 index and drops the ``id`` column.

    Any U+FEFF (byte-order-mark) characters embedded in the two text columns
    are removed so that they do not leak into the sentences.

    Returns:
        pandas.DataFrame: the concatenated rows without the ``id`` column.
    """
    atec_frames = [
        pd.read_csv(csv_path, sep=ATEC_SEP, header=None, names=ATEC_COL_NAMES)
        for csv_path in (ATEC_TRAIN_CSV, ATEC_TRAIN_ADD_CSV)
    ]
    atec_train_all = pd.concat(atec_frames, ignore_index=True).drop('id', axis=1)

    # A stray BOM shows up inside the text of the first row when the data is
    # previewed; strip it from string cells and leave NaN / non-string cells alone.
    for text_col in ('text_1', 'text_2'):
        atec_train_all[text_col] = atec_train_all[text_col].map(
            lambda cell: cell.replace('\ufeff', '') if isinstance(cell, str) else cell
        )
    return atec_train_all
    
def read_ccks():
    """Parse the CCKS training text file into a DataFrame.

    Each non-blank line of ``CCKS_TRAIN_TEXT`` is split on ``CCKS_SEP`` into
    exactly three fields: two texts (whitespace-stripped) and an integer label.
    Whitespace-only lines, such as a trailing empty line, are skipped.

    Returns:
        pandas.DataFrame: one row per parsed line, columns ``CCKS_COL_NAMES``.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields
            (the message carries the line number), or if its label is not an
            integer.
    """
    records = []
    with open(CCKS_TRAIN_TEXT, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Nothing to parse on a blank line; unpacking it would fail.
                continue
            fields = line.split(CCKS_SEP)
            if len(fields) != 3:
                raise ValueError(
                    f"{CCKS_TRAIN_TEXT}, line {line_no}: expected 3 fields, got {len(fields)}"
                )
            text_1, text_2, label = fields
            records.append([text_1.strip(), text_2.strip(), int(label.strip())])

    ccks_train_df = pd.DataFrame(records, columns=CCKS_COL_NAMES)
    return ccks_train_df

def read_aetc_ccks():
    """Return the ATEC rows followed by the CCKS rows as one DataFrame.

    The two frames come from ``read_aetc()`` and ``read_ccks()``; the result
    is re-indexed from 0 so the original per-source indices are discarded.
    """
    source_frames = [read_aetc(), read_ccks()]
    return pd.concat(source_frames, ignore_index=True)

In [4]:
## Merge the ATEC and CCKS datasets into a single DataFrame
atec_ccks_df = read_aetc_ccks()

In [5]:
## Preview the first rows of the merged data
atec_ccks_df.head()

Unnamed: 0,text_1,text_2,label
0,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,如何得知关闭借呗,想永久关闭借呗,0
4,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [6]:
## Target rate: mean of the label column over the merged data
atec_ccks_df['label'].mean()

0.33922371429841414

In [7]:
## Number of samples: (rows, columns) of the merged data
atec_ccks_df.shape

(202477, 3)

In [8]:
def train_dev_test_split(x, y, dev_size, test_size, random_state=None):
    """Split ``x`` / ``y`` into stratified train, dev and test partitions.

    The dev partition is carved out first; the test partition is then taken
    from what remains, and the rest becomes the train partition. Both steps
    stratify on the labels.

    Args:
        x: features (or full rows) to split.
        y: labels aligned with ``x``; used for stratification.
        dev_size: forwarded as ``test_size`` to the first ``train_test_split``.
        test_size: forwarded as ``test_size`` to the second ``train_test_split``.
        random_state: forwarded to both ``train_test_split`` calls. Pass an int
            for a reproducible split; the default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        tuple: ``(train_x, dev_x, test_x, train_y, dev_y, test_y)``.
    """
    rest_x, dev_x, rest_y, dev_y = train_test_split(
        x, y, test_size=dev_size, stratify=y, random_state=random_state
    )
    train_x, test_x, train_y, test_y = train_test_split(
        rest_x, rest_y, test_size=test_size, stratify=rest_y, random_state=random_state
    )
    return train_x, dev_x, test_x, train_y, dev_y, test_y

In [9]:
## Split into train, dev, test

# The full frame (label column included) is what gets split, so each partition
# can be saved as-is; the label series only drives the stratification.
atec_ccks_x_df = atec_ccks_df
atec_ccks_y_ser = atec_ccks_df.label

# train_test_split draws from NumPy's global RNG when no random_state is given,
# so seed it here to get the same partitions on every run of this cell.
ATEC_CCKS_SPLIT_SEED = 42
np.random.seed(ATEC_CCKS_SPLIT_SEED)

(atec_ccks_train_df,
 atec_ccks_dev_df,
 atec_ccks_test_df, _, _, _) = train_dev_test_split(atec_ccks_x_df,
                                                      atec_ccks_y_ser,
                                                      ATEC_CCKS_DEV_SIZE,
                                                      ATEC_CCKS_TEST_SIZE)

# Sanity check: the three partitions together account for every row.
assert (len(atec_ccks_train_df)
        + len(atec_ccks_dev_df)
        + len(atec_ccks_test_df)) == len(atec_ccks_df)

In [10]:
## Check the target rate of a split
def print_target_rate(df, dtype, dataset = 'atec_ccks', label = 'label'):
    """Print the mean of ``df[label]`` to four decimals.

    Args:
        df: frame holding the label column.
        dtype: name of the split shown in the message (e.g. 'train').
        dataset: dataset name shown in the message.
        label: name of the label column.
    """
    positive_share = df[label].mean()
    print(f"{dataset} - {dtype}, target rate: {positive_share:.4f}")

In [11]:
# Report the target rate of each partition, in train / dev / test order.
for split_name, split_df in (('train', atec_ccks_train_df),
                             ('dev', atec_ccks_dev_df),
                             ('test', atec_ccks_test_df)):
    print_target_rate(split_df, split_name)

atec_ccks - train, target rate: 0.3392
atec_ccks - dev, target rate: 0.3392
atec_ccks - test, target rate: 0.3392


In [12]:
## Save train, dev, test
def save_df(df, path, index = False, sep = '\t'):
    """Write ``df`` to ``path`` as delimited text, creating the folder if needed.

    Args:
        df: DataFrame to write.
        path: destination file path.
        index: whether to write the index column.
        sep: field separator.
    """
    # to_csv refuses to write into a folder that does not exist yet, which is
    # the situation on a fresh checkout; create the parent folder first.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(path, index=index, sep = sep)
    
# Write each partition as a comma-separated file without the index column.
# The output folder is created first so the cell also works on a fresh checkout.
os.makedirs(TRAIN_DEV_TEST_ATEC_CCKS_PATH, exist_ok=True)
for split_df, csv_path in ((atec_ccks_train_df, ATEC_CCKS_TRAIN_CSV),
                           (atec_ccks_dev_df, ATEC_CCKS_DEV_CSV),
                           (atec_ccks_test_df, ATEC_CCKS_TEST_CSV)):
    save_df(split_df, csv_path, sep=',')