In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
DATA_PATH = '../data/'

ATEC_PATH = os.path.join(DATA_PATH, 'ATEC')
ATEC_TRAIN_CSV = os.path.join(ATEC_PATH, 'atec_nlp_sim_train.csv')
ATEC_TRAIN_ADD_CSV = os.path.join(ATEC_PATH, 'atec_nlp_sim_train_add.csv')
ATEC_SEP = '\t'
ATEC_COL_NAMES = ['id', 'text_1', 'text_2', 'label']

CCKS_PATH = os.path.join(DATA_PATH, 'CCKS_2018_3')
CCKS_TRAIN_TEXT = os.path.join(CCKS_PATH, 'task3_train.txt')
CCKS_SEP = '\t'
CCKS_COL_NAMES = ['text_1', 'text_2', 'label']

# ATEC

In [3]:
atec_train_df = pd.read_csv(ATEC_TRAIN_CSV, sep = ATEC_SEP, header = None, names = ATEC_COL_NAMES)
atec_train_add_df = pd.read_csv(ATEC_TRAIN_ADD_CSV, sep = ATEC_SEP, header = None, names = ATEC_COL_NAMES)

In [4]:
atec_train_df.head()

Unnamed: 0,id,text_1,text_2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [5]:
atec_train_add_df.head()

Unnamed: 0,id,text_1,text_2,label
0,1,为何我无法申请开通花呗信用卡收款,支付宝开通信用卡花呗收款不符合条件怎么回事,1
1,2,花呗分期付款会影响使用吗,花呗分期有什么影响吗,0
2,3,为什么我花呗没有临时额度,花呗没有临时额度怎么可以负,0
3,4,能不能开花呗老兄,花呗逾期了还能开通,0
4,5,我的怎么开通花呗收钱,这个花呗是个什么啥？我没开通 我怎么有账单,0


In [6]:
atec_train_all = pd.concat([atec_train_df, atec_train_add_df], ignore_index=True)
atec_train_all.head()

Unnamed: 0,id,text_1,text_2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


## missings

In [7]:
atec_train_all.count()

id        102477
text_1    102477
text_2    102477
label     102477
dtype: int64

In [8]:
atec_train_all.shape

(102477, 4)

In [9]:
atec_train_all['label'].unique()

array([1, 0], dtype=int64)

## Target Rate

In [10]:
atec_train_all['label'].mean()

0.1823335968070884

## Text Length （Char-level，non-preprocessed）

In [11]:
atec_text_1_lens = atec_train_all['text_1'].apply(lambda x: len(x.strip()))
atec_text_2_lens = atec_train_all['text_2'].apply(lambda x: len(x.strip()))
atec_text_lens = pd.concat([atec_text_1_lens, atec_text_2_lens], ignore_index=True)
atec_text_lens.head()

0    11
1    15
2    11
3     8
4     6
dtype: int64

In [12]:
atec_text_lens.describe([.80, 0.90, .95])

count    204954.000000
mean         13.397738
std           6.123094
min           3.000000
50%          12.000000
80%          17.000000
90%          20.000000
95%          25.000000
max         112.000000
dtype: float64

# CCKS

In [13]:
records = []
with open(CCKS_TRAIN_TEXT, 'r', encoding='utf8') as f:
    for line in f:
        text_1, text_2, label = line.split(CCKS_SEP)
        record = [text_1.strip(), text_2.strip(), int(label.strip())]
        records.append(record)

ccks_train_df = pd.DataFrame(records, columns=CCKS_COL_NAMES)
ccks_train_df.head()

Unnamed: 0,text_1,text_2,label
0,用微信都6年，微信没有微粒贷功能,4。 号码来微粒贷,0
1,微信消费算吗,还有多少钱没还,0
2,交易密码忘记了找回密码绑定的手机卡也掉了,怎么最近安全老是要改密码呢好麻烦,0
3,你好 我昨天晚上申请的没有打电话给我 今天之内一定会打吗？,什么时候可以到账,0
4,"“微粒贷开通""",你好，我的微粒贷怎么没有开通呢,0


## missings

In [14]:
ccks_train_df.shape

(100000, 3)

In [15]:
ccks_train_df.count()

text_1    100000
text_2    100000
label     100000
dtype: int64

In [16]:
ccks_train_df['label'].unique()

array([0, 1], dtype=int64)

## Target rate

In [17]:
ccks_train_df['label'].mean()

0.5

## Text Length （Char-level，non-preprocessed）

In [18]:
ccks_text_1_lens = ccks_train_df['text_1'].apply(lambda x: len(x.strip()))
ccks_text_2_lens = ccks_train_df['text_2'].apply(lambda x: len(x.strip()))
ccks_text_lens = pd.concat([ccks_text_1_lens, ccks_text_2_lens], ignore_index=True)
ccks_text_lens.head()

0    16
1     6
2    20
3    29
4     7
dtype: int64

In [19]:
ccks_text_lens.describe([.80, 0.90, .95])

count    200000.000000
mean         11.930255
std           7.393814
min           1.000000
50%          10.000000
80%          16.000000
90%          20.000000
95%          25.000000
max         153.000000
dtype: float64