In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# --- File format of the LCQMC splits ---------------------------------------
# Each split is a header-less text file; these describe how to parse a row.
LCQMC_SEP = '\t'
LCQMC_COL_NAMES = ['text_1', 'text_2', 'label']

# --- Locations ---------------------------------------------------------------
# Root folder holding the NLP datasets, relative to this notebook.
NLP_DATASETS_PATH = '../../../../NLP_Datasets/'
LCQMC_PATH = os.path.join(NLP_DATASETS_PATH, 'LCQMC')

# One path per split, all living directly inside LCQMC_PATH.
LCQMC_TRAIN_CSV, LCQMC_DEV_CSV, LCQMC_TEST_CSV = (
    os.path.join(LCQMC_PATH, split_file)
    for split_file in ('train.txt', 'dev.txt', 'test.txt')
)

In [3]:
# All three splits share one layout (LCQMC_SEP-delimited, no header row,
# columns named by LCQMC_COL_NAMES), so the parsing options are stated once.
read_opts = dict(sep=LCQMC_SEP, header=None, names=LCQMC_COL_NAMES)

train_df, dev_df, test_df = (
    pd.read_csv(split_path, **read_opts)
    for split_path in (LCQMC_TRAIN_CSV, LCQMC_DEV_CSV, LCQMC_TEST_CSV)
)

# Stack the splits (train, dev, test order) under a fresh 0..n-1 index so
# corpus-wide statistics can be computed on a single frame.
all_df = pd.concat([train_df, dev_df, test_df], ignore_index=True)

In [4]:
# Preview the first rows of the training split to sanity-check the parsed columns.
train_df.head()

Unnamed: 0,text_1,text_2,label
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1
2,大家觉得她好看吗,大家觉得跑男好看吗？,0
3,求秋色之空漫画全集,求秋色之空全集漫画,1
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0


In [5]:
# Preview the first rows of the dev split to sanity-check the parsed columns.
dev_df.head()

Unnamed: 0,text_1,text_2,label
0,开初婚未育证明怎么弄？,初婚未育情况证明怎么开？,1
1,谁知道她是网络美女吗？,爱情这杯酒谁喝都会醉是什么歌,0
2,人和畜生的区别是什么？,人与畜生的区别是什么！,1
3,男孩喝女孩的尿的故事,怎样才知道是生男孩还是女孩,0
4,这种图片是用什么软件制作的？,这种图片制作是用什么软件呢？,1


In [6]:
# Preview the first rows of the test split to sanity-check the parsed columns.
test_df.head()

Unnamed: 0,text_1,text_2,label
0,谁有狂三这张高清的,这张高清图，谁有,0
1,英雄联盟什么英雄最好,英雄联盟最好英雄是什么,1
2,这是什么意思，被蹭网吗,我也是醉了，这是什么意思,0
3,现在有什么动画片好看呢？,现在有什么好看的动画片吗？,1
4,请问晶达电子厂现在的工资待遇怎么样要求有哪些,三星电子厂工资待遇怎么样啊,0


## Missing values

In [7]:
# Non-null entries per column; any column whose count is below the total
# number of rows contains missing values.
all_df.count()

text_1    260068
text_2    260068
label     260068
dtype: int64

In [8]:
# (rows, columns) of the combined frame, for comparison with the per-column counts.
all_df.shape

(260068, 3)

In [9]:
# Distinct label values present across all splits (checks for unexpected codes).
all_df['label'].unique()

array([1, 0])

## Target Rate

In [10]:
# Mean of the 0/1 label, i.e. the fraction of rows labelled 1, reported in
# the order: all data, train, dev, test.
tuple(
    split_df['label'].mean()
    for split_df in (all_df, train_df, dev_df, test_df)
)

(0.5737960841010813, 0.5803757653937328, 0.5001136105430584, 0.5)

## Text Length (char-level, whitespace-stripped, otherwise unpreprocessed)

In [11]:
# Character count of every sentence after trimming leading/trailing whitespace.
# The vectorised `.str` accessor replaces the per-row `apply(lambda ...)`:
# it gives the same integer lengths for string values, but yields NaN for a
# missing entry instead of raising AttributeError on `float.strip`.
text_1_lens = all_df['text_1'].str.strip().str.len()
text_2_lens = all_df['text_2'].str.strip().str.len()

# Pool both sides of each pair into one series so the length distribution
# covers every sentence in the corpus (text_1 rows first, then text_2 rows).
text_lens = pd.concat([text_1_lens, text_2_lens], ignore_index=True)
text_lens.head()

0    16
1    12
2     8
3     9
4    18
dtype: int64

In [14]:
# Summary statistics of sentence length; the extra percentiles show the short
# and long tails of the distribution (the median is always included by pandas).
text_lens.describe(percentiles=[0.05, 0.10, 0.80, 0.90, 0.95, 0.99])

count    520136.000000
mean         10.932779
std           4.394407
min           2.000000
5%            6.000000
10%           7.000000
50%          10.000000
80%          13.000000
90%          15.000000
95%          19.000000
99%          31.000000
max         131.000000
dtype: float64