In [60]:
import pandas as pd
import jieba
import re
import numpy as np

In [49]:
def find_chinese(x):
    """Return ``x`` with every character outside U+4E00..U+9FA5 removed.

    Args:
        x: Input string.

    Returns:
        A string holding only the characters of ``x`` that fall inside the
        U+4E00..U+9FA5 range, in their original order.
    """
    # Negated character class: matches anything that is NOT in the kept range.
    non_chinese = r'[^\u4e00-\u9fa5]'
    return re.sub(non_chinese, '', x)

def replace_puncs(x):
    """Delete every character of ``x`` that is neither a word character nor whitespace.

    Args:
        x: Input string.

    Returns:
        ``x`` with all characters matching ``[^\\w\\s]`` removed.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', x)

def remove_blanks(x):
    """Delete each run of two or more consecutive whitespace characters.

    A single isolated whitespace character is left in place; only runs of
    length >= 2 are removed (and they are removed entirely, not collapsed).

    Args:
        x: Input string.

    Returns:
        ``x`` without any multi-character whitespace runs.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub('', x)

def cut_text(x):
    """Segment ``x`` with ``jieba.cut(..., cut_all=False)`` and return the tokens as a list.

    Args:
        x: Text to segment.

    Returns:
        A list with the items yielded by ``jieba.cut``, in order.
    """
    return [token for token in jieba.cut(x, cut_all=False)]

def tokenize(df):
    """Clean and segment the ``sentence`` column of ``df``.

    Each sentence is passed through ``remove_blanks``, ``replace_puncs`` and
    ``find_chinese`` (in that order) and the result is segmented with
    ``cut_text``.

    The input frame is not modified: the result is built on a fresh copy
    instead of writing scratch columns onto the caller's DataFrame.

    Args:
        df: DataFrame with a ``sentence`` column.

    Returns:
        A new DataFrame with two columns: the untouched ``sentence`` column
        and ``cutted_sentence``, which holds the token list for each row.
    """
    # Missing values arrive from read_csv as float NaN, which re.sub rejects
    # with a TypeError; treat them as empty text so they yield an empty token list.
    text = df['sentence'].fillna('').astype(str)
    cleaned = text.apply(remove_blanks).apply(replace_puncs).apply(find_chinese)

    result = df[['sentence']].copy()
    # .values assigns by position, so a non-unique index cannot misalign rows.
    result['cutted_sentence'] = cleaned.apply(cut_text).values
    return result

def join_tokens(df):
    """Add a ``reformed_sentence`` column with each token list joined by single spaces.

    The input frame is left untouched; ``DataFrame.assign`` returns a new
    frame carrying the extra column.

    Args:
        df: DataFrame with a ``cutted_sentence`` column whose cells are
            iterables of strings.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``reformed_sentence``.
    """
    return df.assign(reformed_sentence=df['cutted_sentence'].apply(' '.join))

def add_label(df, label):
    """Return a copy of ``df`` with a ``label`` column set to ``label``.

    The input frame is left untouched; ``DataFrame.assign`` returns a new
    frame carrying the extra column.

    Args:
        df: DataFrame to label.
        label: Value stored in the ``label`` column of every row.

    Returns:
        A new DataFrame with all columns of ``df`` plus ``label``.
    """
    return df.assign(label=label)

def preprocessing(df, label):
    """Run the full pipeline on ``df``: tokenize, join tokens, then attach ``label``.

    Args:
        df: DataFrame handed to ``tokenize``.
        label: Value handed to ``add_label``.

    Returns:
        A copy of the frame produced by ``add_label``.
    """
    tokenized = tokenize(df)
    joined = join_tokens(tokenized)
    labelled = add_label(joined, label)
    return labelled.copy()

In [17]:
# Load the negative and positive datasets from their CSV files (negative first).
negative, positive = [
    pd.read_csv(csv_path)
    for csv_path in ("data/negative/negative_data.csv", "data/positive/positive.csv")
]

In [51]:
# Only this many leading rows of the negative set are preprocessed.
NEGATIVE_SAMPLE_SIZE = 5000

# Hand the pipeline private copies so the raw frames are never modified by it
# and the head() slice cannot trigger a SettingWithCopyWarning.
positive_data = preprocessing(positive.copy(), 1)
negative_data = preprocessing(negative.head(NEGATIVE_SAMPLE_SIZE).copy(), 0)

In [53]:
# Fixed seed so a fresh Restart & Run All reproduces the same row order.
SHUFFLE_SEED = 42

# Stack both labelled frames, keep only the space-joined text and its label,
# and expose the text under the column name `sentence`.
data = pd.concat([positive_data, negative_data], axis = 0)
data = data[['reformed_sentence','label']].rename(columns = {'reformed_sentence':'sentence'})

# Shuffle rows by position with a seeded generator (the concatenated index
# contains repeated labels, so positional .iloc is used rather than .loc).
idx = np.random.default_rng(SHUFFLE_SEED).permutation(len(data))
data = data.iloc[idx,:]

In [69]:
# Persist the shuffled frame as CSV, without writing the index column.
data.to_csv(path_or_buf="data/train_data.csv", index=False)