In [None]:
import jieba
import pandas as pd
import re

In [None]:
# Load the spreadsheet into a DataFrame; later cells read its 'context' and
# 'emotion' columns.
df_train = pd.read_excel("data/data_3500.xlsx")

In [None]:
# Module-level stopword list: one entry per line of the stopword file, with
# leading/trailing whitespace removed from each line.
stopwords = []

with open('data/stopwords.txt', 'r', encoding='utf8') as f:
    stopwords.extend(line.strip() for line in f)
        
def load_corpus(path):
    """
    Load a corpus file and preprocess its text with ``processing``.

    Each line is split on its first two commas into three fields. The first
    field is discarded, the second is converted with ``int`` and the third
    (everything after the second comma) is passed through ``processing``.

    Returns a list of ``(processed_text, label)`` tuples in file order.
    Raises ``ValueError`` if a line has fewer than three comma-separated
    fields or if the second field is not an integer.
    """
    with open(path, "r", encoding="utf8") as corpus_file:
        # maxsplit=2 keeps any further commas inside the text field.
        records = (row.split(",", 2) for row in corpus_file)
        return [(processing(text), int(label)) for _, label, text in records]


def load_corpus_bert(path):
    """
    Load a corpus file and clean its text with ``processing_bert``.

    Each line is split on its first two commas into three fields. The first
    field is discarded, the second is converted with ``int`` and the third
    (everything after the second comma) is passed through
    ``processing_bert``.

    Returns a list of ``(cleaned_text, label)`` tuples in file order.
    Raises ``ValueError`` if a line has fewer than three comma-separated
    fields or if the second field is not an integer.
    """
    with open(path, "r", encoding="utf8") as corpus_file:
        # maxsplit=2 keeps any further commas inside the text field.
        records = (row.split(",", 2) for row in corpus_file)
        return [(processing_bert(text), int(label)) for _, label, text in records]

def get_stopword_list(file):
    """
    Read a UTF-8 stopword file and return its lines as a list of strings.

    Only newline characters are stripped from each line; any other
    whitespace in a line is kept as part of the entry.
    """
    stopword_list = []
    with open(file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stopword_list.append(raw_line.strip('\n'))
    return stopword_list
    
def clean_stopword(str, stopword_list):
    """
    Remove stopwords from a text.

    The text is segmented with ``jieba.lcut``; every token that appears in
    ``stopword_list`` is dropped and the remaining tokens are concatenated
    with no separator.

    Parameters:
        str: the text to filter. NOTE: this name shadows the builtin ``str``
            inside the function; it is kept so existing callers that pass it
            by keyword keep working.
        stopword_list: collection of tokens to remove.

    Returns the filtered text as a single string.
    """
    # A list/tuple makes every membership test a linear scan; hash it once so
    # each token lookup is O(1). Other container types are used as given.
    if isinstance(stopword_list, (list, tuple)):
        stopword_list = frozenset(stopword_list)
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(w for w in jieba.lcut(str) if w not in stopword_list)
    
def processing(text):
    """
    Clean a raw post and return its word tokens joined by single spaces.

    Steps:
      1. Replace ``{%...%}`` spans, ``@name`` mentions (up to the next space
         or the end of the string), ``【...】`` spans and zero-width spaces
         (U+200B) with a space.
      2. Segment the text with ``jieba.lcut`` and keep only the tokens for
         which ``str.isalpha()`` is true.
      3. Merge each standalone "不" token with the token that follows it, so
         the negation stays attached to the word it modifies. A "不" that is
         the last token is left unchanged.

    Returns the resulting tokens joined with spaces.
    """
    # data cleaning
    # Raw strings: "\{" and "\}" are not valid Python string escapes, so a
    # plain literal triggers an invalid-escape-sequence warning.
    text = re.sub(r"\{%.+?%\}", " ", text)          # remove {%xxx%} (Geolocation, Microblogging topics, etc)
    text = re.sub(r"@.+?( |$)", " ", text)          # remove @xxx (user name)
    text = re.sub(r"【.+?】", " ", text)             # remove 【xx】 (content not written by user)
    text = re.sub("\u200b", " ", text)              # remove zero-width spaces
    # word segmentation
    words = [w for w in jieba.lcut(text) if w.isalpha()]
    # Splice "不" (= "no"/"not") with the word after it, in one left-to-right
    # pass. The merged token is never exactly "不", so this gives the same
    # result as repeatedly merging the first remaining "不".
    merged = []
    tokens = iter(words)
    for token in tokens:
        if token == "不":
            follower = next(tokens, None)
            if follower is not None:
                token += follower
        merged.append(token)
    # Concatenate the tokens with spaces
    return " ".join(merged)


def processing_bert(text):
    """
    Clean a raw post without segmenting it.

    Replaces ``{%...%}`` spans, ``@name`` mentions (up to the next space or
    the end of the string) and ``【...】`` spans with a single space each and
    returns the resulting string.
    """
    # data cleaning
    # Raw strings: "\{" and "\}" are not valid Python string escapes, so a
    # plain literal triggers an invalid-escape-sequence warning.
    text = re.sub(r"\{%.+?%\}", " ", text)          # remove {%xxx%} (Geolocation, Microblogging topics, etc)
    text = re.sub(r"@.+?( |$)", " ", text)          # remove @xxx (user name)
    text = re.sub(r"【.+?】", " ", text)             # remove 【xx】 (content not written by user)
    return text

In [None]:
# Clean every entry of the 'context' column with processing(), then remove
# stopwords from the cleaned text.
# The whole column is assigned at once rather than through chained indexing
# (df_train['context'][i] = ...): chained assignment raises
# SettingWithCopyWarning and does not write back to the DataFrame when pandas
# Copy-on-Write is active. This form also works for any index, not only the
# default 0..n-1 labels.
df_train['context'] = df_train['context'].apply(
    lambda text: clean_stopword(processing(text), stopwords)
)

In [None]:
# Three-class cleaned data: write the full cleaned frame (index included),
# encoded as UTF-8 with a BOM.
df_train.to_csv('three_class.csv', encoding='utf_8_sig')


# Two-class cleaned data: drop the rows whose 'emotion' value is 0, then drop
# any row in which every column is NaN, and write what remains.
zero_label_rows = df_train.loc[df_train['emotion'] == 0].index
df_train = df_train.drop(zero_label_rows).dropna(axis=0, how='all')
df_train.to_csv('two_class.csv', encoding="utf_8_sig")