In [1]:
import re
from pyvi import ViTokenizer
import ast
import random

In [2]:
def normalize(text):
    """Flatten *text* onto a single line and lowercase it.

    Every newline character is replaced by one space, then the whole
    string is converted to lowercase.
    """
    return text.replace('\n', ' ').lower()

def delete_hashtag(text):
    """Return *text* with every '#' plus its following word characters removed.

    Only the '#' and the run of word characters right after it are dropped.
    Surrounding whitespace and any trailing non-word characters are kept,
    so a tag such as '#q&a' leaves '&a' behind.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def delete_link(text):
    """Strip URL-like substrings from *text*.

    Anything that starts with 'http' and runs up to the next whitespace
    character is removed; the whitespace itself is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def remove_emojis(text):
    """Remove emoji and related symbol characters from *text*.

    Every run of characters matched by the character class below is deleted.
    The class lists several emoji ranges, but it also contains the broad
    ranges U+24C2..U+1F251 and U+10000..U+10FFFF, so it matches far more
    than emoji (CJK characters, for example, fall inside the first range).
    """
    # NOTE: the trailing '-' just before ']' is a literal hyphen, so ASCII
    # hyphens are stripped from the text as well.
    # The negative lookbehind for a newline never rejects a match, because
    # the class itself cannot match a newline.
    symbol_run = re.compile(r"""[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF\U00002702-\U000027B0\U000024C2-\U0001F251\U0001f926-\U0001f937\U00010000-\U0010ffff\u200d\u23cf\u23e9\u231a\ufe0f\u3030-]+(?<!\n)""", re.UNICODE)
    return symbol_run.sub('', text)

def encode_number(text):
    """Replace purely numeric tokens in *text* with '<number>'.

    The text is split on single spaces. A token made only of ASCII digits,
    optionally followed by one dot and more digits, becomes '<number>'.
    All other tokens are kept, and the result is re-joined with spaces.
    """
    numeric = re.compile(r'^[0-9]+(\.[0-9]+)?$')
    encoded = []
    for token in text.split(' '):
        encoded.append('<number>' if numeric.match(token) else token)
    return ' '.join(encoded)

def delete_onelen_token(text):
    """Drop tokens shorter than two characters from *text*.

    Splits on single spaces and keeps only tokens of length two or more.
    Empty strings produced by consecutive spaces are dropped as well.
    Surviving tokens are joined with one space.
    """
    return ' '.join(token for token in text.split(' ') if len(token) > 1)

def preprocessing(text):
    """Run the full cleaning pipeline on one raw post and return the result.

    The steps are applied in this fixed order: normalize, delete hashtags,
    delete links, remove emojis, tokenize with ViTokenizer.tokenize,
    encode numbers, and finally drop one-character tokens.
    """
    # Order matters: number encoding and short-token removal work on the
    # space-separated tokens produced by the tokenizer step.
    pipeline = (
        normalize,
        delete_hashtag,
        delete_link,
        remove_emojis,
        ViTokenizer.tokenize,
        encode_number,
        delete_onelen_token,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result

In [3]:
import pandas as pd

# Load the raw dataset; the 'utf-8-sig' codec skips a leading BOM if present.
df = pd.read_csv("data.csv", encoding='utf-8-sig')

# Pull the two columns used downstream out as plain Python lists.
texts, labels = (list(df[column]) for column in ('Contents', 'Hashtags'))

In [4]:
# Clean every post. NOTE: this rebinds `texts`, so re-running the cell
# applies the preprocessing a second time to already-cleaned text.
texts = list(map(preprocessing, texts))

# Parse each raw label string as a Python literal
# (presumably a list of hashtag strings -- confirm against data.csv).
p_labels = list(map(ast.literal_eval, labels))

In [17]:
# Hashtag classes the test sample is drawn from, one class at a time.
classes = ['#Q&A', '#cv', '#data', '#deep_learning', '#machine_learning', '#math', '#nlp', '#python', '#sharing']

# Fraction of each class's samples that is sent to the test set.
TEST_FRACTION = 0.1
# Fixed seed so the split is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

# Private generator: reproducible without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)
test_idx_set = set()

for clas in classes:
    # Indices of every sample whose parsed labels contain this class.
    idxes = [i for i, sample_labels in enumerate(p_labels) if clas in sample_labels]
    # Randomly take TEST_FRACTION (10%) of them, rounded down.
    num_elements_to_take = int(len(idxes) * TEST_FRACTION)
    random_sample = split_rng.sample(idxes, num_elements_to_take)
    # Set union: a sample drawn for several classes is only counted once.
    test_idx_set.update(random_sample)

# Keep `test_idxes` a list as before; sorting gives it a stable order.
test_idxes = sorted(test_idx_set)

In [18]:
# Build a set once so each membership test is O(1); testing against the
# list inside every comprehension made the split quadratic.
test_idx_lookup = set(test_idxes)

# Partition the sample positions a single time and reuse them below.
train_idxes = [i for i in range(len(texts)) if i not in test_idx_lookup]
held_out_idxes = [i for i in range(len(texts)) if i in test_idx_lookup]

# Labels are taken from `labels` (the raw strings), not from `p_labels`.
train_texts = [texts[i] for i in train_idxes]
train_labels = [labels[i] for i in train_idxes]

test_texts = [texts[i] for i in held_out_idxes]
test_labels = [labels[i] for i in held_out_idxes]

In [19]:
# Assemble both splits as two-column frames: cleaned text and raw label string.
train_df = pd.DataFrame(data={'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame(data={'text': test_texts, 'label': test_labels})

# Write each split without the index column; 'utf-8-sig' prepends a UTF-8 BOM.
train_df.to_csv("train_data.csv", index=False, encoding='utf-8-sig')
test_df.to_csv("test_data.csv", index=False, encoding='utf-8-sig')

In [20]:
# Size of the training split (samples whose index is not in test_idxes).
len(train_texts)

1048

In [21]:
# Number of rows in the held-out test frame.
len(test_df)

234