### 파일 로드

In [None]:
import pandas as pd
import os
import gc
from collections import Counter

# Current working directory with a trailing slash; prefixed to dataset paths below.
base_dir = f"{os.getcwd()}/"

# Number of shard files the dataset is divided into.
FILE_SPLITS = 20

### 파일 분할

In [None]:
def split_dataframe(df, n):
    """Split ``df`` row-wise into ``n`` consecutive, independent chunks.

    Every chunk except the last holds ``len(df) // n`` rows. The last chunk
    also absorbs the remainder, so it can be up to ``n - 1`` rows longer.
    When ``len(df) < n`` the first ``n - 1`` chunks are empty and the last
    one holds every row.

    Args:
        df: DataFrame to split; rows are taken positionally via ``iloc``.
        n: Number of chunks to produce; must be at least 1.

    Returns:
        A list of ``n`` DataFrames, each a copy of its slice, in row order.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (n == 0)
    # or a silently empty result (n < 0).
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    chunk_size = len(df) // n
    # Chunk i starts at i * chunk_size; the final boundary is len(df) so the
    # last chunk picks up the leftover rows.
    bounds = [i * chunk_size for i in range(n)] + [len(df)]
    return [
        df.iloc[start:stop].copy()
        for start, stop in zip(bounds, bounds[1:])
    ]

In [None]:
# Load the raw CSV, shuffle every row (frac=1 keeps all of them), and
# renumber the index from zero.
df = (
    pd.read_csv("datasets/written.csv")
    .sample(frac=1)
    .reset_index(drop=True)
)

# Keep a pickle of the shuffled frame.
df.to_pickle("datasets/written/mixed_written.pkl")

# Preview the first rows.
df.head()

In [None]:
# Cut the frame into FILE_SPLITS consecutive pieces.
dfs = split_dataframe(df, FILE_SPLITS)

# The pieces are what gets written, so drop the full frame first.
del df
gc.collect()

# Write each piece to its own numbered CSV file.
for shard_no, df in enumerate(dfs):
    print(f"Writing : {shard_no}")
    shard_csv = f"datasets/written/written_{shard_no}.csv"
    df.to_csv(shard_csv, index=False)

## 토큰화

### BERT 토크나이저

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Private reference used by bert_tokenizer(). The tiktoken section of this
# file rebinds the generic name `tokenizer`, which would otherwise make
# bert_tokenizer() call encode_plus on the wrong object.
_bert_tokenizer_model = tokenizer


def bert_tokenizer(sentence, max_length=64):
    """Encode ``sentence`` with the multilingual BERT tokenizer.

    The sentence is encoded with special tokens added, then padded or
    truncated to exactly ``max_length`` ids.

    Args:
        sentence: Text to encode.
        max_length: Length to pad/truncate to (default 64).

    Returns:
        A two-element ``pd.Series``: the first row of ``input_ids`` (a
        PyTorch tensor, since ``return_tensors='pt'``) and the token strings
        for those ids. Returning a Series lets ``Series.apply`` expand the
        result into two columns.
    """
    encoding = _bert_tokenizer_model.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        return_attention_mask=False,
        return_tensors='pt',
        truncation=True
    )
    # A single sentence was encoded, so take the first (only) row.
    tokens = encoding['input_ids'][0]
    token_list = _bert_tokenizer_model.convert_ids_to_tokens(tokens)
    return pd.Series([tokens, token_list])

In [None]:
#df1[["bert_encodes", "bert_tokens"]] = df1["sentence"].apply(bert_tokenizer)

In [None]:
"""df2[["bert_encodes", "bert_tokens"]] = df2["sentence"].apply(bert_tokenizer)"""

### Mecab 토크나이저

In [None]:
from mecab import MeCab

# One shared MeCab instance for every call.
mecab = MeCab()


def mecab_tokenizer(sentence):
    """Return the result of ``mecab.morphs`` for ``sentence``."""
    return mecab.morphs(sentence)

In [None]:
"""df1["mecab_tokens"] = df1["sentence"].apply(mecab_tokenizer)
df1.to_pickle(base_dir + "datasets/translate.pkl")"""

### Tiktoken 토크나이저

In [None]:
import tiktoken

# Encoding to load; "p50k_base", "r50k_base", "gpt2", etc. can be used instead.
encoding_name = "cl100k_base"

# Build the tiktoken encoder for the chosen encoding.
tokenizer = tiktoken.get_encoding(encoding_name)


def tiktoken_tokenizer(sentence):
    """Encode ``sentence`` into tiktoken token ids.

    Any value that is not a ``str`` yields an empty list instead of being
    passed to the encoder.

    NOTE(review): ``encode`` is called with default arguments; confirm how
    the corpus should behave if a sentence contains special-token text.
    """
    if isinstance(sentence, str):
        return tokenizer.encode(sentence)
    return []

In [None]:
# Tokenize each CSV shard with tiktoken and store the result as a pickle.
for shard_no in range(FILE_SPLITS):
    print(f"tackling {shard_no}")
    csv_path = f"datasets/written/written_{shard_no}.csv"
    pkl_path = f"datasets/written/written_{shard_no}.pkl"

    df = pd.read_csv(csv_path)
    # New column holding the tokenizer result for each sentence.
    df["tiktoken_tokens"] = df["sentence"].apply(tiktoken_tokenizer)
    df.to_pickle(pkl_path)

    # Release the shard before loading the next one.
    del df
    gc.collect()

### 토큰 인코딩

In [None]:
# Frequency of every token id across all shards.
vocab_counter = Counter()
print("start")


# Walk every tokenized shard and count its token ids.
for i in range(FILE_SPLITS):
    print(f"file : {i}")
    df = pd.read_pickle(base_dir + f"datasets/written/written_{i}.pkl")
    tokenized_sentences = df['tiktoken_tokens']

    for sentence in tokenized_sentences:
        for word in sentence:
            # isinstance matches the check used when the sentences are
            # encoded later, so both steps agree on what counts as a token id.
            if isinstance(word, int):
                vocab_counter[word] += 1
            else:
                print(f"error {word}")

    # Drop the frame as well as the column: the frame keeps the column
    # alive, so deleting only the column reference frees nothing.
    del tokenized_sentences, df
    gc.collect()


print(f"Total unique words: {len(vocab_counter)}")


In [None]:
import pickle

# Keep the 30,000 most frequent tokens.
top_30000_words = vocab_counter.most_common(30000)

# Frequency rank r (0-based) becomes index r + 2; 0 and 1 are assigned to
# the 'PAD' and 'OOV' entries below.
word_to_index = {
    token: slot
    for slot, (token, _) in enumerate(top_30000_words, start=2)
}
word_to_index['PAD'] = 0
word_to_index['OOV'] = 1

# Print the mapping as a sanity check.
print(f"Top 30000 words mapping: {word_to_index}")

# Save the mapping to disk (optional).
vocab_path = base_dir + "datasets/written/vocab_ver1.pkl"
with open(vocab_path, "wb") as f:
    pickle.dump(word_to_index, f)

In [None]:
# Index given to tokens that are not in the vocabulary.
oov_index = word_to_index['OOV']


def _encode_tokens(tokens):
    """Map each token id to its vocabulary index; anything unknown becomes OOV."""
    # A non-int token is treated as out-of-vocabulary rather than emitted as
    # '', so every encoded sentence stays a homogeneous list of ints.
    return [
        word_to_index.get(word, oov_index) if isinstance(word, int) else oov_index
        for word in tokens
    ]


# Add the encoded column to every shard and overwrite the shard in place.
for i in range(FILE_SPLITS):
    print(i)
    df = pd.read_pickle(base_dir + f"datasets/written/written_{i}.pkl")
    df['encoded_sentence'] = df['tiktoken_tokens'].apply(_encode_tokens)
    df.to_pickle(base_dir + f"datasets/written/written_{i}.pkl")
    del df
    gc.collect()


## author 인코딩

In [None]:
# Every distinct author value seen across the shards.
author_set = set()


for i in range(FILE_SPLITS):
    print(i)
    gc.collect()
    df = pd.read_pickle(f"datasets/written/written_{i}.pkl")
    # In-place update avoids copying the accumulated set on every shard.
    author_set.update(df['author'].to_list())
    df = None


# Assign ids from a sorted order so the mapping is the same on every run;
# raw set iteration order is not stable across interpreter runs for strings.
# key=str keeps the sort usable even if the values are not all strings.
author_dict = {author: i for i, author in enumerate(sorted(author_set, key=str))}

len(author_dict)

In [None]:
import json

# Write the author -> id table to JSON.
with open(base_dir + "datasets/author_encoding.json", "w") as json_file:
    json.dump(author_dict, json_file, indent=4)


def _author_to_id(author):
    """Look up the id of ``author``; raises KeyError for an unknown author."""
    return author_dict[author]


# Add the encoded author column to every shard and overwrite the shard.
for shard_no in range(FILE_SPLITS):
    print(shard_no)
    shard_path = f"datasets/written/written_{shard_no}.pkl"

    df = pd.read_pickle(shard_path)
    df['encoded_author'] = df['author'].apply(_author_to_id)
    df.to_pickle(shard_path)

    # Drop the reference so the shard can be reclaimed before the next load.
    df = None
    gc.collect()


len(author_set)

## 데이터 압축 (인코딩된 컬럼만 유지)

In [None]:
# Only these two columns are kept in each shard.
keep_columns = ['encoded_sentence', 'encoded_author']

for shard_no in range(FILE_SPLITS):
    print(shard_no)
    shard_path = f"datasets/written/written_{shard_no}.pkl"

    # Reload the shard, keep the selected columns, and overwrite the file.
    df = pd.read_pickle(shard_path)[keep_columns]
    df.to_pickle(shard_path)

    df = None
    gc.collect()

In [15]:
# Load every shard first and concatenate once. Calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration and starts from an
# empty DataFrame.
shards = []

for i in range(FILE_SPLITS):
    print(i)
    shards.append(pd.read_pickle(f"datasets/written/written_{i}.pkl"))

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df = pd.concat(shards, ignore_index=True) if shards else pd.DataFrame()

# The individual shards are no longer needed once merged.
del shards
gc.collect()

df.to_pickle("datasets/written_total.pkl")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


## 장르 인코딩

In [None]:
#df = pd.read_pickle(base_dir +  f'data/all.pkl')

In [None]:
import json

# NOTE(review): this cell reads 'Classify_1' and 'Classify_2' from `df`; the
# loader cell just above is commented out, so confirm which frame is meant
# to be in `df` here.
# Combined label of the form "<Classify_1>/<Classify_2>".
df['class'] = df['Classify_1'] + "/" + df['Classify_2']

classify_set = set(df['class'].to_list())

# Assign ids from a sorted order so the mapping is the same on every run;
# raw set iteration order is not stable across interpreter runs for strings.
# key=str keeps the sort usable even if the values are not all strings.
classify_dict = {
    label: idx
    for idx, label in enumerate(sorted(classify_set, key=str))
}

# Write the class -> id table to JSON.
with open(base_dir + "data/class_encoding.json", "w") as json_file:
    json.dump(classify_dict, json_file, indent=4)

df['encoded_class'] = df['class'].apply(lambda x: classify_dict[x])

In [None]:
# Save the labeled frame as a pickle under data/.
df.to_pickle(base_dir + "data/labeled_compact.pkl")