In [1]:
import pandas as pd
import re
from koeda import EDA

In [2]:
# Korean stopwords removed by clean_text. Built once as a frozenset so each
# membership test is a hash lookup instead of a scan over a list that was
# rebuilt on every call (duplicate entries of the original list are dropped).
_KOREAN_STOPWORDS = frozenset({
    '이', '있', '하', '것', '들', '그', '되', '수', '보', '않', '없', '나', '사람', '주', '아니',
    '등', '같', '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하',
    '때문', '그것', '두', '말하', '알', '그러나', '받', '못하', '그런', '또', '문제', '더', '사회',
    '많', '그리고', '좋', '크', '따르', '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하',
    '그러', '속', '하나', '집', '살', '모르', '적', '월', '데', '자신', '안', '어떤', '내', '경우',
    '명', '생각', '시간', '그녀', '다시', '이런', '앞', '보이', '번', '다른', '어떻', '여자', '개',
    '전', '사실', '이렇', '점', '싶', '정도', '좀', '원', '잘', '통하', '소리', '놓',
})


def clean_text(text):
    """Normalize a conversation string.

    The text is lowercased; every character that is not a Hangul syllable
    (가-힣), a lowercase ASCII letter, or whitespace is replaced by a space;
    whitespace runs are collapsed; and whitespace-delimited tokens that exactly
    match an entry of ``_KOREAN_STOPWORDS`` are dropped.

    Args:
        text: The text to clean. ``None`` and float NaN (the value pandas
            produces for an empty CSV cell) are treated as empty text; any
            other non-string value is converted with ``str`` first.

    Returns:
        The cleaned text with tokens joined by single spaces, or ``''`` when
        nothing remains.
    """
    # Missing values would otherwise fail on ``.lower()`` with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase, then blank out digits, punctuation and other symbols.
    text = re.sub(r'[^가-힣a-z\s]', ' ', str(text).lower())

    # ``split()`` already collapses whitespace, so one pass both normalizes
    # spacing and tokenizes for stopword removal.
    # NOTE(review): matching is per whitespace token, not per morpheme, so a
    # stopword stem embedded in a longer token is left intact — confirm this
    # is the intended behaviour.
    return ' '.join(word for word in text.split() if word not in _KOREAN_STOPWORDS)

In [5]:
# Read the raw training conversations.
train = pd.read_csv("train.csv")

# Encode the known class names as integer ids, in declaration order.
# Series.map leaves NaN for any class name that is not a key of label_dict.
label_dict = {
    class_name: class_id
    for class_id, class_name in enumerate(
        ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화']
    )
}
encoded_labels = train['class'].map(label_dict)
train['label_encoded'] = encoded_labels

# Build the two koeda augmenters; both use the Okt morpheme analyzer and
# differ only in which alpha is raised to 0.3.
analyzer_options = {"morpheme_analyzer": "Okt"}
eda_sr = EDA(alpha_sr=0.3, **analyzer_options)
eda_ri = EDA(alpha_ri=0.3, **analyzer_options)

# Synonym Replacement (SR) pass
def apply_sr(text):
    """Augment one conversation with the ``eda_sr`` augmenter.

    The text is passed as a one-element list and the first returned item is
    used. If the augmenter returns an empty result, the input text is returned
    unchanged.
    """
    results = eda_sr([text])
    if not results:
        return text
    return results[0]

# NOTE(review): eda_sr is a general EDA instance with only alpha_sr raised;
# confirm against the koeda docs that it applies synonym replacement alone
# rather than a mix of EDA operations.
sr_texts = train['conversation'].apply(apply_sr)
train['conversation_sr'] = sr_texts

# Write the SR-augmented conversations next to their ids and classes.
sr_columns = ['idx', 'class', 'conversation_sr']
train[sr_columns].to_csv("train_sr.csv", index=False)

# Random Insertion (RI) pass
def apply_ri(text):
    """Augment one conversation with the ``eda_ri`` augmenter.

    The text is passed as a one-element list and the first returned item is
    used. If the augmenter returns an empty result, the input text is returned
    unchanged.
    """
    results = eda_ri([text])
    if not results:
        return text
    return results[0]

# NOTE(review): eda_ri is a general EDA instance with only alpha_ri raised;
# confirm against the koeda docs that it applies random insertion alone
# rather than a mix of EDA operations.
ri_texts = train['conversation'].apply(apply_ri)
train['conversation_ri'] = ri_texts

# Write the RI-augmented conversations next to their ids and classes.
ri_columns = ['idx', 'class', 'conversation_ri']
train[ri_columns].to_csv("train_ri.csv", index=False)

In [6]:
# 3. Reload the synonym-replaced texts, clean each one with clean_text, and
#    export the cleaned column alongside idx and class.
train_sr = pd.read_csv('train_sr.csv')
sr_cleaned = train_sr['conversation_sr'].apply(clean_text)
train_sr['conversation_sr_cleaned'] = sr_cleaned
sr_cleaned_columns = ['idx', 'class', 'conversation_sr_cleaned']
train_sr[sr_cleaned_columns].to_csv('train_sr_cleaned.csv', index=False)

In [7]:
# 4. Reload the random-insertion texts, clean each one with clean_text, and
#    export the cleaned column alongside idx and class.
train_ri = pd.read_csv('train_ri.csv')
ri_cleaned = train_ri['conversation_ri'].apply(clean_text)
train_ri['conversation_ri_cleaned'] = ri_cleaned
ri_cleaned_columns = ['idx', 'class', 'conversation_ri_cleaned']
train_ri[ri_cleaned_columns].to_csv('train_ri_cleaned.csv', index=False)