In [16]:
# 파일 읽기
import pandas as pd
file_path = "data/news-commentary-v18.en-zh.tsv"
data = pd.read_csv(file_path, sep="\t", header=None, names=["English", "Chinese"])

# 데이터 확인
print(data.head())  # 상위 5개 행 출력
data.shape

                                             English  \
0                    Is It Time to Give Up on 1.5°C?   
1     MILAN – Net-zero commitments are all the rage.   
2  Countries, companies, and others worldwide hav...   
3  But net-zero targets are not tantamount to lim...   
4             This is well understood among experts.   

                                             Chinese  
0                                    是时候放弃1.5°C目标了吗？  
1                                 发自米兰—净零承诺当前正处于风口上。  
2  世界各地的国家、企业和其他国家都承诺要在某个特定日期前消除温室气体净排放 — — 某些国家的...  
3  但净零目标并不等同于将全球变暖限制在巴黎气候协定的1.5°C目标或是任何特定变暖水平，而达成...  
4                                       专家们对此早已深有体会。  


(454819, 2)

In [17]:
# 결측치 확인
print(data.isnull().sum())
data = data.dropna()
print("결측치 제거 후")
print(data.isnull().sum())

English    11451
Chinese    11593
dtype: int64
결측치 제거 후
English    0
Chinese    0
dtype: int64


In [18]:
import re

def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text) # 특수문자 제거
    text = re.sub(r"\s+", " ", text) # 중복 공백 제거

print(data["English"].head())
print(data["Chinese"].head())
# 정규화 적용
print("정규화 적용")
data["English"] = data["English"].apply(clean_text)
data["Chinese"] = data["Chinese"].apply(clean_text)
print(data["English"].head())
print(data["Chinese"].head())

0                      Is It Time to Give Up on 1.5°C?
1       MILAN – Net-zero commitments are all the rage.
2    Countries, companies, and others worldwide hav...
3    But net-zero targets are not tantamount to lim...
4               This is well understood among experts.
Name: English, dtype: object
0                                      是时候放弃1.5°C目标了吗？
1                                   发自米兰—净零承诺当前正处于风口上。
2    世界各地的国家、企业和其他国家都承诺要在某个特定日期前消除温室气体净排放 — — 某些国家的...
3    但净零目标并不等同于将全球变暖限制在巴黎气候协定的1.5°C目标或是任何特定变暖水平，而达成...
4                                         专家们对此早已深有体会。
Name: Chinese, dtype: object
정규화 적용
0    None
1    None
2    None
3    None
4    None
Name: English, dtype: object
0    None
1    None
2    None
3    None
4    None
Name: Chinese, dtype: object


In [20]:
from sklearn.model_selection import train_test_split

# 데이터셋 분할
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

print(train_data.head())

       English Chinese
107596    None    None
302783    None    None
425620    None    None
136228    None    None
291444    None    None


In [24]:
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)  # 재시도b

In [27]:
sample_text = "Hello, how are you?"
inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True)
translated = model.generate(inputs["input_ids"])
print(tokenizer.decode(translated[0], skip_special_tokens=True))

你好,你好吗?


In [29]:
from nltk.translate.bleu_score import sentence_bleu

# 참조 문장(리스트의 리스트 형태로 제공)
reference = [["你好", "你好吗"]]  # 참조 문장

# 후보 문장(리스트 형태로 제공)
candidate = ["你好", "你好吗"]  # 모델 출력

# BLEU 점수 계산
bleu_score = sentence_bleu(reference, candidate)
print(f"BLEU Score: {bleu_score:.4f}")


BLEU Score: 0.0000


In [22]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/10.0 MB 16.7 MB/s eta 0:00:01
   --------------------------- ------------ 6.8/10.0 MB 24.6 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 26.1 MB/s eta 0:00:00
Downloading safetensors-0.4.5-cp312-none-win_amd64.whl (286 kB)
Installing collected packages: safetensors, transformers
Successfully installed safetensors-0.4.5 transformers-4.46.3
