#### [26_01_22_과제]
- 알파벳을 사용하는 언어는 알파벳 빈도의 차이로 언어를 식별할 수 있습니다.
- 해당 데이터셋을 활용해서 언어 식별 모델을 생성하세요.
- 데이터셋
  * train 폴더 =>  나라영문2글자-숫자.txt
  * test 폴더  =>  나라영문2글자-숫자.txt

- 데이터셋 부족 시 Wikipedia 사이트에서 추가 가능 합니다.

In [1]:
import os
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader


In [2]:
# 프랑스어 악센트 매핑
accent_map = {
    'é':'e','è':'e','ê':'e','ë':'e',
    'à':'a','â':'a',
    'ç':'c',
    'î':'i','ï':'i',
    'ô':'o',
    'ù':'u','û':'u',
}


In [3]:

# ============================================================
# 1) train/test 폴더에서 파일 리스트 가져오기
#    - 파일명: "en-1.txt" 형태
# ============================================================
TRAIN_DIR = "./Data/train"
TEST_DIR  = "./Data/test"

train_files = sorted([fn for fn in os.listdir(TRAIN_DIR) if fn.endswith(".txt")])
test_files  = sorted([fn for fn in os.listdir(TEST_DIR)  if fn.endswith(".txt")])

print("train_files:", train_files[:5], "...")
print("test_files :", test_files[:5], "...")


alphabet = [chr(ord('a') + i) for i in range(26)]  # a~z

train_files: ['en-1.txt', 'en-2.txt', 'en-3.txt', 'en-4.txt', 'en-5.txt'] ...
test_files : ['en-1.txt', 'en-2.txt', 'fr-3.txt', 'fr-4.txt', 'id-5.txt'] ...


In [4]:

# ============================================================
# 2) 파일 읽어서 -> 26차원 알파벳 빈도 비율
#    - 소문자로 치환
#    - 악센트 -> 기본 알파벳
#    - a~z만 남기기
# ============================================================
def normalize_char_simple(c):
    return accent_map.get(c, c)

def file_to_freq_vec(folder, filename):
    with open(os.path.join(folder, filename), "r", encoding="utf-8", errors="ignore") as f:
        s = f.read()

    # 알파벳만 + 소문자 + 악센트 매핑
    chars = [normalize_char_simple(ch.lower()) for ch in s if ch.isalpha()]
    
    # 알파벳 외의 다른 문자들 지우기 ex) 한자
    chars = [c for c in chars if 'a' <= c <= 'z']


    # if len(chars) == 0:
    #     return [0.0] * 26

    cnt = Counter(chars)
    total = sum(cnt.values())
    vec = [cnt.get(a, 0) / total for a in alphabet]  # 비율(정규화)
    return vec


In [5]:

# ============================================================
# 3) 학습용 X,y 만들기
#    - 라벨: 파일명 앞 2글자
# ============================================================
X_train, y_train_str = [], []
for fn in train_files:
    label = fn.split("-")[0]  # 파일명에서 라벨 가져오기
    vec = file_to_freq_vec(TRAIN_DIR, fn)
    X_train.append(vec)
    y_train_str.append(label)

# 라벨 인코딩: 문자열 라벨 -> 정수 라벨
labels = sorted(set(y_train_str))
label_to_idx = {lab:i for i, lab in enumerate(labels)}
idx_to_label = {i:lab for lab, i in label_to_idx.items()}

y_train = [label_to_idx[lab] for lab in y_train_str]

print("labels:", labels)
print("label_to_idx:", label_to_idx)

# ============================================================
# 4) 테스트용 X,y 만들기
# ============================================================
X_test, y_test_str = [], []
for fn in test_files:
    label = fn.split("-")[0]
    vec = file_to_freq_vec(TEST_DIR, fn)
    X_test.append(vec)
    y_test_str.append(label)

y_test = [label_to_idx[lab] for lab in y_test_str]  # train에 없는 라벨이 있으면 에러 (보통 없음)


labels: ['en', 'fr', 'id', 'tl']
label_to_idx: {'en': 0, 'fr': 1, 'id': 2, 'tl': 3}


In [6]:

# ============================================================
# 5) Tensor로 변환 + DataLoader
# ============================================================
X_train_ts = torch.tensor(X_train, dtype=torch.float32)  
y_train_ts = torch.tensor(y_train, dtype=torch.long)     
X_test_ts  = torch.tensor(X_test,  dtype=torch.float32)
y_test_ts  = torch.tensor(y_test,  dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_ts, y_train_ts), batch_size=8, shuffle=True)
test_loader  = DataLoader(TensorDataset(X_test_ts,  y_test_ts),  batch_size=16, shuffle=False)


In [7]:

# ============================================================
# 6) 딥러닝 모델(MLP) 정의
#    - 출력은 logits (CrossEntropyLoss가 softmax 포함)
# ============================================================
class LangMLP(nn.Module):
    def __init__(self, in_dim=26, num_classes=4):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)  # logits

device = "cuda" if torch.cuda.is_available() else "cpu"
model = LangMLP(in_dim=26, num_classes=len(labels)).to(device)

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)


In [8]:

# ============================================================
# 7) 학습 루프 + 테스트 평가
# ============================================================
def accuracy(logits, y):
    pred = logits.argmax(dim=1)
    return (pred == y).float().mean().item()

EPOCHS = 200
for epoch in range(1, EPOCHS + 1):
    # ---- train
    model.train()
    tr_loss_sum, tr_acc_sum = 0.0, 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb)
        loss = loss_fn(logits, yb)

        opt.zero_grad()
        loss.backward()
        opt.step()

        tr_loss_sum += loss.item()
        tr_acc_sum += accuracy(logits, yb)

    tr_loss = tr_loss_sum / len(train_loader)
    tr_acc  = tr_acc_sum  / len(train_loader)

    # ---- test
    model.eval()
    with torch.no_grad():
        te_loss_sum, te_acc_sum = 0.0, 0.0
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            te_loss_sum += loss.item()
            te_acc_sum += accuracy(logits, yb)

        te_loss = te_loss_sum / len(test_loader)
        te_acc  = te_acc_sum  / len(test_loader)

    if epoch % 10 == 0 or epoch == 1:
        print(f"[{epoch:03}] train loss={tr_loss:.4f}, acc={tr_acc:.4f} | test loss={te_loss:.4f}, acc={te_acc:.4f}")


[001] train loss=1.3899, acc=0.2500 | test loss=1.3873, acc=0.2500
[010] train loss=1.3857, acc=0.2500 | test loss=1.3853, acc=0.2500
[020] train loss=1.3800, acc=0.2917 | test loss=1.3825, acc=0.2500
[030] train loss=1.3757, acc=0.2917 | test loss=1.3780, acc=0.2500
[040] train loss=1.3644, acc=0.6250 | test loss=1.3684, acc=0.6250
[050] train loss=1.3410, acc=0.7083 | test loss=1.3463, acc=0.7500
[060] train loss=1.2774, acc=0.8750 | test loss=1.2977, acc=1.0000
[070] train loss=1.1916, acc=0.7083 | test loss=1.2109, acc=0.8750
[080] train loss=1.0612, acc=0.7500 | test loss=1.0907, acc=0.8750
[090] train loss=0.9084, acc=0.5417 | test loss=0.9678, acc=0.6250
[100] train loss=0.8081, acc=0.5000 | test loss=0.8680, acc=0.6250
[110] train loss=0.7187, acc=0.7500 | test loss=0.8038, acc=0.7500
[120] train loss=0.6882, acc=0.7500 | test loss=0.7476, acc=1.0000
[130] train loss=0.6240, acc=0.8750 | test loss=0.7020, acc=1.0000
[140] train loss=0.5487, acc=1.0000 | test loss=0.6616, acc=1.

In [None]:
# ============================================================
# 8) 예측 확인
# ============================================================
model.eval()
with torch.no_grad():
    for fn in test_files[:5]:
        vec = file_to_freq_vec('./Data/test2', fn)
        x = torch.tensor([vec], dtype=torch.float32).to(device)
        pred_idx = model(x).argmax(dim=1).item()
        print(f"{fn} -> pred: {idx_to_label[pred_idx]}")

en-1.txt -> pred: en


FileNotFoundError: [Errno 2] No such file or directory: './Data/test2\\en-2.txt'