<a href="https://colab.research.google.com/github/chaehoon1/Linear_Algebra_and_AI/blob/main/linear_algebra_and_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import re
from collections import Counter
import math
import numpy as np

In [5]:
def build_vocab(text: str) -> list[str]:
    """Return the sorted list of distinct tokens found in *text*.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is replaced by a space, and the result is split on
    whitespace. Duplicates are removed and the remaining tokens are
    returned in ascending lexicographic order.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return sorted(set(letters_only.split()))


V 생성 함수

In [6]:
def build_corpus(text: str, vocab: list[str]) -> list[list[int]]:
    """Encode *text* as a list of sentences, each a list of vocabulary indices.

    The lowercased text is split into sentences on ``'.'``. Within each
    sentence, characters other than ``a``-``z`` and whitespace are replaced
    by spaces before splitting into tokens. Tokens missing from *vocab*
    are dropped, and sentences that end up with no indices are omitted.

    Args:
        text: Raw input text.
        vocab: Vocabulary words; a word's position in this list is its index.

    Returns:
        One list of vocabulary indices per non-empty sentence, in order.
    """
    index_of = {token: position for position, token in enumerate(vocab)}

    encoded_sentences = []
    for raw_sentence in text.lower().split('.'):
        words = re.sub(r'[^a-z\s]', ' ', raw_sentence).split()
        # A sentence with no tokens also yields no ids, so one check suffices.
        ids = [index_of[word] for word in words if word in index_of]
        if ids:
            encoded_sentences.append(ids)

    return encoded_sentences


corpus 생성 함수

In [9]:
# Read the whole sample document into memory as UTF-8 text.
# NOTE(review): the path looks like a Colab runtime location — confirm the
# file has been uploaded there before running this cell.
with open("/content/sample_data/multiverse.txt", "r", encoding="utf-8") as f:
    text = f.read()

# vocab: sorted list of distinct tokens; corpus: one list of vocabulary
# indices per sentence of the text.
vocab = build_vocab(text)
corpus = build_corpus(text, vocab)

텍스트 데이터로부터 V와 corpus 생성

In [15]:
def build_D(corpus, window_size):
    """Collect (word, context) pairs from a symmetric sliding window.

    For every position in every sentence of *corpus*, the element at that
    position is paired with each other element located at most
    *window_size* positions to its left or right within the same sentence.

    Args:
        corpus: Iterable of sentences, each an indexable sequence of items.
        window_size: Maximum distance between a word and its context.

    Returns:
        List of ``(word, context)`` tuples, ordered by sentence, then by
        word position, then by context position.
    """
    pairs = []
    for token_ids in corpus:
        n_tokens = len(token_ids)
        for center_pos, center in enumerate(token_ids):
            # Clamp the window to the sentence boundaries.
            lo = max(0, center_pos - window_size)
            hi = min(n_tokens, center_pos + window_size + 1)
            pairs.extend(
                (center, token_ids[ctx_pos])
                for ctx_pos in range(lo, hi)
                if ctx_pos != center_pos  # a word is not its own context
            )
    return pairs


D 생성 함수

In [14]:
def p_wc(D, w, c):
    """Return the empirical joint probability of the pair ``(w, c)`` in *D*.

    This is the number of occurrences of ``(w, c)`` in *D* divided by the
    total number of pairs, or ``0.0`` when *D* is empty.
    """
    n_pairs = len(D)
    if not n_pairs:
        return 0.0
    joint_counts = Counter(D)
    return joint_counts[(w, c)] / n_pairs

def p_w(D, w):
    """Return the empirical probability of *w* as the first element of a pair.

    This is the number of pairs in *D* whose first element equals *w*
    divided by the total number of pairs, or ``0.0`` when *D* is empty.
    """
    n_pairs = len(D)
    if not n_pairs:
        return 0.0
    word_counts = Counter(word for word, _ in D)
    return word_counts[w] / n_pairs

def p_c(D, c):
    """Return the empirical probability of *c* as the second element of a pair.

    This is the number of pairs in *D* whose second element equals *c*
    divided by the total number of pairs, or ``0.0`` when *D* is empty.
    """
    n_pairs = len(D)
    if not n_pairs:
        return 0.0
    context_counts = Counter(context for _, context in D)
    return context_counts[c] / n_pairs

확률 함수

In [17]:
# Collect (word, context) index pairs from every sentence, pairing each
# word with neighbours up to 5 positions away on either side.
D = build_D(corpus, 5)

D 생성

In [18]:
# How often each item appears in the context (second) position of a pair in D.
c_counts = Counter([c for _, c in D])

# Dampen the raw counts by raising them to the 3/4 power.
# NOTE(review): presumably the word2vec-style smoothing used for the
# negative-sampling noise distribution — confirm against the course notes.
c_pow = {c: count ** (3/4) for c, count in c_counts.items()}

# Normalising constant: sum of all dampened counts.
total = sum(c_pow.values())

# p_D maps each context item to its share of the dampened total, so the
# values sum to 1 (up to floating-point rounding).
p_D = {c: value / total for c, value in c_pow.items()}

p_D(c) 확률 분포 정의

In [20]:
def build_SPPMI_matrix(v, D, k=10):
    """Build the shifted positive PMI (SPPMI) matrix for the pairs in *D*.

    For every ``(w, c)`` pair observed in *D*::

        PMI(w, c)   = log(p(w, c) / (p(w) * p(c)))
        SPPMI(w, c) = max(PMI(w, c) - log(k), 0)

    where the probabilities are relative frequencies over *D*. Cells for
    pairs that never occur in *D* stay at 0.

    The pair, word and context counts are computed once up front and only
    the observed pairs are visited, so the cost is linear in ``len(D)``
    rather than ``len(v)**2`` full passes over *D*.

    Args:
        v: Vocabulary (list of words); only its length is used.
        D: List of ``(w_idx, c_idx)`` co-occurrence pairs of integer
            vocabulary indices.
        k: Negative-sampling count used as the shift ``log(k)``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(v), len(v))`` and dtype float32.

    Raises:
        ValueError: If *D* is non-empty and ``k`` is not positive
            (raised by ``math.log``).
    """
    V = len(v)
    SPPMI = np.zeros((V, V), dtype=np.float32)

    D_size = len(D)
    if D_size == 0:
        # No observations: every joint probability is 0, nothing to fill.
        return SPPMI

    # Count everything in a single pass each instead of once per matrix cell.
    pair_count = Counter(D)
    w_count = Counter(w for w, _ in D)
    c_count = Counter(c for _, c in D)
    shift = math.log(k)

    for (w, c), n_wc in pair_count.items():
        # Pairs whose indices fall outside the vocabulary have no cell in
        # the matrix; they still contribute to D_size and the marginals.
        if not (0 <= w < V and 0 <= c < V):
            continue

        pwc = n_wc / D_size
        pw = w_count[w] / D_size
        pc = c_count[c] / D_size

        # The tiny epsilon is kept so results match the previous
        # implementation bit-for-bit.
        pmi = math.log((pwc / (pw * pc)) + 1e-12)
        spmi = pmi - shift

        # Keep only positive shifted values.
        if spmi > 0:
            SPPMI[w, c] = spmi

    return SPPMI