# 1. Implement Word2Vec

## 1) Load Packages

In [1]:
from abc import ABC
from typing import List, Dict, Tuple, Set
import random
import torch.nn.functional as F
from typing import List, Dict, Tuple
from random import randint
import re
from collections import Counter
import torch
from torch.utils.data import IterableDataset

In [3]:
# Load train_data.csv into a pandas DataFrame named `train`.
# NOTE(review): pandas is imported here rather than in the imports cell at the
# top of the notebook. The path is relative, so it resolves against the
# notebook's working directory, and the 'klue clf' directory name contains a
# space -- confirm it exists in your checkout before running.

import pandas as pd
train = pd.read_csv('../../../../data/klue clf/train_data.csv')

In [4]:
# Preview the first rows of the frame to check which columns were loaded.
train.head()

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4


## 2) Build the Tokenizer

In [12]:
def tokenize(sentence):
    """
    Split a raw sentence into a list of tokens.

    The punctuation marks '.', ',', '!' and '?' are padded with spaces so that
    each one becomes its own token; the sentence is then split on whitespace.

    input
    sentence : str, raw sentence

    return
    tokens : list of str; never contains empty strings
    """
    # Pad the four punctuation marks in one pass so they split off as tokens.
    sentence = re.sub(r'([.,!?])', r' \1 ', sentence)

    # split() with no separator collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(' ') would emit '' tokens for the
    # doubled spaces the padding introduces (e.g. "a, b" -> "a ,  b") and for
    # an empty input string.
    tokens = sentence.split()
    return tokens

In [14]:
# Smoke-test the tokenizer on (up to) three titles taken from a shuffled copy
# of the training frame, printing each raw title followed by its tokens.
sample_sentences = train.sample(frac = 1).iloc[:3].title.values

for sentence in sample_sentences:
    # Labels and raw text are printed before tokenize() runs, one per line.
    print('Raw', sentence, 'Tokenize', sep='\n')
    # The extra newline is the blank separator line between samples.
    print(tokenize(sentence), end='\n\n')

Raw
의총에서 발언하는 이정미
Tokenize
['의총에서', '발언하는', '이정미']

Raw
아스널 지루 허벅지 부상… 프랑스축구협회 독일과 A매치 결장
Tokenize
['아스널', '지루', '허벅지', '부상…', '프랑스축구협회', '독일과', 'A매치', '결장']

Raw
치유·감동의 하모니 제주국제합창축제 막 올라
Tokenize
['치유·감동의', '하모니', '제주국제합창축제', '막', '올라']



## 3) Build Vocab

In [27]:
def build_vocab(sentences : List[List[str]], min_freq : int) -> Tuple[List[str], Dict[str,int], List[int]] :
    """
    Build the vocabulary lookup tables and the per-index token counts.

    input
    sentences : tokenized corpus, one list of tokens per sentence
    min_freq  : words occurring fewer than min_freq times are not added to
                the vocabulary; their occurrences are counted under <UNK>

    return
    idx2word  : list mapping index -> word, with <PAD> at 0 and <UNK> at 1
    word2idx  : dict mapping word -> index (inverse of idx2word)
    word_freq : list where word_freq[i] is the number of corpus tokens that
                map to index i
    """
    PAD = '<PAD>'
    PAD_IDX = 0
    UNK = '<UNK>'
    UNK_IDX = 1

    idx2word = [PAD, UNK]
    word2idx = {PAD : PAD_IDX, UNK : UNK_IDX}
    # Both special tokens start with a count of zero. (Seeding the counts
    # with the index constants would start <UNK> at 1.)
    word_freq = [0, 0]

    # Counter keeps first-occurrence order, so vocabulary indices follow the
    # order in which words first appear in the corpus.
    counts = Counter(word for sentence in sentences for word in sentence)

    for word, freq in counts.items() :
        if word in word2idx :
            # The corpus contains a literal '<PAD>' or '<UNK>' token: keep the
            # reserved index and just add its occurrences there.
            word_freq[word2idx[word]] += freq
        elif freq < min_freq :
            # Rare word: not in the vocabulary, counted as <UNK>.
            word_freq[UNK_IDX] += freq
        else :
            word2idx[word] = len(idx2word)
            idx2word.append(word)
            word_freq.append(freq)

    assert idx2word[PAD_IDX] == PAD and word2idx[PAD] == PAD_IDX, \
        "PAD token should be placed properly"
    assert idx2word[UNK_IDX] == UNK and word2idx[UNK] == UNK_IDX, \
        "UNK token should be placed properly"
    assert len(idx2word) == len(word2idx) and len(idx2word) == len(word_freq), \
        "Size of idx2word, word2idx and word_freq should be same"

    return idx2word, word2idx, word_freq

In [30]:
# Tokenize every training title, then build the vocabulary with min_freq=3.
tokenized_sentences = [tokenize(title) for title in train['title']]
idx2word, word2idx, word_freq = build_vocab(sentences=tokenized_sentences, min_freq=3)

## 4) Build Skip-gram

In [77]:
def skipgram(sentence, window_size, center_word_loc):
    """
    Return the center word of a skip-gram window and its context words.

    input
    sentence        : list of str tokens
    window_size     : maximum number of context words taken on each side of
                      the center word; values <= 0 yield no context words
    center_word_loc : index of the center word in sentence; a negative value
                      counts from the end, as in normal list indexing

    return
    center_word   : sentence[center_word_loc]
    outside_words : list of the words within window_size positions of the
                    center word, in sentence order, excluding the center word

    raises
    IndexError : center_word_loc does not refer to a position in sentence
    """
    sentence_len = len(sentence)
    if not -sentence_len <= center_word_loc < sentence_len:
        raise IndexError('center_word_loc is out of range')

    # Resolve a negative position to its non-negative equivalent so the
    # window is computed around the same word that is returned as the center.
    center = center_word_loc % sentence_len

    # Clip the window to the sentence boundaries.
    first = max(0, center - window_size)
    last = min(sentence_len - 1, center + window_size)
    outside_words = [sentence[loc] for loc in range(first, last + 1) if loc != center]

    center_word = sentence[center]
    # Checked once, after the window is built, rather than on every append.
    assert isinstance(center_word, str) and isinstance(outside_words, list)
    return center_word, outside_words

In [82]:
# Spot-check skipgram on one tokenized title: center word at index 3 with a
# window of 3 positions on each side.
skipgram(tokenized_sentences[8], window_size=3, center_word_loc=3)

('진전', ['푸틴', '한반도', '상황', '위한', '방안', '김정은'])