In [48]:
import pandas as pd
import re

In [49]:
df = pd.read_csv("IMDB_Dataset.csv")

In [50]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [51]:
df['review'][49999]

"No one expects the Star Trek movies to be high art, but the fans do expect a movie that is as good as some of the best episodes. Unfortunately, this movie had a muddled, implausible plot that just left me cringing - this is by far the worst of the nine (so far) movies. Even the chance to watch the well known characters interact in another movie can't save this movie - including the goofy scenes with Kirk, Spock and McCoy at Yosemite.<br /><br />I would say this movie is not worth a rental, and hardly worth watching, however for the True Fan who needs to see all the movies, renting this movie is about the only way you'll see it - even the cable channels avoid this movie."

In [52]:
word_cnt_dict = {}

for sent in df['review']:
    rep_sent = sent.replace("<br />", " ")
    rep_sent = re.sub("\s{1,}", " ", rep_sent)
    for word in rep_sent.strip().split():
        if word not in word_cnt_dict:
            word_cnt_dict[word] = 1
        else:
            word_cnt_dict[word] += 1

In [53]:
len(word_cnt_dict)

412584

In [54]:
sorted_cnt_dict = sorted(word_cnt_dict.items(), key=lambda x: x[1], reverse=True)

In [55]:
sorted_cnt_dict[:10]

[('the', 569082),
 ('a', 307037),
 ('and', 302066),
 ('of', 283665),
 ('to', 261899),
 ('is', 203088),
 ('in', 170028),
 ('I', 139853),
 ('that', 126851),
 ('this', 113834)]

In [56]:
VOCAB_SIZE = 20000

## for <unk>, <pad>
word2idx = {"<pad>":0, "<unk>":1}
total_cnt = 0

for word,_ in sorted_cnt_dict:
    if total_cnt == VOCAB_SIZE-2:
        break
    word2idx[word] = len(word2idx)
    total_cnt += 1 

In [57]:
len(word2idx)

20000

In [58]:
word2idx

{'<pad>': 0,
 '<unk>': 1,
 'the': 2,
 'a': 3,
 'and': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'in': 8,
 'I': 9,
 'that': 10,
 'this': 11,
 'it': 12,
 'was': 13,
 'as': 14,
 'with': 15,
 'The': 16,
 'for': 17,
 'but': 18,
 'on': 19,
 'movie': 20,
 'are': 21,
 'film': 22,
 'his': 23,
 'have': 24,
 'not': 25,
 'you': 26,
 'be': 27,
 'at': 28,
 'by': 29,
 'he': 30,
 'one': 31,
 'an': 32,
 'from': 33,
 'who': 34,
 'like': 35,
 'all': 36,
 'they': 37,
 'has': 38,
 'so': 39,
 'just': 40,
 'or': 41,
 'about': 42,
 'her': 43,
 'This': 44,
 'out': 45,
 'some': 46,
 'very': 47,
 'more': 48,
 'would': 49,
 'It': 50,
 'what': 51,
 'when': 52,
 'good': 53,
 'if': 54,
 'their': 55,
 'only': 56,
 'really': 57,
 'had': 58,
 'up': 59,
 'even': 60,
 "it's": 61,
 'can': 62,
 'which': 63,
 'were': 64,
 'my': 65,
 'see': 66,
 'no': 67,
 'than': 68,
 'she': 69,
 '-': 70,
 'there': 71,
 'been': 72,
 'into': 73,
 'get': 74,
 'will': 75,
 'much': 76,
 'story': 77,
 'because': 78,
 'other': 79,
 'most': 80,
 'time': 8

# Corpus, Vocab 파일 생성

In [59]:
with open("imdb.corpus", "w") as f_corpus:
    for sent in df['review']:
        rep_sent = sent.replace("<br />", " ")
        rep_sent = re.sub("\s{1,}", " ", rep_sent)
        f_corpus.write(rep_sent.strip() + "\n")

In [54]:
with open("imdb.vocab", "w") as f_vocab:
    for vocab in word2idx:
        f_vocab.write(vocab + "\n")

In [30]:
vocab_path = "imdb.vocab"
word2idx = {}
idx2word = {}

f = open(vocab_path, 'r')
for vocab in f:
    word2idx[vocab.strip()] = len(word2idx)
    idx2word[len(idx2word)] = vocab.strip()

In [31]:
word2idx

{'<pad>': 0,
 '<unk>': 1,
 'the': 2,
 'a': 3,
 'and': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'in': 8,
 'I': 9,
 'that': 10,
 'this': 11,
 'it': 12,
 'was': 13,
 'as': 14,
 'with': 15,
 'The': 16,
 'for': 17,
 'but': 18,
 'on': 19,
 'movie': 20,
 'are': 21,
 'film': 22,
 'his': 23,
 'have': 24,
 'not': 25,
 'you': 26,
 'be': 27,
 'at': 28,
 'by': 29,
 'he': 30,
 'one': 31,
 'an': 32,
 'from': 33,
 'who': 34,
 'like': 35,
 'all': 36,
 'they': 37,
 'has': 38,
 'so': 39,
 'just': 40,
 'or': 41,
 'about': 42,
 'her': 43,
 'This': 44,
 'out': 45,
 'some': 46,
 'very': 47,
 'more': 48,
 'would': 49,
 'It': 50,
 'what': 51,
 'when': 52,
 'good': 53,
 'if': 54,
 'their': 55,
 'only': 56,
 'really': 57,
 'had': 58,
 'up': 59,
 'even': 60,
 "it's": 61,
 'can': 62,
 'which': 63,
 'were': 64,
 'my': 65,
 'see': 66,
 'no': 67,
 'than': 68,
 'she': 69,
 '-': 70,
 'there': 71,
 'been': 72,
 'into': 73,
 'get': 74,
 'will': 75,
 'much': 76,
 'story': 77,
 'because': 78,
 'other': 79,
 'most': 80,
 'time': 8

In [4]:
idx2word

{0: '<pad>',
 1: '<unk>',
 2: 'the',
 3: 'a',
 4: 'and',
 5: 'of',
 6: 'to',
 7: 'is',
 8: 'in',
 9: 'I',
 10: 'that',
 11: 'this',
 12: 'it',
 13: 'was',
 14: 'as',
 15: 'with',
 16: 'The',
 17: 'for',
 18: 'but',
 19: 'on',
 20: 'movie',
 21: 'are',
 22: 'film',
 23: 'his',
 24: 'have',
 25: 'not',
 26: 'you',
 27: 'be',
 28: 'at',
 29: 'by',
 30: 'he',
 31: 'one',
 32: 'an',
 33: 'from',
 34: 'who',
 35: 'like',
 36: 'all',
 37: 'they',
 38: 'has',
 39: 'so',
 40: 'just',
 41: 'or',
 42: 'about',
 43: 'her',
 44: 'This',
 45: 'out',
 46: 'some',
 47: 'very',
 48: 'more',
 49: 'would',
 50: 'It',
 51: 'what',
 52: 'when',
 53: 'good',
 54: 'if',
 55: 'their',
 56: 'only',
 57: 'really',
 58: 'had',
 59: 'up',
 60: 'even',
 61: "it's",
 62: 'can',
 63: 'which',
 64: 'were',
 65: 'my',
 66: 'see',
 67: 'no',
 68: 'than',
 69: 'she',
 70: '-',
 71: 'there',
 72: 'been',
 73: 'into',
 74: 'get',
 75: 'will',
 76: 'much',
 77: 'story',
 78: 'because',
 79: 'other',
 80: 'most',
 81: 'time

In [6]:
class 
corpus_path = "imdb.corpus"
window_size = 5
label_pairs = []

def get_corpus(corpus_path):
    f = open(corpus_path, 'r')
    for sent in f:
        words = sent.split()
        ids = list(map(lambda word: word2idx[word] , words))
        length = len(words)
        for i in range(0, length):
            # 왼쪽 window size 만큼
            for j in range(max(0, i - window_size), i):
                label_pairs.append((ids[i], ids[j]))
            # 오른쪽 window size 만큼
            for j in range(i + 1, min(length, i + window_size + 1)):
                label_pairs.append((ids[i], ids[j]))

def my_generator(batch_size):      
    for ndx in range(0, len(label_pairs), batch_size):
        yield label_pairs[ndx:min(ndx + batch_size, len(label_pairs))] 

In [8]:
gen = my_generator(corpus_path, batch_size=10)

In [9]:
next(gen)

[(274, 5),
 (274, 2),
 (274, 79),
 (274, 2301),
 (274, 38),
 (5, 274),
 (5, 2),
 (5, 79),
 (5, 2301),
 (5, 38)]

In [3]:
import numpy as np

In [4]:
class Generator(object):
    
    def __init__(self, corpus_path, vocab_path, batch_size, window_size, shuffle=True):
        self.corpus_path = corpus_path
        self.vocab_path = vocab_path
        self.batch_size = batch_size
        self.window_size = window_size
        self.shuffle = shuffle
        
        self.word2idx = {}
        self.idx2word = {}
        self._get_vocab()
        self.gen = self._generator()

    def _get_vocab(self):
        f = open(self.vocab_path, 'r')
        for vocab in f:
            self.word2idx[vocab.strip()] = len(self.word2idx)
            self.idx2word[len(self.idx2word)] = vocab.strip()
            
    def _convert_word2idx(self, word):
        if word not in self.word2idx:
            word = "<unk>"
        return self.word2idx[word]

    def _generator(self):
        f = open(self.corpus_path)
        for sent in f:
            words = sent.split()
            ids = list(map(self._convert_word2idx, words))
            length = len(words)
            for i in range(0, length):
                # 왼쪽 window size 만큼
                for j in range(max(0, i - self.window_size), i):
                    yield [ids[i], ids[j]]
                # 오른쪽 window size 만큼
                for j in range(i + 1, min(length, i + self.window_size + 1)):
                    yield [ids[i], ids[j]]

    def next(self):
        result_x = []
        result_y = []
        for i in range(self.batch_size):
            try:
                x,y = next(self.gen)
                result_x.append(x)
                result_y.append(y)
            except:
                self.gen = self._generator()
        if self.shuffle:
            shuffle_mask = np.random.permutation(len(result_x))
            result_x = np.array(result_x)[shuffle_mask]
            result_y = np.array(result_y)[shuffle_mask]
        return result_x[:self.batch_size], result_y[:self.batch_size]

In [5]:
gen = Generator("imdb.corpus", "imdb.vocab", 10, 5, False)

In [7]:
a = []
n = 0
while(1):
    if n == 3000000:
        break
    
    a.append(gen.next())
    n += 1

In [13]:
a[2890000]

([590, 590, 590, 590, 590, 68, 68, 68, 68, 68],
 [68, 3, 9039, 5, 6952, 1760, 2, 6557, 1958, 590])

In [122]:
a[2107940]

([], [])

In [8]:
f = open("imdb.vocab", 'r')
word2idx = {}
for vocab in f:
    word2idx[vocab.strip()] = len(word2idx)

In [9]:
def _convert_word2idx(word):
    if word not in word2idx:
        word = "<unk>"
    return word2idx[word]

In [10]:
f = open("imdb.corpus")
for sent in f:
    words = sent.split()
    ids = list(map(_convert_word2idx , words))

In [12]:
ids

[475,
 31,
 6588,
 2,
 1373,
 3326,
 128,
 6,
 27,
 352,
 4400,
 18,
 2,
 573,
 85,
 544,
 3,
 20,
 10,
 7,
 14,
 53,
 14,
 46,
 5,
 2,
 131,
 4153,
 1024,
 11,
 20,
 58,
 3,
 1,
 5290,
 136,
 10,
 40,
 306,
 83,
 14269,
 70,
 11,
 7,
 29,
 234,
 2,
 256,
 5,
 2,
 4818,
 7101,
 1,
 522,
 363,
 2,
 693,
 6,
 119,
 2,
 118,
 603,
 123,
 7851,
 8,
 186,
 20,
 180,
 596,
 11,
 20,
 70,
 659,
 2,
 3374,
 160,
 15,
 1,
 6730,
 4,
 10105,
 28,
 1,
 9,
 49,
 146,
 11,
 20,
 7,
 25,
 269,
 3,
 19647,
 4,
 1004,
 269,
 4392,
 999,
 17,
 2,
 5563,
 1,
 34,
 728,
 6,
 66,
 36,
 2,
 577,
 2840,
 11,
 20,
 7,
 42,
 2,
 56,
 114,
 500,
 66,
 12,
 70,
 60,
 2,
 2757,
 6381,
 1235,
 11,
 104]