In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, MaxPooling1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
import re

from keras.utils import to_categorical

In [6]:
file = 'C:\\Users\\M\\Downloads\\Bigdata\\lastofus.txt'

In [7]:
with open(file, 'r', encoding='euckr') as f:
  lines = f.readlines()

In [8]:
len(lines)

9006

In [9]:
for line in lines:
  print(line)

    _____ _   _ _____   _          ____ _____   _____ _____   _   _ ____

   |_   _| | | |  ___| | |    /?\ |  __|_   _| |     |  ___| | | | |  __|

     | | | |_| | |_    | |   /   \| |__  | |   | (?) | |_    | | | | |__

     | | |  _  |  _|   | |  ( (?) )__  | | |   | | | |  _|   | | | |__  |

     | | | | | | |___  | |__|  ?  |__| | | |   | (_) | |     | |_| |__| |

     |_| |_| |_|_____| |____|_|?|_|____| |_|   |_____|_|     |_____|____|

 Game Script by Shotgunnova (P. Summers) / Email: shotgunnova(a+)gmail(d0t)com





   01) Prologue - Hometown ............................................. GS01

   02) Summer - The Quarantine Zone .................................... GS02

   03) Summer - The Outskirts .......................................... GS03

   04) Summer - Bill's Town ............................................ GS04

   05) Summer - Pittsburgh ............................................. GS05

   06) Summer - The Suburbs ............................................ 

In [10]:
conv = [line for line in lines if re.match(r'^[A-Z].*:', line)] # 첫 글짜가 대문자이고 ":"으로 끝나는 패턴 => 대화체만 추출

In [11]:
conv

["Joel: Tommy, I-...Tommy. Tommy, listen to me. He's the contractor, okay? I\n",
 'Sarah: Hey.\n',
 'Joel: Scoot.\n',
 'Sarah: Fun day at work, huh?\n',
 "Joel: What are you still doing up? It's late.\n",
 'Sarah: Oh crud. What time is it?\n',
 "Joel: It's way past your bedtime.\n",
 "Sarah: But it's still today.\n",
 'Joel: Honey, please not right now. I do not have the energy for this.\n',
 'Sarah: Here.\n',
 "Joel: What's this?\n",
 'Sarah: Your birthday.\n',
 'Sarah: You kept complaining about your broken watch... So I figured, you know.\n',
 'Joel: Honey, this is...\n',
 'Sarah: What?\n',
 "Joel: It's nice, but I-...I think it's stuck. It's not...\n",
 'Sarah: What? No, no, no, no.\n',
 'Sarah: Oh, ha, ha.\n',
 'Joel: Where did you get the money for this?\n',
 'Sarah: Drugs. I sell hardcore drugs.\n',
 'Joel: Oh, good. You can start helping out with the mortgage then.\n',
 'Sarah: Stsh - yeah, you wish.\n',
 'Joel: Goodnight, baby girl.\n',
 'Sarah: Hello?\n',
 'Tommy: Sarah, hone

In [12]:
text = ' '.join(conv)

In [13]:
text



In [14]:
arr = []
for line in text.split('\n'):
  arr.append(line.strip())

In [15]:
arr

["Joel: Tommy, I-...Tommy. Tommy, listen to me. He's the contractor, okay? I",
 'Sarah: Hey.',
 'Joel: Scoot.',
 'Sarah: Fun day at work, huh?',
 "Joel: What are you still doing up? It's late.",
 'Sarah: Oh crud. What time is it?',
 "Joel: It's way past your bedtime.",
 "Sarah: But it's still today.",
 'Joel: Honey, please not right now. I do not have the energy for this.',
 'Sarah: Here.',
 "Joel: What's this?",
 'Sarah: Your birthday.',
 'Sarah: You kept complaining about your broken watch... So I figured, you know.',
 'Joel: Honey, this is...',
 'Sarah: What?',
 "Joel: It's nice, but I-...I think it's stuck. It's not...",
 'Sarah: What? No, no, no, no.',
 'Sarah: Oh, ha, ha.',
 'Joel: Where did you get the money for this?',
 'Sarah: Drugs. I sell hardcore drugs.',
 'Joel: Oh, good. You can start helping out with the mortgage then.',
 'Sarah: Stsh - yeah, you wish.',
 'Joel: Goodnight, baby girl.',
 'Sarah: Hello?',
 'Tommy: Sarah, honey, I need you to get your daddy on the phone.',


In [16]:
# text

# 1. 불용어 제거
# 2. tokenizing : 문장을 토큰(최소의미 단위, 단어)
# 3. word index : 토큰을 index값으로 변환
# 4. 문장 => token index로 표현
# 5. 문장의 길이를 일치...
# 6. vectorizing :
      # 문장을 vector로 바꾸는 방법 : CountVectorizer, TfidfVectorizer
      # 단어(token)를 vector로 바꾸는 방법 : Embedding

In [17]:
# tokenizing => 문장을 token단위로 재구성...

In [18]:
token = Tokenizer(lower=False, filters='.,?;:\'\"-')
token.fit_on_texts(arr)

In [19]:
token.word_index

{'Joel': 1,
 'Ellie': 2,
 'I': 3,
 's': 4,
 'you': 5,
 'the': 6,
 'it': 7,
 'to': 8,
 't': 9,
 'a': 10,
 'Tess': 11,
 'You': 12,
 'that': 13,
 'we': 14,
 're': 15,
 'this': 16,
 'of': 17,
 'Hunter': 18,
 'on': 19,
 'We': 20,
 'Oh': 21,
 'Bill': 22,
 'What': 23,
 'get': 24,
 'me': 25,
 'up': 26,
 'here': 27,
 'Tommy': 28,
 'Henry': 29,
 'It': 30,
 'Yeah': 31,
 'Alright': 32,
 'is': 33,
 'm': 34,
 'there': 35,
 'go': 36,
 'your': 37,
 'll': 38,
 'know': 39,
 'in': 40,
 'out': 41,
 'just': 42,
 'are': 43,
 'Let': 44,
 'do': 45,
 'what': 46,
 'That': 47,
 '2': 48,
 'be': 49,
 'Sam': 50,
 'don': 51,
 'can': 52,
 'they': 53,
 'and': 54,
 'for': 55,
 'got': 56,
 'David': 57,
 'Okay': 58,
 'with': 59,
 'all': 60,
 'No': 61,
 'gonna': 62,
 'not': 63,
 'was': 64,
 'Well': 65,
 'way': 66,
 'mon': 67,
 'back': 68,
 'Hey': 69,
 'There': 70,
 'see': 71,
 'They': 72,
 'shit': 73,
 'How': 74,
 'have': 75,
 'us': 76,
 'like': 77,
 'Man': 78,
 'about': 79,
 'good': 80,
 'my': 81,
 'C': 82,
 'Sarah': 83,

In [20]:
# 문장 => token index값으로 표현

In [21]:
seq = token.texts_to_sequences(arr)
seq

[[1, 28, 3, 28, 28, 634, 8, 25, 93, 4, 6, 1321, 95, 3],
 [83, 69],
 [1, 1322],
 [83, 1323, 471, 101, 340, 200],
 [1, 23, 43, 5, 169, 224, 26, 30, 4, 1324],
 [83, 21, 1325, 23, 111, 33, 7],
 [1, 30, 4, 66, 472, 37, 1326],
 [83, 201, 7, 4, 169, 341],
 [1, 757, 635, 63, 88, 106, 3, 45, 63, 75, 6, 1327, 55, 16],
 [83, 118],
 [1, 23, 4, 16],
 [83, 473, 960],
 [83, 12, 758, 961, 79, 37, 636, 271, 127, 3, 759, 5, 39],
 [1, 757, 16, 33],
 [83, 23],
 [1, 30, 4, 533, 151, 3, 3, 87, 7, 4, 962, 30, 4, 63],
 [83, 23, 61, 90, 90, 90],
 [83, 21, 963, 963],
 [1, 156, 109, 5, 24, 6, 964, 55, 16],
 [83, 1328, 3, 760, 1329, 965],
 [1, 21, 80, 12, 52, 637, 966, 41, 59, 6, 1330, 170],
 [83, 1331, 272, 5, 474],
 [1, 1332, 215, 184],
 [83, 534],
 [28, 83, 967, 3, 102, 5, 8, 24, 37, 1333, 19, 6, 968],
 [83, 638, 28, 46, 111, 33, 7],
 [28, 3, 102, 8, 244, 8, 37, 1334, 106, 70, 4, 92],
 [83, 638, 28, 534],
 [83, 23, 64, 13, 60, 79, 430, 535, 12, 40, 27],
 [639, 30, 1335, 13, 46, 14, 1336, 1337, 171, 1338, 431, 

In [22]:
# 문장의 길이를 일정하게 조정

In [23]:
seq = pad_sequences(seq, maxlen=15)  # 문장의 앞을 자르거나 0으로 채워서 모든 문장의 길이를 15로 맞춤
seq

array([[   0,    1,   28, ..., 1321,   95,    3],
       [   0,    0,    0, ...,    0,   83,   69],
       [   0,    0,    0, ...,    0,    1, 1322],
       ...,
       [   0,    0,    0, ...,    1,    3,  591],
       [   0,    0,    0, ...,    0,    2,   58],
       [   0,    0,    0, ...,    0,    0,    0]])

In [24]:
seq.shape

(3416, 15)

In [25]:
# x, y 분리..

In [26]:
x = seq
y = np.vstack((seq[1:], seq[0]))
x = seq[:-1]
y = y[:-1]

In [29]:
x.shape

(3415, 15)

In [30]:
y.shape

(3415, 15)

In [46]:
len(token.word_index)

{'Joel': 1,
 'Ellie': 2,
 'I': 3,
 's': 4,
 'you': 5,
 'the': 6,
 'it': 7,
 'to': 8,
 't': 9,
 'a': 10,
 'Tess': 11,
 'You': 12,
 'that': 13,
 'we': 14,
 're': 15,
 'this': 16,
 'of': 17,
 'Hunter': 18,
 'on': 19,
 'We': 20,
 'Oh': 21,
 'Bill': 22,
 'What': 23,
 'get': 24,
 'me': 25,
 'up': 26,
 'here': 27,
 'Tommy': 28,
 'Henry': 29,
 'It': 30,
 'Yeah': 31,
 'Alright': 32,
 'is': 33,
 'm': 34,
 'there': 35,
 'go': 36,
 'your': 37,
 'll': 38,
 'know': 39,
 'in': 40,
 'out': 41,
 'just': 42,
 'are': 43,
 'Let': 44,
 'do': 45,
 'what': 46,
 'That': 47,
 '2': 48,
 'be': 49,
 'Sam': 50,
 'don': 51,
 'can': 52,
 'they': 53,
 'and': 54,
 'for': 55,
 'got': 56,
 'David': 57,
 'Okay': 58,
 'with': 59,
 'all': 60,
 'No': 61,
 'gonna': 62,
 'not': 63,
 'was': 64,
 'Well': 65,
 'way': 66,
 'mon': 67,
 'back': 68,
 'Hey': 69,
 'There': 70,
 'see': 71,
 'They': 72,
 'shit': 73,
 'How': 74,
 'have': 75,
 'us': 76,
 'like': 77,
 'Man': 78,
 'about': 79,
 'good': 80,
 'my': 81,
 'C': 82,
 'Sarah': 83,

In [32]:
y[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   83,   69],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    1, 1322]])

In [33]:
# token index => category => one hot

In [34]:
y_ohe = to_categorical(y)

In [35]:
y_ohe.shape  # token => 2627차원의 one hot vector

(3415, 15, 2627)

In [36]:
# LSTM data shape : (batch_size, sequence length, input dim)

In [37]:
#
x.shape

(3415, 15)

In [38]:
x_ohe = to_categorical(x)

In [39]:
x_ohe.shape

(3415, 15, 2627)

In [40]:
model = Sequential()
model.add(LSTM(1284, input_shape=(15, 2627), return_sequences=True))
model.add(Dense(2627, activation='relu'))
model.add(Dense(2627, activation='softmax'))

  super().__init__(**kwargs)


In [41]:
model.summary()

In [54]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
model.fit(x_ohe,y_ohe, epochs=100, batch_size=30)

Epoch 1/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 966ms/step - accuracy: 0.4856 - loss: 3.5555
Epoch 2/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 1s/step - accuracy: 0.4866 - loss: 3.4140
Epoch 3/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 1s/step - accuracy: 0.4930 - loss: 3.3295
Epoch 4/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 1s/step - accuracy: 0.4962 - loss: 3.2499
Epoch 5/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 1s/step - accuracy: 0.5018 - loss: 3.1473
Epoch 6/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 1s/step - accuracy: 0.5109 - loss: 3.0104
Epoch 7/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 1s/step - accuracy: 0.5357 - loss: 2.8102
Epoch 8/100
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 1s/step - accuracy: 0.5570 - loss: 2.6202
Epoch 9/100
[1m 81/1

KeyboardInterrupt: 

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

(3415, 15, 2627)

In [None]:
x[10]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 23,  4, 16],
      dtype=int32)

In [47]:
idx_to_token = {}

for t, idx in token.word_index.items():
  idx_to_token[idx] = t

In [52]:
idx_to_token[0] = ''

In [53]:
' '.join([idx_to_token[i] for i in x[10]]).strip()

'Joel What s this'