In [1]:
from keras import datasets
from keras.preprocessing.sequence import pad_sequences

In [2]:
vocab_size=10000
(X_train,y_train),(X_test,y_test) = datasets.imdb.load_data(num_words=vocab_size)

In [3]:
print(X_train[:5])

[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])
 list([1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 

In [4]:
max_len = 200
X_train = pad_sequences(X_train,maxlen=max_len)
X_test = pad_sequences(X_test,maxlen=max_len)

In [7]:
X_train.shape,X_test.shape

((25000, 200), (25000, 200))

In [9]:
print(y_train[:5])

[1 0 0 1 0]


In [12]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

embedding_dim = 256
dropout_ratio = 0.3
num_filters = 256
kernel_size = 3
hidden_units =128

In [13]:
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid',activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(dropout_ratio))
model.add(Dense(1,activation='sigmoid'))


## 🔢 **1층: Embedding Layer**

### **역할**: 정수 → 실수 벡터 변환

### **처리 과정**:
```python
# 입력: [14, 22, 16, 43] (단어 인덱스)
# 내부 처리:
lookup_table = {
    14: [0.1, 0.3, 0.8, 0.2],  # 단어14의 임베딩 벡터
    22: [0.5, 0.2, 0.1, 0.9],  # 단어22의 임베딩 벡터  
    16: [0.7, 0.4, 0.6, 0.3],  # 단어16의 임베딩 벡터
    43: [0.2, 0.8, 0.1, 0.5]   # 단어43의 임베딩 벡터
}

# 출력: [[0.1, 0.3, 0.8, 0.2],
#        [0.5, 0.2, 0.1, 0.9], 
#        [0.7, 0.4, 0.6, 0.3],
#        [0.2, 0.8, 0.1, 0.5]]
# 형태: (4, 4) → (sequence_length, embedding_dim)
````

### **학습되는 것**: 각 단어의 의미 벡터 (유사한 단어끼리 비슷한 벡터)



## 🎯 **2층: Dropout**

### **역할**: 과적합 방지

### **처리 과정**:

```python
# 입력: [[0.1, 0.3, 0.8, 0.2],
#        [0.5, 0.2, 0.1, 0.9],
#        [0.7, 0.4, 0.6, 0.3],
#        [0.2, 0.8, 0.1, 0.5]]

# dropout_ratio=0.2 (20% 끄기)
# 랜덤하게 20% 뉴런을 0으로 만들고 나머지는 1.25배 증폭
# 출력: [[0.0, 0.375, 1.0, 0.25],
#        [0.625, 0.0, 0.125, 1.125],
#        [0.875, 0.5, 0.0, 0.375],
#        [0.25, 1.0, 0.125, 0.0]]
```

### **주의**: 훈련할 때만 작동, 예측할 때는 모든 뉴런 사용



## 🔍 **3층: Conv1D**

### **역할**: 연속된 단어들에서 패턴 추출

### **처리 과정** (kernel\_size=3, num\_filters=2 예시):

```python
# 입력: (4, 4) - 4개 단어, 각각 4차원 벡터
# 필터1과 필터2가 sliding window처럼 움직임

# 위치1: [단어1, 단어2, 단어3]에 필터 적용
input_window = [[0.1, 0.3, 0.8, 0.2],
                [0.5, 0.2, 0.1, 0.9], 
                [0.7, 0.4, 0.6, 0.3]]

filter1 = [[0.1, 0.2, 0.3, 0.4],
           [0.5, 0.6, 0.7, 0.8],
           [0.9, 1.0, 1.1, 1.2]]

result1_pos1 = sum(input_window * filter1) + bias1
             = 3.94 → ReLU → 3.94

# 위치2: [단어2, 단어3, 단어4]에 필터 적용
input_window = [[0.5, 0.2, 0.1, 0.9],
                [0.7, 0.4, 0.6, 0.3],
                [0.2, 0.8, 0.1, 0.5]]

result1_pos2 = 2.87

# 필터2도 동일하게 처리
result2_pos1 = 1.23
result2_pos2 = 4.56

# 출력: [[3.94, 1.23],
#        [2.87, 4.56]]
# 형태: (2, 2) → (new_sequence_length, num_filters)
```

### **학습되는 것**: 각 필터가 특정 패턴(예: "very good", "really bad")을 감지하도록 가중치 학습



## 🎯 **4층: GlobalMaxPooling1D**

### **역할**: 각 필터별 최댓값만 선택

### **처리 과정**:

```python
# 입력: [[3.94, 1.23],
#        [2.87, 4.56]]

max_filter1 = max(3.94, 2.87) = 3.94
max_filter2 = max(1.23, 4.56) = 4.56

# 출력: [3.94, 4.56]
# 형태: (2,) → (num_filters,)
```

### **의미**:

* 필터1: 문서에서 패턴1이 가장 강하게 나타난 정도 = 3.94
* 필터2: 문서에서 패턴2가 가장 강하게 나타난 정도 = 4.56



## 🧠 **5층: Dense (첫 번째)**

### **역할**: 특징들을 조합해서 고차원 패턴 학습

### **처리 과정**:

```python
# 입력: [3.94, 4.56]
# 가중치 W (2x3), 편향 b (3,)

W = [[0.5, 0.3, 0.8],
     [0.2, 0.7, 0.4]]
b = [0.1, 0.2, 0.3]

output = input × W + b
       = [2.88, 4.37, 4.97] + [0.1, 0.2, 0.3]
       = [2.98, 4.57, 5.27]

final_output = [2.98, 4.57, 5.27]
```

### **학습되는 것**: 여러 필터 결과를 조합하는 방법


## 🎯 **6층: Dropout (두 번째)**

### **역할**: 다시 과적합 방지

### **처리 과정**:

```python
# 입력: [2.98, 4.57, 5.27]
# 20% 드롭아웃 예시
# 출력: [0.0, 5.71, 6.59]  # 첫 번째 뉴런이 꺼지고 나머지는 1.25배
```


## 📊 **7층: Dense (출력층)**

### **역할**: 최종 분류 점수 계산

### **처리 과정**:

```python
# 입력: [0.0, 5.71, 6.59]
# 가중치 W (3x1), 편향 b (1,)

W = [[0.4],
     [0.6], 
     [0.3]]
b = [-1.2]

output = [0.0, 5.71, 6.59] × W + b
       = 0 + 3.426 + 1.977 - 1.2
       = 4.203

# Sigmoid 활성화
probability = 1 / (1 + e^(-4.203))
            ≈ 0.985
```

### **의미**: 문장이 긍정일 확률이 98.5%라는 의미의 최종 예측 결과

---



In [15]:
es = EarlyStopping(monitor='val_loss', mode='min', patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test,y_test), callbacks=[es,mc])

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - acc: 0.7118 - loss: 0.5117
Epoch 1: val_acc improved from -inf to 0.87400, saving model to best_model.h5




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 45ms/step - acc: 0.7119 - loss: 0.5116 - val_acc: 0.8740 - val_loss: 0.2917
Epoch 2/20
[1m781/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - acc: 0.9207 - loss: 0.2031
Epoch 2: val_acc improved from 0.87400 to 0.88028, saving model to best_model.h5




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 45ms/step - acc: 0.9207 - loss: 0.2031 - val_acc: 0.8803 - val_loss: 0.2939
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - acc: 0.9706 - loss: 0.0865
Epoch 3: val_acc improved from 0.88028 to 0.88288, saving model to best_model.h5




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 44ms/step - acc: 0.9706 - loss: 0.0865 - val_acc: 0.8829 - val_loss: 0.3057
Epoch 4/20
[1m781/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 37ms/step - acc: 0.9893 - loss: 0.0341
Epoch 4: val_acc did not improve from 0.88288
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 45ms/step - acc: 0.9893 - loss: 0.0341 - val_acc: 0.8821 - val_loss: 0.3799


In [16]:
loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - acc: 0.8838 - loss: 0.3054

 테스트 정확도: 0.8829
