In [12]:
!pip install tensorflow tensorboardX
!git clone https://github.com/GuyKabiri/language_models

fatal: destination path 'language_models' already exists and is not an empty directory.


In [13]:
import tensorflow as tf
import sys

print("Available GPUs:", tf.config.list_physical_devices('GPU'))

import tensorflow.keras as keras
sys.modules['keras'] = keras

if "language_models" not in sys.path:
    sys.path.append("language_models")

print("Keras version:", keras.__version__)

Available GPUs: []
Keras version: 3.8.0


In [14]:
import sys, types
import tensorflow as tf

# ודא שה-GPU זמין
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

# מיפוי מודולי keras ל-tf.keras
sys.modules['keras'] = types.ModuleType('keras')
sys.modules['keras.preprocessing.text'] = tf.keras.preprocessing.text
sys.modules['keras.preprocessing.sequence'] = tf.keras.preprocessing.sequence
sys.modules['keras.layers'] = tf.keras.layers
sys.modules['keras.utils'] = tf.keras.utils
sys.modules['keras.backend'] = tf.keras.backend

# הוספת ספריית language_models לנתיב
if "language_models" not in sys.path:
    sys.path.append("language_models")

Available GPUs: []


In [15]:
import tensorflow as tf
import tensorflow.keras as keras
import sys

# מיפוי מודול keras ל-tf.keras
sys.modules['keras'] = keras
sys.modules['keras.preprocessing'] = keras.preprocessing
sys.modules['keras.preprocessing.sequence'] = keras.preprocessing.sequence

print("Keras version:", keras.__version__)
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

Keras version: 3.8.0
Available GPUs: []


In [16]:
import scipy
import numpy as np
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, TimeDistributed
from tensorboardX import SummaryWriter
from tqdm import tqdm

from language_models import util

In [17]:
import os

class Args:
    epochs = 12                # 12 אפוקים בלבד
    embedding_size = 300
    lr = 0.001
    batch = 128
    task = 'wikisimple'
    data = './data'
    lstm_capacity = 256
    top_words = 10000
    limit = None
    tb_dir = './runs/words'
    seed = -1
    extra = None             # None -> 1 שכבת LSTM, 1 -> 2 שכבות

options = Args()
tbw = SummaryWriter(log_dir=options.tb_dir)

if options.seed < 0:
    seed_val = random.randint(0, 1000000)
    np.random.seed(seed_val)
    options.seed = seed_val
else:
    np.random.seed(options.seed)

data_source = os.path.join(util.DIR, 'datasets', 'wikisimple.txt')
X, w2i, i2w = util.load_words(data_source, vocab_size=options.top_words, limit=options.limit)

X_train, X_temp = train_test_split(X, test_size=0.2, random_state=options.seed)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=options.seed)

X_train = util.batch_pad(X_train, options.batch, add_eos=True)
X_val = util.batch_pad(X_val, options.batch, add_eos=True)
X_test = util.batch_pad(X_test, options.batch, add_eos=True)
numwords = len(i2w)
print(numwords, "distinct words")
print("Finished data loading:", sum(b.shape[0] for b in X_train), "sentences loaded.")

raw data read
max length per batch:  [15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 38, 38, 39, 39, 40, 41, 42, 42, 44, 45, 46, 47, 49, 52, 55, 62, 133]
max length per batch:  [17, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 24, 24, 25, 26, 27, 28, 30, 31, 33, 36, 41, 55, 133]
max length per batch:  [17, 18, 19, 19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 27, 29, 30, 

In [18]:
def create_lstm_model(extra=None, lr=0.001):
    inp = Input(shape=(None,))
    x = Embedding(numwords, options.embedding_size)(inp)
    h = LSTM(options.lstm_capacity, return_sequences=True)(x)
    if extra is not None:
        for _ in range(extra):
            h = LSTM(options.lstm_capacity, return_sequences=True)(h)
    out = TimeDistributed(Dense(numwords, activation='linear'))(h)
    model = Model(inp, out)
    model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                  loss=lambda y_true, y_pred: tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True))
    return model

def train_model(model, X_data, direction='forward'):
    for epoch in range(options.epochs):
        for batch in tqdm(X_data, desc=f"Epoch {epoch+1}"):
            n, l = batch.shape
            if direction == 'backward':
                batch_reversed = np.flip(batch, axis=1)
                batch_shifted = np.concatenate([np.ones((n, 1)), batch_reversed], axis=1)
                batch_out = np.concatenate([batch_reversed, np.zeros((n, 1))], axis=1)
            else:
                batch_shifted = np.concatenate([np.ones((n, 1)), batch], axis=1)
                batch_out = np.concatenate([batch, np.zeros((n, 1))], axis=1)
            loss = model.train_on_batch(batch_shifted, batch_out[:, :, None])
            tbw.add_scalar('lm/batch_loss', float(loss), epoch)
        print(f"Epoch {epoch+1} loss: {loss}")

def compute_perplexity(model, X_data, direction='forward'):
    total_loss = 0.0
    total_tokens = 0
    for batch in tqdm(X_data, desc="Computing Perplexity"):
        n, l = batch.shape
        if direction == 'backward':
            batch_reversed = np.flip(batch, axis=1)
            batch_shifted = np.concatenate([np.ones((n, 1)), batch_reversed], axis=1)
            batch_out = np.concatenate([batch_reversed, np.zeros((n, 1))], axis=1)
        else:
            batch_shifted = np.concatenate([np.ones((n, 1)), batch], axis=1)
            batch_out = np.concatenate([batch, np.zeros((n, 1))], axis=1)
        loss = model.evaluate(batch_shifted, batch_out[:, :, None], verbose=0)
        non_pad = np.sum(batch_out != 0)
        total_loss += loss * non_pad
        total_tokens += non_pad
    avg_loss = total_loss / total_tokens
    return np.exp(avg_loss)

def sentence_probability(model, sentence, direction='forward'):
    tokens = [w2i.get(word, w2i['<UNK>']) for word in sentence.split()]
    if direction == 'backward':
        tokens = tokens[::-1]
    inp_seq = np.array([1] + tokens)  # 1 מייצג <START>
    inp_seq = inp_seq[None, :]
    logits = model.predict(inp_seq)
    prob = 1.0
    for i in range(1, inp_seq.shape[1]):
        logit_i = logits[0, i-1, :]
        exp_logits = np.exp(logit_i - np.max(logit_i))
        softmax = exp_logits / np.sum(exp_logits)
        prob *= softmax[inp_seq[0, i]]
    return prob

In [19]:
models = []
model1 = create_lstm_model(extra=None, lr=0.01)      # 1 שכבת LSTM, forward
models.append(('Model 1 - 1L forward', model1, 'forward'))
model2 = create_lstm_model(extra=None, lr=0.01)      # 1 שכבת LSTM, backward
models.append(('Model 2 - 1L backward', model2, 'backward'))
model3 = create_lstm_model(extra=1, lr=0.001)        # 2 שכבות LSTM, forward
models.append(('Model 3 - 2L forward', model3, 'forward'))
model4 = create_lstm_model(extra=1, lr=0.001)        # 2 שכבות LSTM, backward
models.append(('Model 4 - 2L backward', model4, 'backward'))

for name, model, direction in models:
    print(name)
    train_model(model, X_train, direction)
    print(f"{name} training complete.")

Model 1 - 1L forward


Epoch 1: 100%|██████████| 186/186 [06:30<00:00,  2.10s/it]


Epoch 1 loss: 6.112192153930664


Epoch 2: 100%|██████████| 186/186 [06:17<00:00,  2.03s/it]


Epoch 2 loss: 5.621982097625732


Epoch 3: 100%|██████████| 186/186 [06:21<00:00,  2.05s/it]


Epoch 3 loss: 5.318646430969238


Epoch 4: 100%|██████████| 186/186 [06:11<00:00,  2.00s/it]


Epoch 4 loss: 5.086594104766846


Epoch 5: 100%|██████████| 186/186 [06:20<00:00,  2.05s/it]


Epoch 5 loss: 4.891504287719727


Epoch 6: 100%|██████████| 186/186 [06:18<00:00,  2.03s/it]


Epoch 6 loss: 4.720603942871094


Epoch 7: 100%|██████████| 186/186 [06:11<00:00,  2.00s/it]


Epoch 7 loss: 4.5683698654174805


Epoch 8: 100%|██████████| 186/186 [06:12<00:00,  2.00s/it]


Epoch 8 loss: 4.4310832023620605


Epoch 9: 100%|██████████| 186/186 [06:20<00:00,  2.04s/it]


Epoch 9 loss: 4.306709289550781


Epoch 10: 100%|██████████| 186/186 [06:12<00:00,  2.00s/it]


Epoch 10 loss: 4.193889141082764


Epoch 11: 100%|██████████| 186/186 [06:10<00:00,  1.99s/it]


Epoch 11 loss: 4.0908966064453125


Epoch 12: 100%|██████████| 186/186 [06:10<00:00,  1.99s/it]


Epoch 12 loss: 3.9964277744293213
Model 1 - 1L forward training complete.
Model 2 - 1L backward


Epoch 1: 100%|██████████| 186/186 [06:15<00:00,  2.02s/it]


Epoch 1 loss: 5.982174396514893


Epoch 2: 100%|██████████| 186/186 [06:12<00:00,  2.00s/it]


Epoch 2 loss: 5.528874397277832


Epoch 3: 100%|██████████| 186/186 [06:09<00:00,  1.99s/it]


Epoch 3 loss: 5.243752479553223


Epoch 4: 100%|██████████| 186/186 [06:08<00:00,  1.98s/it]


Epoch 4 loss: 5.015181064605713


Epoch 5: 100%|██████████| 186/186 [06:10<00:00,  1.99s/it]


Epoch 5 loss: 4.820666790008545


Epoch 6: 100%|██████████| 186/186 [06:08<00:00,  1.98s/it]


Epoch 6 loss: 4.6485395431518555


Epoch 7: 100%|██████████| 186/186 [06:12<00:00,  2.00s/it]


Epoch 7 loss: 4.495271682739258


Epoch 8: 100%|██████████| 186/186 [06:09<00:00,  1.99s/it]


Epoch 8 loss: 4.356474876403809


Epoch 9: 100%|██████████| 186/186 [06:08<00:00,  1.98s/it]


Epoch 9 loss: 4.228768348693848


Epoch 10: 100%|██████████| 186/186 [06:08<00:00,  1.98s/it]


Epoch 10 loss: 4.11252498626709


Epoch 11: 100%|██████████| 186/186 [06:11<00:00,  2.00s/it]


Epoch 11 loss: 4.006430149078369


Epoch 12: 100%|██████████| 186/186 [06:11<00:00,  2.00s/it]


Epoch 12 loss: 3.908710241317749
Model 2 - 1L backward training complete.
Model 3 - 2L forward


Epoch 1: 100%|██████████| 186/186 [07:08<00:00,  2.31s/it]


Epoch 1 loss: 6.645418643951416


Epoch 2: 100%|██████████| 186/186 [06:57<00:00,  2.24s/it]


Epoch 2 loss: 6.5245842933654785


Epoch 3: 100%|██████████| 186/186 [06:58<00:00,  2.25s/it]


Epoch 3 loss: 6.466971397399902


Epoch 4: 100%|██████████| 186/186 [06:59<00:00,  2.25s/it]


Epoch 4 loss: 6.402833461761475


Epoch 5: 100%|██████████| 186/186 [07:00<00:00,  2.26s/it]


Epoch 5 loss: 6.329519748687744


Epoch 6: 100%|██████████| 186/186 [07:01<00:00,  2.26s/it]


Epoch 6 loss: 6.258840560913086


Epoch 7: 100%|██████████| 186/186 [06:59<00:00,  2.25s/it]


Epoch 7 loss: 6.196720123291016


Epoch 8: 100%|██████████| 186/186 [07:00<00:00,  2.26s/it]


Epoch 8 loss: 6.1404852867126465


Epoch 9: 100%|██████████| 186/186 [06:59<00:00,  2.26s/it]


Epoch 9 loss: 6.091336727142334


Epoch 10: 100%|██████████| 186/186 [06:57<00:00,  2.25s/it]


Epoch 10 loss: 6.048396587371826


Epoch 11: 100%|██████████| 186/186 [06:59<00:00,  2.26s/it]


Epoch 11 loss: 6.005643844604492


Epoch 12: 100%|██████████| 186/186 [06:58<00:00,  2.25s/it]


Epoch 12 loss: 5.965763092041016
Model 3 - 2L forward training complete.
Model 4 - 2L backward


Epoch 1: 100%|██████████| 186/186 [07:07<00:00,  2.30s/it]


Epoch 1 loss: 6.623945236206055


Epoch 2: 100%|██████████| 186/186 [06:58<00:00,  2.25s/it]


Epoch 2 loss: 6.450170993804932


Epoch 3: 100%|██████████| 186/186 [06:57<00:00,  2.25s/it]


Epoch 3 loss: 6.380579471588135


Epoch 4: 100%|██████████| 186/186 [06:59<00:00,  2.25s/it]


Epoch 4 loss: 6.336943626403809


Epoch 5: 100%|██████████| 186/186 [07:00<00:00,  2.26s/it]


Epoch 5 loss: 6.303666591644287


Epoch 6: 100%|██████████| 186/186 [06:59<00:00,  2.25s/it]


Epoch 6 loss: 6.269862174987793


Epoch 7: 100%|██████████| 186/186 [06:59<00:00,  2.26s/it]


Epoch 7 loss: 6.235731601715088


Epoch 8: 100%|██████████| 186/186 [06:59<00:00,  2.26s/it]


Epoch 8 loss: 6.203881740570068


Epoch 9: 100%|██████████| 186/186 [06:58<00:00,  2.25s/it]


Epoch 9 loss: 6.1813459396362305


Epoch 10: 100%|██████████| 186/186 [06:58<00:00,  2.25s/it]


Epoch 10 loss: 6.150386333465576


Epoch 11: 100%|██████████| 186/186 [06:57<00:00,  2.25s/it]


Epoch 11 loss: 6.115967273712158


Epoch 12: 100%|██████████| 186/186 [06:56<00:00,  2.24s/it]

Epoch 12 loss: 6.079244136810303
Model 4 - 2L backward training complete.





In [20]:
for name, model, direction in models:
    train_ppl = compute_perplexity(model, X_train, direction)
    val_ppl = compute_perplexity(model, X_val, direction)
    test_ppl = compute_perplexity(model, X_test, direction)
    print(f"{name} - Train Perplexity: {train_ppl:.2f}, Val Perplexity: {val_ppl:.2f}, Test Perplexity: {test_ppl:.2f}")

sentence = "I love cupcakes"
print("Sentence probability (forward) for 'I love cupcakes':", sentence_probability(model1, sentence, direction='forward'))
print("Sentence probability (backward) for 'I love cupcakes':", sentence_probability(model2, sentence, direction='backward'))

Computing Perplexity: 100%|██████████| 186/186 [03:45<00:00,  1.21s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:30<00:00,  1.26s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:28<00:00,  1.19s/it]


Model 1 - 1L forward - Train Perplexity: 21.80, Val Perplexity: 73.70, Test Perplexity: 68.90


Computing Perplexity: 100%|██████████| 186/186 [03:45<00:00,  1.21s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:30<00:00,  1.25s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:28<00:00,  1.19s/it]


Model 2 - 1L backward - Train Perplexity: 20.42, Val Perplexity: 75.28, Test Perplexity: 71.11


Computing Perplexity: 100%|██████████| 186/186 [04:18<00:00,  1.39s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:32<00:00,  1.35s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:33<00:00,  1.38s/it]


Model 3 - 2L forward - Train Perplexity: 279.55, Val Perplexity: 250.00, Test Perplexity: 248.79


Computing Perplexity: 100%|██████████| 186/186 [04:08<00:00,  1.34s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:32<00:00,  1.35s/it]
Computing Perplexity: 100%|██████████| 24/24 [00:31<00:00,  1.31s/it]


Model 4 - 2L backward - Train Perplexity: 316.83, Val Perplexity: 310.18, Test Perplexity: 308.58
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
Sentence probability (forward) for 'I love cupcakes': 1.843243473559138e-08
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step
Sentence probability (backward) for 'I love cupcakes': 6.643910378335513e-09
