In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

D0602 21:50:07.697585670    2751 config.cc:196]                        gRPC EXPERIMENT call_status_override_on_cancellation   OFF (default:OFF)
D0602 21:50:07.697612431    2751 config.cc:196]                        gRPC EXPERIMENT call_v3                                OFF (default:OFF)
D0602 21:50:07.697616093    2751 config.cc:196]                        gRPC EXPERIMENT canary_client_privacy                  ON  (default:ON)
D0602 21:50:07.697618575    2751 config.cc:196]                        gRPC EXPERIMENT capture_base_context                   ON  (default:ON)
D0602 21:50:07.697621015    2751 config.cc:196]                        gRPC EXPERIMENT client_idleness                        ON  (default:ON)
D0602 21:50:07.697623338    2751 config.cc:196]                        gRPC EXPERIMENT client_privacy                         ON  (default:ON)
D0602 21:50:07.697625755    2751 config.cc:196]                        gRPC EXPERIMENT dapper_request_wire_size               OFF (default:O

In [3]:
train = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_03.csv')

In [4]:
def prepare_text_data(text, max_sequence_len, max_vocab_size=10000):
    # Initialize the tokenizer with a limited vocabulary size
    tokenizer = Tokenizer(num_words=max_vocab_size)
    tokenizer.fit_on_texts([text])
    total_words = min(max_vocab_size, len(tokenizer.word_index) + 1)

    input_sequences = []
    for line in text.split('\n'):
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)

    return predictors, label, tokenizer, total_words

In [5]:
train.head()

Unnamed: 0,essay_id,text,label,source,prompt,fold
0,6BB4BEB51A80,People can learn without making mistakes. Peop...,0,original_moth,Task: \n\nWrite an essay examining the ways in...,1
1,30A8FB981469,PHONES AND DRIVING\n\nIn this world in which w...,0,persuade_corpus,,2
2,B403A4E28BCE,"Okay, here's my essay:\n\nMaking Your Own Deci...",1,llama2_chat,Task: Write an essay exploring why teenagers s...,8
3,B8F0ECC9DC86,Dear : Principal\n\nI believe that allowing s...,1,mistral7binstruct_v2,\nTask: Should students be encouraged to parti...,8
4,159424F57C24,Well for one if you seek more then one person ...,0,persuade_corpus,,8


In [6]:
prepare_text_data(train.iloc[1].text,15)

(array([[  0,   0,   0, ...,   0,   0,  17],
        [  0,   0,   0, ...,   0,  17,   3],
        [  0,   0,   0, ...,   0,   0,  10],
        ...,
        [  3,   9,  15, ...,  32,  33, 191],
        [  9,  15,   5, ...,  33, 191,  69],
        [ 15,   5,   8, ..., 191,  69,  12]], dtype=int32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]),
 <keras.src.legacy.preprocessing.text.Tokenizer at 0x7c10aa6e3b20>,
 193)

In [7]:
# Combine text data for tokenization
if train is not None:
    #train_data, _ = train_test_split(train, test_size=0.5, random_state=5)
    combined_text = ' '.join(train['text'].tolist())

    # Tokenize and prepare sequences
    max_sequence_len = 40
    max_vocab_size = 1500  # Limit vocabulary size
    predictors, label, tokenizer, total_words = prepare_text_data(combined_text, max_sequence_len,max_vocab_size)

In [10]:
print(predictors[0])
print(label[0])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 25]
[0. 0. 0. ... 0. 0. 0.]


In [8]:
# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

def build_model(max_sequence_len, total_words):
    with tpu_strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(total_words, 32, input_length=max_sequence_len-1),
            tf.keras.layers.LSTM(32),
            tf.keras.layers.Dense(total_words, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local


I0000 00:00:1717365215.474427    2751 service.cc:145] XLA service 0x5579417a1980 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1717365215.474530    2751 service.cc:153]   StreamExecutor device (0): TPU, 2a886c8
I0000 00:00:1717365215.474535    2751 service.cc:153]   StreamExecutor device (1): TPU, 2a886c8
I0000 00:00:1717365215.474538    2751 service.cc:153]   StreamExecutor device (2): TPU, 2a886c8
I0000 00:00:1717365215.474542    2751 service.cc:153]   StreamExecutor device (3): TPU, 2a886c8
I0000 00:00:1717365215.474544    2751 service.cc:153]   StreamExecutor device (4): TPU, 2a886c8
I0000 00:00:1717365215.474547    2751 service.cc:153]   StreamExecutor device (5): TPU, 2a886c8
I0000 00:00:1717365215.474550    2751 service.cc:153]   StreamExecutor device (6): TPU, 2a886c8
I0000 00:00:1717365215.474552    2751 service.cc:153]   StreamExecutor device (7): TPU, 2a886c8


INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)
I

In [9]:
 # Build and compile the model
model = build_model(max_sequence_len, total_words)

# Train the model
model.fit(predictors, label, epochs=20,batch_size=256, verbose=1)
#model.fit(predictors, label, epochs=5, verbose=1, callbacks=[])

I0000 00:00:1717365220.084697    2751 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1/20


2024-06-02 21:59:13.213768: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node StatefulPartitionedCall.
I0000 00:00:1717365554.291269    3559 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(1dfcdfefcdf31df8:0:0), session_name()


[1m   15/57482[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11:26[0m 12ms/step - accuracy: 0.0245 - loss: 7.3105  

I0000 00:00:1717365555.783506    3559 tpu_compile_op_common.cc:245] Compilation of 1dfcdfefcdf31df8:0:0 with session name  took 1.492175926s and succeeded
I0000 00:00:1717365555.790423    3559 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(1dfcdfefcdf31df8:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_one_step_on_iterator_10443328887483456123", property.function_library_fingerprint = 7656539742441471266, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "32,39,;32,1500,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
I0000 00:00:1717365555.790472    3559 tpu_compilation_cache_interface.cc:541] After adding entry for key 1df

[1m57479/57482[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1179 - loss: 5.1781

I0000 00:00:1717366156.772570    3507 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(68e7849b6daf4c89:0:0), session_name()


[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m610s[0m 10ms/step - accuracy: 0.1179 - loss: 5.1781
Epoch 2/20


I0000 00:00:1717366157.783694    3507 tpu_compile_op_common.cc:245] Compilation of 68e7849b6daf4c89:0:0 with session name  took 1.011065568s and succeeded
I0000 00:00:1717366157.790371    3507 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(68e7849b6daf4c89:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_one_step_on_iterator_10443328887483456123", property.function_library_fingerprint = 7656539742441471266, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "12,39,;12,1500,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
I0000 00:00:1717366157.790407    3507 tpu_compilation_cache_interface.cc:541] After adding entry for key 68e

[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 10ms/step - accuracy: 0.1832 - loss: 4.4918
Epoch 3/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 10ms/step - accuracy: 0.1958 - loss: 4.3614
Epoch 4/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 10ms/step - accuracy: 0.2027 - loss: 4.2942
Epoch 5/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m605s[0m 10ms/step - accuracy: 0.2069 - loss: 4.2535
Epoch 6/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m602s[0m 10ms/step - accuracy: 0.2099 - loss: 4.2311
Epoch 7/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m601s[0m 10ms/step - accuracy: 0.2118 - loss: 4.2093
Epoch 8/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 10ms/step - accuracy: 0.2134 - loss: 4.1918
Epoch 9/20
[1m57482/57482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m602s[0m 10ms/step - accuracy: 0.2133 - loss: 4.

<keras.src.callbacks.history.History at 0x7c10753b7d90>

In [None]:
import tensorflow as tf
import keras
print(tf.__version__)
print(keras.__version__)

In [23]:
from IPython.display import FileLink

# Save the model to a file
model.save('text_generation_model01.h5')

# Create a download link
FileLink('text_generation_model01.h5')



ValueError: Device /job:localhost/replica:0/task:0/device:CPU:0 is not found

In [14]:
# Function to generate text
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=-1)
        output_word = tokenizer.index_word[predicted_word_index[0]]
        seed_text += " " + output_word
    return seed_text

In [17]:
# Generate and print sample text
print(generate_text("Once upon a time making", 10, model, max_sequence_len))

Once upon a time making a difference in the world and the people can be


In [25]:
print(generate_text("The cat is ", 10, model, max_sequence_len))

The cat is  a good idea to be able to do it and


In [None]:
print(generate_text("I'am  ", 10, model, max_sequence_len))