In [2]:
import os
# Move into the assignment folder, which is given relative to the directory
# the notebook starts in. The guard makes the cell safe to re-run: without it a
# second execution resolves the relative path from inside the folder and fails
# with FileNotFoundError.
PROJECT_DIR = "drive/MyDrive/DA6401/DA6401_A3/"
if not os.path.normpath(os.getcwd()).endswith(os.path.normpath(PROJECT_DIR)):
  os.chdir(PROJECT_DIR)

In [2]:
import pandas as pd
train_file = "lexicons/ta.translit.sampled.train.tsv"
dev_file = "lexicons/ta.translit.sampled.dev.tsv"
test_file = "lexicons/ta.translit.sampled.test.tsv"


def load_lexicon(path):
  """Read one header-less, tab-separated lexicon file into a DataFrame.

  Only empty fields are treated as missing. With pandas' default NA handling,
  ordinary words such as "nan", "null" or "NA" would be parsed as NaN and
  later dropped as non-string rows.

  Args:
    path: Path of the TSV file to read.

  Returns:
    A DataFrame whose columns are labelled 0, 1, 2, ... in file order.
  """
  return pd.read_csv(
      path,
      header=None,
      sep="\t",
      keep_default_na=False,
      na_values=[""],
  )


train_data = load_lexicon(train_file)
dev_data = load_lexicon(dev_file)
test_data = load_lexicon(test_file)


In [3]:
# Preview the first ten rows. In the rendered output column 0 holds the Tamil
# word, column 1 a romanised spelling of it and column 2 an integer
# (presumably a count -- confirm against the dataset description).
train_data.head(10)

Unnamed: 0,0,1,2
0,ஃபியட்,fiat,2
1,ஃபியட்,phiyat,1
2,ஃபியட்,piyat,1
3,ஃபிரான்ஸ்,firaans,1
4,ஃபிரான்ஸ்,france,2
5,ஃபிரான்ஸ்,francis,1
6,ஃபிரான்ஸ்,piraance,1
7,ஃபிலிம்,filim,2
8,ஃபிலிம்,film,3
9,ஃபிலிம்,pilim,1


In [4]:
import numpy as np
import keras

In [5]:
# Notebook-level training configuration:
#   batch_size -- number of samples per gradient update
#   epochs     -- number of passes over the training set
#   latent_dim -- dimensionality of the encoding space
batch_size, epochs, latent_dim = 64, 10, 256

In [None]:
# Maps the `cell_type` hyperparameter to the Keras recurrent layer class that
# is instantiated as `layer(units, ...)`.
# "RNN" must point at SimpleRNN: `keras.layers.RNN` is the generic wrapper whose
# first argument is a cell object, so calling it with a unit count fails.
CELL_MAP = {
    "RNN" : keras.layers.SimpleRNN,
    "LSTM" : keras.layers.LSTM,
    "GRU" : keras.layers.GRU
}

In [42]:
class Char2CharModel:
  """Character-level encoder-decoder model for word transliteration.

  Words are one-hot encoded character by character. The target side is wrapped
  in a start marker ("\t") and an end marker ("\n"), and both sides are padded
  with "_". `train` builds and fits the Keras model, `predictor_setup` derives
  the step-wise inference models from it, `decode` transliterates one word and
  `evaluate` scores a whole data frame.
  """

  def __init__(self):

    #hyperparameters
    self.latent_dim = 256   # width of the last encoder layer and of every decoder layer
    self.hidden_size = 64   # width of encoder layers below the last one
    self.epochs = 10
    self.batch_size = 64
    self.cell_type = "LSTM"  # key into CELL_MAP
    self.num_encoder_layers = 1
    self.num_decoder_layers = 1

    #model reqs (filled in by preprocess(..., train=True) and train())
    self.num_encoder_tokens = 0
    self.num_decoder_tokens = 0
    self.max_encoder_seq_length = 0
    self.max_decoder_seq_length = 0
    self.input_token_index = None
    self.target_token_index = None
    self.reverse_input_char_index = None
    self.reverse_target_char_index = None
    self.model = None
    self.encoder_model = None
    self.decoder_model = None

  def preprocess(self, data, train=False):
    """One-hot encode a lexicon frame into encoder/decoder arrays.

    Column 1 of `data` is used as the input word and column 0 as the target
    word. Rows where either value is not a string are skipped.

    With `train=True` the vocabulary and maximum lengths are computed from
    `data` and stored on the instance. With `train=False` a previously stored
    vocabulary is reused so that token indices and array widths match the
    trained model; characters missing from it are encoded as all-zero vectors.
    If no vocabulary has been stored yet, one is computed from `data` without
    being stored.

    Args:
      data: DataFrame with the target word in column 0 and the input word in
        column 1.
      train: Whether to store the computed vocabulary on the instance.

    Returns:
      Tuple `(input_characters, target_characters, encoder_input_data,
      decoder_input_data, decoder_target_data)`. The first two are the
      vocabularies ordered by token index; the arrays are float32 with shape
      `(samples, time, tokens)`. `decoder_target_data` is `decoder_input_data`
      shifted one step ahead, without the start marker.
    """
    pad_char = "_"

    input_texts = []
    target_texts = []
    for input_text, target_text in zip(data[1], data[0]):
      if not isinstance(input_text, str) or not isinstance(target_text, str):
        continue
      input_texts.append(input_text)
      # "\t" is the start-of-sequence marker and "\n" the end-of-sequence marker.
      target_texts.append("\t" + target_text + "\n")

    # `default=0` keeps an empty (or fully filtered) frame from raising.
    longest_input = max((len(txt) for txt in input_texts), default=0)
    longest_target = max((len(txt) for txt in target_texts), default=0)

    reuse_vocabulary = (
        not train
        and self.input_token_index is not None
        and self.target_token_index is not None
    )
    if reuse_vocabulary:
      input_token_index = self.input_token_index
      target_token_index = self.target_token_index
      input_characters = sorted(input_token_index, key=input_token_index.get)
      target_characters = sorted(target_token_index, key=target_token_index.get)
      # Pad at least as far as the training data so the encoder sees the same
      # amount of padding it was trained with.
      max_encoder_seq_length = max(self.max_encoder_seq_length, longest_input)
      max_decoder_seq_length = max(self.max_decoder_seq_length, longest_target)
    else:
      input_characters = sorted(set(pad_char).union(*input_texts))
      target_characters = sorted(set(pad_char).union(*target_texts))
      input_token_index = {char: i for i, char in enumerate(input_characters)}
      target_token_index = {char: i for i, char in enumerate(target_characters)}
      max_encoder_seq_length = longest_input
      max_decoder_seq_length = longest_target

    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    reverse_input_char_index = {i: char for char, i in input_token_index.items()}
    reverse_target_char_index = {i: char for char, i in target_token_index.items()}

    num_samples = len(input_texts)
    encoder_input_data = np.zeros(
        (num_samples, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )
    decoder_input_data = np.zeros(
        (num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )
    decoder_target_data = np.zeros(
        (num_samples, max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )

    input_pad = input_token_index[pad_char]
    target_pad = target_token_index[pad_char]
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
      for t, char in enumerate(input_text):
        token = input_token_index.get(char)
        if token is not None:
          encoder_input_data[i, t, token] = 1.0
      # Pad from the text length, not from a leaked loop variable, so empty
      # strings are padded correctly too.
      encoder_input_data[i, len(input_text):, input_pad] = 1.0

      for t, char in enumerate(target_text):
        token = target_token_index.get(char)
        if token is None:
          continue
        decoder_input_data[i, t, token] = 1.0
        if t > 0:
          # The target is one timestep ahead and omits the start marker.
          decoder_target_data[i, t - 1, token] = 1.0
      decoder_input_data[i, len(target_text):, target_pad] = 1.0
      decoder_target_data[i, len(target_text) - 1:, target_pad] = 1.0

    if train:
      self.num_encoder_tokens = num_encoder_tokens
      self.num_decoder_tokens = num_decoder_tokens
      self.max_encoder_seq_length = max_encoder_seq_length
      self.max_decoder_seq_length = max_decoder_seq_length
      self.input_token_index = input_token_index
      self.target_token_index = target_token_index
      self.reverse_input_char_index = reverse_input_char_index
      self.reverse_target_char_index = reverse_target_char_index

    return input_characters, target_characters, encoder_input_data, decoder_input_data, decoder_target_data

  def train(self, train_data, dev_data):
    """Build the seq2seq model, fit it and prepare the inference models.

    The encoder is a stack of `num_encoder_layers` recurrent layers; the final
    states of its last layer initialise every one of the `num_decoder_layers`
    decoder layers. The last encoder layer and all decoder layers are
    `latent_dim` wide so the states are compatible; lower encoder layers are
    `hidden_size` wide.

    Args:
      train_data: Training frame, see `preprocess`.
      dev_data: Validation frame, encoded with the training vocabulary.

    Raises:
      ValueError: If `cell_type` is not a key of CELL_MAP or a layer count is
        smaller than one.
    """
    if self.cell_type not in CELL_MAP:
      raise ValueError(f"Unknown cell_type {self.cell_type!r}; expected one of {sorted(CELL_MAP)}")
    if self.num_encoder_layers < 1 or self.num_decoder_layers < 1:
      raise ValueError("num_encoder_layers and num_decoder_layers must be at least 1")

    (train_input_characters, train_target_characters, train_encoder_input_data,
     train_decoder_input_data, train_decoder_target_data) = self.preprocess(train_data, train=True)

    (_, _, dev_encoder_input_data,
     dev_decoder_input_data, dev_decoder_target_data) = self.preprocess(dev_data)

    recurrent_layer = CELL_MAP[self.cell_type]
    num_encoder_tokens = len(train_input_characters)
    num_decoder_tokens = len(train_target_characters)

    # Encoder. The Input tensor keeps its own name so that it is still the
    # model input after the layers have been stacked on top of it.
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder_sequence = encoder_inputs
    encoder_states = []
    for i in range(self.num_encoder_layers):
      is_last = i == self.num_encoder_layers - 1
      encoder_layer = recurrent_layer(
          self.latent_dim if is_last else self.hidden_size,
          return_sequences=not is_last,  # lower layers must feed a sequence upwards
          return_state=True,
          name=f"encoder_rnn_{i}",
      )
      encoder_result = encoder_layer(encoder_sequence)
      encoder_sequence = encoder_result[0]
      # One state tensor for RNN/GRU, two (h, c) for LSTM; only the states of
      # the last layer are handed to the decoder.
      encoder_states = list(encoder_result[1:])

    # Decoder. Each layer returns full sequences plus its states; the states
    # are unused during training but needed for inference.
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
    decoder_sequence = decoder_inputs
    for i in range(self.num_decoder_layers):
      decoder_layer = recurrent_layer(
          self.latent_dim,
          return_sequences=True,
          return_state=True,
          name=f"decoder_rnn_{i}",
      )
      decoder_sequence = decoder_layer(decoder_sequence, initial_state=encoder_states)[0]

    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax", name="decoder_dense")
    decoder_outputs = decoder_dense(decoder_sequence)

    # The model turns `encoder_input_data` & `decoder_input_data` into
    # `decoder_target_data`.
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    model.compile(
        optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
    )
    model.fit(
        [train_encoder_input_data, train_decoder_input_data],
        train_decoder_target_data,
        batch_size=self.batch_size,
        epochs=self.epochs,
        validation_data=([dev_encoder_input_data, dev_decoder_input_data],
                         dev_decoder_target_data)
    )
    self.model = model
    self.predictor_setup()

  def predictor_setup(self):
    """Derive the inference-time encoder and decoder models from `self.model`.

    Layers are looked up by the names assigned in `train` ("encoder_rnn_<i>",
    "decoder_rnn_<i>", "decoder_dense") rather than by position, so the lookup
    does not depend on the layer count or on Keras' layer ordering.

    Sets `self.encoder_model` (input sequence -> final encoder states) and
    `self.decoder_model` (previous token plus the states of every decoder layer
    -> token probabilities plus the updated states).

    Raises:
      ValueError: If no model has been trained or assigned yet.
    """
    model = self.model
    if model is None:
      raise ValueError("predictor_setup() needs a trained model; call train() first")
    latent_dim = self.latent_dim

    encoder_inputs = model.input[0]
    last_encoder_layer = model.get_layer(f"encoder_rnn_{self.num_encoder_layers - 1}")
    encoder_states = list(last_encoder_layer.output[1:])
    encoder_model = keras.Model(encoder_inputs, encoder_states)

    decoder_inputs = model.input[1]
    states_per_layer = len(encoder_states)
    decoder_states_inputs = []
    decoder_states = []
    decoder_sequence = decoder_inputs
    for i in range(self.num_decoder_layers):
      decoder_layer = model.get_layer(f"decoder_rnn_{i}")
      layer_state_inputs = [keras.Input(shape=(latent_dim,)) for _ in range(states_per_layer)]
      decoder_result = decoder_layer(decoder_sequence, initial_state=layer_state_inputs)
      decoder_sequence = decoder_result[0]
      decoder_states_inputs.extend(layer_state_inputs)
      decoder_states.extend(decoder_result[1:])

    decoder_outputs = model.get_layer("decoder_dense")(decoder_sequence)
    decoder_model = keras.Model(
        [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
    )

    self.encoder_model = encoder_model
    self.decoder_model = decoder_model

  def decode(self, word):
    """Greedily transliterate one input word.

    Characters that are not in the training vocabulary are fed as all-zero
    vectors, and words longer than the longest training input are encoded at
    their full length instead of raising.

    Args:
      word: Input string; may be empty.

    Returns:
      The decoded characters, including the trailing "\n" end marker when the
      model emits one. Decoding stops at the end marker or once the output is
      longer than `max_decoder_seq_length`.

    Raises:
      RuntimeError: If the inference models have not been set up.
    """
    if self.encoder_model is None or self.decoder_model is None:
      raise RuntimeError("decode() needs the inference models; call train() first")

    seq_length = max(self.max_encoder_seq_length, len(word))
    input_seq = np.zeros((1, seq_length, self.num_encoder_tokens), dtype="float32")
    for t, char in enumerate(word):
      token = self.input_token_index.get(char)
      if token is not None:
        input_seq[0, t, token] = 1.0
    input_seq[0, len(word):, self.input_token_index["_"]] = 1.0

    # Encode the input as state vectors. A model with a single output may
    # return a bare array, so normalise to a list.
    encoder_states = self.encoder_model.predict(input_seq, verbose=0)
    if not isinstance(encoder_states, (list, tuple)):
      encoder_states = [encoder_states]
    # Every decoder layer starts from the same encoder states, as in training.
    states_value = list(encoder_states) * self.num_decoder_layers

    # Target sequence of length 1 holding the start character.
    target_seq = np.zeros((1, 1, self.num_decoder_tokens))
    target_seq[0, 0, self.target_token_index["\t"]] = 1.0

    decoded_word = ""
    stop_condition = False
    while not stop_condition:
      decoder_result = self.decoder_model.predict(
          [target_seq] + states_value, verbose=0
      )
      output_tokens = decoder_result[0]

      # Greedy choice of the most probable next character.
      sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
      sampled_char = self.reverse_target_char_index[sampled_token_index]
      decoded_word += sampled_char

      # Exit condition: either hit max length or find stop character.
      if sampled_char == "\n" or len(decoded_word) > self.max_decoder_seq_length:
        stop_condition = True

      # Feed the sampled character and the updated states back in.
      target_seq = np.zeros((1, 1, self.num_decoder_tokens))
      target_seq[0, 0, sampled_token_index] = 1.0
      states_value = list(decoder_result[1:])
    return decoded_word

  def evaluate(self, test_data):
    """Print and return the exact-match word accuracy on `test_data`.

    Each input word (column 1) is decoded; the end marker and padding
    characters are stripped and the result is compared with the reference word
    in column 0. Rows where either value is not a string are skipped, as in
    `preprocess`.

    Side effect: the stripped predictions are written to `test_data[3]`
    (`None` for skipped rows).

    Args:
      test_data: Frame with the reference in column 0 and the input in column 1.

    Returns:
      Fraction of evaluated rows whose prediction equals the reference, or 0.0
      when no row could be evaluated.
    """
    references = list(test_data.iloc[:, 0])
    sources = list(test_data.iloc[:, 1])

    predictions = []
    evaluated = 0
    correct = 0
    for source, reference in zip(sources, references):
      if not isinstance(source, str) or not isinstance(reference, str):
        predictions.append(None)
        continue
      prediction = self.decode(source).replace("\n", "").replace("_", "")
      predictions.append(prediction)
      evaluated += 1
      correct += int(prediction == reference)

    test_data[3] = predictions
    accuracy = correct / evaluated if evaluated else 0.0
    print(f"Accuracy on test set: {accuracy}")
    return accuracy

In [43]:
# Create the transliteration model with its default hyperparameters.
agent = Char2CharModel()

In [44]:
# Fit on the training lexicon, using the dev lexicon as validation data.
agent.train(train_data, dev_data)

Epoch 1/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.7225 - loss: 1.0722 - val_accuracy: 0.7387 - val_loss: 0.9022
Epoch 2/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.8174 - loss: 0.6264 - val_accuracy: 0.7535 - val_loss: 0.8397
Epoch 3/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.8418 - loss: 0.5366 - val_accuracy: 0.7572 - val_loss: 0.8461
Epoch 4/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.8621 - loss: 0.4644 - val_accuracy: 0.7824 - val_loss: 0.7548
Epoch 5/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.8858 - loss: 0.3840 - val_accuracy: 0.8380 - val_loss: 0.5469
Epoch 6/10
[1m1066/1066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.9103 - loss: 0.3016 - val_accuracy: 0.8237 - val_loss: 0.5885
Epoch 7/10


In [51]:
# Spot-check: transliterate a single romanised word (the returned string
# includes the trailing "\n" end marker).
agent.decode("shrey")

'ச்ரீ\n'

In [28]:
# Push the local commits to the `origin` remote. The recorded output shows this
# attempt failing because git could not read a username for https://github.com.
!git push origin main

fatal: could not read Username for 'https://github.com': No such device or address


In [9]:
username = "JG-0212"
# SECURITY: a GitHub personal access token was hard-coded in the comment below.
# It is redacted here; the exposed token should be revoked on GitHub and any
# replacement supplied at runtime (environment variable or getpass), never
# written into the notebook.
# token = "<REDACTED>"
# remote_url = f"https://{username}:{token}@github.com/{username}/DA6401_A3.git"

In [30]:
!git push remote_url main

fatal: 'remote_url' does not appear to be a git repository
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [31]:
# remote_url

'https://JG-0212:<REDACTED>@github.com/JG-0212/DA6401_A3.git'

In [6]:
# Set the global git author identity (email and user name) used for commits
# made from this runtime.
!git config --global user.email "jpsai6594@gmail.com"
!git config --global user.name "JG-0212"

In [8]:
# Push again. The recorded output shows GitHub push protection rejecting the
# push (GH013: "Push cannot contain secrets"), so the secret has to be removed
# from the commits being pushed before this can succeed.
!git push origin main

Enumerating objects: 5, done.
Counting objects:  20% (1/5)Counting objects:  40% (2/5)Counting objects:  60% (3/5)Counting objects:  80% (4/5)Counting objects: 100% (5/5)Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 4.03 KiB | 374.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets[K
remote: 
remote:     [K
remote: