In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import pickle

In [2]:
Data = pd.read_csv("train.csv")

## Data Preprocessing and vocabulary creation

In [3]:
class Vocabulary:

    def __init__(self, name):
        PAD_token = 0   # Used for padding short sentences
        SOS_token = 1   # Start-of-sentence token
        EOS_token = 2   # End-of-sentence token
        OOV_token = 3
        self.name = name
        self.token2index = {}
        self.token2count = {}
        self.index2token = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", OOV_token: "OOV"}
        self.sorted_dict = {}
        self.num_tokens = 4  # unique tokens
        self.num_sentences = 0
        self.longest_sentence = 0
        self.top_500 = []
        

    def add_token(self, token):
        if token not in self.token2index:
            # First entry of token into vocabulary
            self.token2index[token] = self.num_tokens
            self.token2count[token] = 1
            self.index2token[self.num_tokens] = token
            self.num_tokens += 1  # increase number of unique tokens
        else:
            # token exists; increase token count
            self.token2count[token] += 1
            
    def add_sentence(self, sentence):
        sentence_len = 0
        for token in sentence:
            sentence_len += 1
            self.add_token(token)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_token(self, index):
        return self.index2token[index]

    def to_index(self, token):
        return self.token2index[token]
    
    def print_longest_sentence_length(self):
        print(f"Longest sentence = {self.longest_sentence}, #sentences = {self.num_sentences}")
    
    def top_k(self):
        self.sorted_dict = {key: val for key,val in sorted(self.token2count.items(), key=lambda item: item[1], reverse = True)}
        for k,v in self.sorted_dict.items():
            self.top_500.append(k)
            
        # generate new mapping
        self.index2token.clear()
        self.token2index.clear()
        self.index2token = {0: "PAD", 1: "SOS", 2: "EOS", 3: "OOV"}
        self.token2index = {"PAD": 0, "SOS":1, "EOS":2, "OOV": 3}
        
        cnt = 4
        for element in self.top_500:
            self.token2index[element] = cnt
            cnt += 1
            
        for k,v in self.token2index.items():
            self.index2token[v] = k

    
    def unique_tokens(self):
        return len(self.token2index.keys())
        
    
    
    def postprocess_embedding(self,tokenList,length):
        tokenList.insert(0,1)
        if len(tokenList) > length+1:
            return tokenList[:length+1] + [2]
        elif len(tokenList) < length+1:
            tokenList.append(2)
            for i in range(len(tokenList)-1,length+1):
                tokenList.append(0)  # PAD appended
            return tokenList
        return tokenList + [2]
    
    def create_embedding(self,tokenList,embedding_length):
        embedded_list = []
        for element in tokenList:
            if element in self.top_500[:500]:
                embedded_list.append(self.token2index[element])
            else:
                embedded_list.append(3)  # index for OOV_token
        return self.postprocess_embedding(embedded_list,embedding_length)

In [4]:
codeToken = Data['targetTokens'].values  #  tokenized correct code
targetToken = list(codeToken)
#print(codeToken)

codeToken = Data['sourceTokens'].values  # tokenized incorrect code
srcToken = list(codeToken)

In [5]:
def NormaliseList(Data,_name,Tokens):
    X = [] # empty list for tokenised code
    list_of_dictionaries = []

    CodeToken = Data[_name].values
    CodeToken = list(CodeToken)


    dataTypes = ["int","char","double","float"]
    invalid = ['^','&','(',')','{','}','[',']','||','==','=','!=',';',':',',','!','+','++','--','-','*','**','/','/=','%=','*=','-=','+=','%','<=','<','>=','>',"main","int","char","double","float","long"]

    
    
    for codeNum,code in enumerate(Tokens): # iterate over codes
        #print(code)
        tempDict = {}
        VAR_normalisation = {} # dictionary to convert the normalised parameters back to actual parameters
        var = 0
        pos = -1
        Code = eval(code)

        for line in Code:                   # iterate over lines of a single code
            pos = -1
            for index,word in enumerate(line):  # iterate over words/tokens of a single line of a code
                if index <= pos:
                    continue
                if word in dataTypes:  # token is a datatype
                    pos = index + 1
                    while pos < len(line):
                        if line[pos] in tempDict.keys():
                            pos += 1
                            continue
                        if line[pos] not in invalid and line[pos].isnumeric() == False: # and line[pos].isalpha() == False:
                            tempDict[line[pos]] = "VAR_" + word + '_' + str(var)
                            VAR_normalisation["VAR_" + word + '_' + str(var)] = line[pos]
                            var += 1
                        pos += 1

        list_of_dictionaries.append(VAR_normalisation)

        temp = []
        for token in eval(CodeToken[codeNum]):
            if token in tempDict.keys():
                temp.append(tempDict[token])
            else:
                temp.append(token)
        X.append(temp)

#     for codeNum,code in enumerate(Tokens):
#         temp = []
#         for token in eval(CodeToken[codeNum]):
#             temp.append(token)
#         X.append(temp)  

    return X,list_of_dictionaries


In [8]:
# Normalised code tokens
X, dict_x = NormaliseList_new(Data,"sourceLineTokens",srcToken)
Y, dict_y = NormaliseList_new(Data,"targetLineTokens",targetToken)

In [7]:
# creating separate vocabularies for source and target code token lists
from Assignment_2 import Vocabulary
vocab = []
srcVocabulary = Vocabulary("Assignment-2-src")
targetVocabulary = Vocabulary("Assignment-2-target")

ModuleNotFoundError: No module named 'Assignment_2'

In [11]:

for index,tokenList in enumerate(X):
    srcVocabulary.add_sentence(tokenList)
    
srcVocabulary.print_longest_sentence_length()
srcVocabulary.top_k()
#print(top_250,top_500,top_1000)

for index,tokenList in enumerate(Y):
    targetVocabulary.add_sentence(tokenList)
        
targetVocabulary.print_longest_sentence_length()
targetVocabulary.top_k()

vocab.append(srcVocabulary)
vocab.append(targetVocabulary)
with open("Dictionaries.pkl",'wb') as f:
    pickle.dump(vocab,f,pickle.HIGHEST_PROTOCOL)

Longest sentence = 154, #sentences = 14643
Longest sentence = 169, #sentences = 14643


In [84]:
X_train = []
Y_train = []
embedding_length = 35
for index,tokenList in enumerate(X):
    X_train.append(srcVocabulary.create_embedding(tokenList,embedding_length))

for index,tokenList in enumerate(Y):
    Y_train.append(targetVocabulary.create_embedding(tokenList,embedding_length))

In [85]:
for i in range(len(Y_train)-10000):
    print((X_train[i]))


[1, 19, 4, 8, 48, 9, 46, 23, 17, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 8, 9, 4, 13, 20, 14, 5, 24, 62, 20, 29, 20, 43, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 12, 4, 3, 97, 330, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 90, 94, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 19, 4, 3, 42, 8, 42, 9, 15, 64, 5, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 360, 4, 8, 6, 9, 6, 18, 6, 14, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 37, 4, 92, 27, 8, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[1, 88, 8, 4, 9, 13, 6, 14, 24, 6, 29, 43, 6, 50, 58, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 12, 4, 3, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 26, 23, 192, 4, 4, 34, 22, 41, 5, 102, 40, 22, 4, 47, 22, 53, 5, 102, 40, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 69, 4, 4, 8, 48, 62, 25, 17, 5, 27, 4, 27, 9, 48, 164, 46, 17, 5, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 49, 104, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 37, 4, 3, 6, 27, 4, 6, 3, 5, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 269, 4, 18, 6, 126, 6, 3, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 37, 4, 110, 27, 8, 11, 72, 10, 5, 7, 2, 0, 0, 0, 0,

[1, 254, 4, 18, 6, 8, 15, 16, 6, 9, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 12, 4, 30, 6, 8, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 111, 76, 84, 17, 6, 95, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 12, 4, 3, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 32, 8, 9, 40, 13, 14, 16, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 37, 4, 79, 6, 27, 26, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 32, 8, 4, 63, 18, 11, 13, 10, 6, 32, 14, 5, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 139, 6, 143, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## LSTM Model training 

In [86]:
latent_dim = 32
batch_size = 64
epochs = 35

#input_characters = sorted(list(input_characters))
#target_characters = sorted(list(target_characters))
num_encoder_tokens = 504 #len(input_characters)  # 250
num_decoder_tokens = 504 #len(target_characters)  # 250
max_encoder_seq_length = 37 #max([len(txt) for txt in input_texts])  # 22
max_decoder_seq_length = 37 #max([len(txt) for txt in target_texts]) # 22

print("Number of samples:", len(X_train))
print("Number of unique input tokens:", srcVocabulary.unique_tokens())
print("Number of unique output tokens:", targetVocabulary.unique_tokens())
print("Max sequence length for inputs:", 37)
print("Max sequence length for outputs:", 37)

#input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
#target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

# one-hot encoder



Number of samples: 14643
Number of unique input tokens: 3699
Number of unique output tokens: 2929
Max sequence length for inputs: 37
Max sequence length for outputs: 37


In [87]:
encoder_input_data = np.zeros(
    (len(X_train), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(X_train), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(X_train), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for i, (input_text, target_text) in enumerate(zip(X_train, Y_train)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, token] = 1.0
    #encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, token] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, token] = 1.0
    #decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    #decoder_target_data[i, t:, target_token_index[" "]] = 1.0



In [59]:
"""
## Build the model
"""

# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

"""
## Train the model
"""

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Save model
model.save("s2s2")

"""
## Run inference (sampling)
1. encode input and retrieve initial decoder state
2. run one step of decoder with this initial state
and a "start of sequence" token as target.
Output will be the next target token.
3. Repeat with the current target token and current states
"""

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
INFO:tensorflow:Assets written to: s2s2/assets


'\n## Run inference (sampling)\n1. encode input and retrieve initial decoder state\n2. run one step of decoder with this initial state\nand a "start of sequence" token as target.\nOutput will be the next target token.\n3. Repeat with the current target token and current states\n'

In [5]:
# Define sampling models
# Restore the model and construct the encoder and decoder.
model = keras.models.load_model("s2s2")

latent_dim = 32
batch_size = 64
epochs = 15

#input_characters = sorted(list(input_characters))
#target_characters = sorted(list(target_characters))
num_encoder_tokens = 504 #len(input_characters)  # 250
num_decoder_tokens = 504 #len(target_characters)  # 250
max_encoder_seq_length = 37 #max([len(txt) for txt in input_texts])  # 22
max_decoder_seq_length = 37 #max([len(txt) for txt in target_texts]) # 22

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_5")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
#reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
#reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq,encoder_model,decoder_model,num_decoder_tokens,targetVocabulary):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = targetVocabulary.to_token(sampled_token_index)
        decoded_sentence.append(sampled_char)

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "EOS" or len(decoded_sentence) > num_decoder_tokens:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence


"""
You can now generate decoded sentences as such:
"""
def Revert_back(X,dict_):
    temp = []
    for element in X:
        if element in dict_.keys():
            temp.append(dict_[element])
        elif element == "OOV":
            temp.append("OOV_Token")
        else :
            temp.append(element)
        
    return temp
    
# for seq_index in range(len(Valid_X)):
#     #Take one sequence (part of the training set)model = keras.models.load_model("s2s")
#     input_seq = encoder_input_data[seq_index : seq_index + 1]
#     #decoded_sentence = Revert_back(decode_sequence(input_seq),dict_y[seq_index],seq_index)
#     decoded_sentence = Revert_back(decode_sequence(input_seq,encoder_model,decoder_model,num_decoder_tokens,targetVocabulary),dict_x[seq_index])
#     print("-")
#     print("Tokenised Output sentence:", Revert_back(Y[seq_index],dict_y[seq_index],seq_index))
#     print("Tokenised Output sentence:", Revert_back(Y_val[seq_index],dict_y[seq_index]))
#     print("Decoded sentence:", Revert_back(decoded_sentence[:-1],dict_y[seq_index]))
    
# reader = csv.reader(open('valid.csv', 'rb'))
# #reader1 = csv.reader(open('output1.csv', 'rb'))
# writer = csv.writer(open('output.csv', 'wb'))
# writer.writerow(["Unnamed:0","sourceText","targetText","sourceLineText","targetLineText","lineNums_Text","sourceTokens","targetTokns","sourceLineTokens","taretLineTokens","fixedTokens"])
#--------------------------------------------------------------------------
# write_Back = []
    
# for seq_index in range(len(Valid_X)):
#     # Take one sequence (part of the training set)model = keras.models.load_model("s2s")
#     input_seq = encoder_input_data[seq_index : seq_index + 1]
#     #decoded_sentence = Revert_back(decode_sequence(input_seq),dict_y[seq_index],seq_index)
#     decoded_sentence = Revert_back(decode_sequence(input_seq,encoder_model,decoder_model,num_decoder_tokens,targetVocabulary),dict_x[seq_index])
#     #print("-")
#     #print("Tokenised Output sentence:", Revert_back(Y[seq_index],dict_y[seq_index],seq_index))
#     #print("Tokenised Output sentence:", Revert_back(Y_val[seq_index],dict_y[seq_index]))
#     #print("Decoded sentence:", Revert_back(decoded_sentence[:-1],dict_y[seq_index]))
#     write_Back.append(Revert_back(decoded_sentence[:-1],dict_y[seq_index]))

# Data["fixedTokens"] = write_Back
# Data.to_csv("output.csv")
        

## Try Model on Validation data

In [3]:
with open("Dictionaries.pkl",'rb') as f:
    vocab = pickle.load(f)
    srcVocabulary = vocab[0]
    targetVocabulary = vocab[1]

AttributeError: Can't get attribute 'Vocabulary' on <module '__main__'>

In [13]:
Valid = pd.read_csv("valid.csv")

In [14]:
codeToken = Valid['targetTokens'].values  #  tokenized correct code
targetToken = list(codeToken)
#print(codeToken)

codeToken = Valid['sourceTokens'].values  # tokenized incorrect code
srcToken = list(codeToken)

In [15]:
# Normalised code tokens
X_val, dict_val_x = NormaliseList_new(Valid,"sourceLineTokens",srcToken)
Y_val, dict_val_y = NormaliseList_new(Valid,"targetLineTokens",targetToken)

In [16]:
# for index,tokenList in enumerate(X_val):
#     srcVocabulary.add_sentence(tokenList)
    
#srcVocabulary.print_longest_sentence_length()
#srcVocabulary.top_k()
#print(top_250,top_500,top_1000)

# for index,tokenList in enumerate(Y_val):
#      targetVocabulary.add_sentence(tokenList)
        
# targetVocabulary.print_longest_sentence_length()
# targetVocabulary.top_k()

# for i in range(len(Y_train)-10000):
#     print((X_train[i]))

latent_dim = 32
batch_size = 64
epochs = 15

#input_characters = sorted(list(input_characters))
#target_characters = sorted(list(target_characters))
num_encoder_tokens = 504 #len(input_characters)  # 250
num_decoder_tokens = 504 #len(target_characters)  # 250
max_encoder_seq_length = 37 #max([len(txt) for txt in input_texts])  # 22
max_decoder_seq_length = 37 #max([len(txt) for txt in target_texts]) # 22

In [93]:
Valid_X = []
Valid_Y = []
embedding_length = 35
for index,tokenList in enumerate(X_val):
    Valid_X.append(srcVocabulary.create_embedding(tokenList,embedding_length))

for index,tokenList in enumerate(Y_val):
    Valid_Y.append(targetVocabulary.create_embedding(tokenList,embedding_length))

In [96]:
for i in range(len(Valid_X)):
    print((Valid_X[i]))

[1, 19, 4, 4, 8, 4, 9, 5, 55, 13, 5, 28, 4, 3, 4, 14, 5, 42, 24, 5, 5, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 37, 4, 30, 6, 27, 8, 11, 72, 10, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 8, 9, 13, 44, 67, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 8, 9, 6, 13, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 231, 23, 8, 4, 18, 11, 9, 10, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 142, 11, 8, 10, 9, 18, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 8, 9, 13, 4, 14, 6, 24, 11, 127, 15, 16, 10, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 17, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 49, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

[1, 12, 4, 3, 6, 26, 6, 34, 6, 41, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 12, 4, 30, 6, 3, 39, 8, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 36, 4, 8, 9, 17, 6, 18, 11, 13, 10, 46, 100, 6, 14, 33, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 19, 21, 8, 35, 9, 5, 21, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 19, 4, 80, 6, 120, 5, 46, 4, 72, 6, 86, 5, 28, 4, 80, 6, 120, 5, 46, 4, 72, 15, 16, 6, 86, 15, 16, 5, 28, 4, 80, 6, 120, 5, 46, 2]
[1, 21, 8, 9, 13, 15, 4, 4, 14, 48, 67, 5, 20, 128, 4, 67, 6, 24, 5, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 8, 9, 13, 15, 14, 11, 24, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 19, 4, 87, 42, 18, 11, 8, 10, 42, 125, 28, 101, 42, 38, 11, 9, 15, 16, 10, 42, 129, 28, 54, 11, 13, 15, 40, 10, 14, 24, 5, 2, 0, 0, 0, 0]
[1, 19, 4, 1

In [97]:
encoder_input_data = np.zeros(
    (len(Valid_X), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(Valid_Y), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(Valid_Y), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

for i, (input_text, target_text) in enumerate(zip(Valid_X, Valid_Y)):
    for t, token in enumerate(input_text):
        encoder_input_data[i, t, token] = 1.0
    #encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, token in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, token] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, token] = 1.0
    #decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    #decoder_target_data[i, t:, target_token_index[" "]] = 1.0

In [98]:
# Define sampling models
# Restore the model and construct the encoder and decoder.
model = keras.models.load_model("s2s2")

latent_dim = 32

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_5")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)


In [71]:
reader = csv.reader(open('valid.csv', 'rb'))
#reader1 = csv.reader(open('output1.csv', 'rb'))
writer = csv.writer(open('output.csv', 'wb'))
#writer.writerow(["Unnamed:0","sourceText","targetText","sourceLineText","targetLineText","lineNums_Text","sourceTokens","targetTokns","sourceLineTokens","taretLineTokens","fixedTokens"])
    
write_Back = []    
for seq_index in range(len(Valid_X)):
    # Take one sequence (part of the training set)model = keras.models.load_model("s2s")
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    #decoded_sentence = Revert_back(decode_sequence(input_seq),dict_y[seq_index],seq_index)
    decoded_sentence = Revert_back(decode_sequence(input_seq,encoder_model,decoder_model,num_decoder_tokens,targetVocabulary),dict_val_x[seq_index])
    #print("-")
    #print("Tokenised Output sentence:", Revert_back(Y[seq_index],dict_y[seq_index],seq_index))
    #print("Tokenised Output sentence:", (Revert_back(Y_val[seq_index],dict_val_y[seq_index])))
    #print("Decoded sentence:", (Revert_back(decoded_sentence[:-1],dict_val_x[seq_index])))
    write_Back.append(Revert_back(decoded_sentence[:-1],dict_val_y[seq_index]))
    

Valid["fixedTokens"] = write_Back    
Valid.to_csv("output.csv")
    


In [7]:
def NormaliseList_new(Data,_name,Tokens):
    X = [] # empty list for tokenised code
    list_of_dictionaries = []

    CodeToken = Data[_name].values
    CodeToken = list(CodeToken)


    dataTypes = ["int","char","double","float"]
    invalid = ["main",';',':','(',')','{','}','[',']']

    
    
    for codeNum,code in enumerate(Tokens): # iterate over codes
        #print(code)
        Dict = {"int":[],"float":[],"char":[],"double":[]}
        VAR_normalisation = {}
        
        Code = eval(code)

        for line in Code:                   # iterate over lines of a single code
            for index in range(len(line)):  # iterate over words/tokens of a single line of a code
                word = line[index]
                if word in dataTypes:
                    for pos in range(index+1,len(line),1):
                        if line[pos] in invalid:
                            break
                        elif line[pos] != ',' and not line[pos].isnumeric() and not (len(line[pos])>=2 and line[pos][0] == '"' and line[pos][-1] == '"'):
                            Dict[word].append(line[pos])
                
                
                
        #list_of_dictionaries.append(VAR_normalisation)

#         temp = []
#         for token in eval(CodeToken[codeNum]):
#             if token in tempDict.keys():
#                 temp.append(tempDict[token])
#             else:
#                 temp.append(token)
#         X.append(temp)
        
        var_int = 0
        var_char = 0
        var_float = 0
        var_double = 0
        #VAR_normalisation
        temp = []
        
        for token in eval(CodeToken[codeNum]):
            temp.append(token)
            for dType in dataTypes:
                for values in Dict[dType]:
                    if values == token:
                        
                        del temp[-1]
                        var_int += int(dType == "int")
                        var_char += int(dType == "char")
                        var_float += int(dType == "float")
                        var_double += int(dType == "double")
                        
                        if dType == 'int':
                            VAR_normalisation["VAR_" + dType + '_' + str(var_int)] = token
                            temp.append("VAR_" + dType + '_' + str(var_int))
                        if dType == 'char':
                            VAR_normalisation["VAR_" + dType + '_' + str(var_char)] = token
                            temp.append("VAR_" + dType + '_' + str(var_char))
                        if dType == 'float':
                            VAR_normalisation["VAR_" + dType + '_' + str(var_float)] = token
                            temp.append("VAR_" + dType + '_' + str(var_float))
                        if dType == 'double':
                            VAR_normalisation["VAR_" + dType + '_' + str(var_double)] = token
                            temp.append("VAR_" + dType + '_' + str(var_double))
                            
                        break
                        
        
        X.append(temp)                         
        list_of_dictionaries.append(VAR_normalisation)               
                        
                        
                        

#     for codeNum,code in enumerate(Tokens):
#         temp = []
#         for token in eval(CodeToken[codeNum]):
#             temp.append(token)
#         X.append(temp)  

    return X,list_of_dictionaries
