In [65]:
import nltk
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from heapq import nlargest

# PREPROCESSING

In [66]:
# Load the dataset
train = pd.read_csv('E:/sub/NLP/project_dataset/train.csv', encoding='iso-8859-1')
test = pd.read_csv('E:/sub/NLP/project_dataset/eval.csv', encoding='iso-8859-1')

In [67]:
# Define function for text preprocessing
def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    #Remove punctuation
    text="".join([char for char in text if char not in string.punctuation])
    
    #adding start and end tokens
    text = '<start> ' + text + ' <end>'
    
    # Tokenize into sentences
    sentences = sent_tokenize(text)
     # Tokenize the text
    words = word_tokenize(text)
  
    # Join the preprocessed sentences
    preprocessed_sentences = []
    preprocessed_sentences.append(' '.join(words))
    
    # Join the preprocessed sentences
    preprocessed_text = ' '.join(preprocessed_sentences)
    
    return preprocessed_text

# Apply the preprocessing function to the train text and headlines columns
train['input'] = train['input'].apply(preprocess_text)

# Save the preprocessed train dataset
train.to_csv('preprocessed_train.csv', index=False)
# Print the preprocessed train dataset
print(train.head())

print("-----------------------------------------------------------------------------")

# Apply the preprocessing function to the test text and headlines columns
test['input'] = test['input'].apply(preprocess_text)

# Save the preprocessed test dataset
test.to_csv('preprocessed_test.csv', index=False)
# Print the preprocessed test dataset
print(test.head())


                                               input  \
0  < start > so i think we can not live if old pe...   
1  < start > so i think we can not live if old pe...   
2  < start > so i think we can not live if old pe...   
3  < start > so i think we can not live if old pe...   
4                  < start > for not use car < end >   

                                              target  
0  So I think we would not be alive if our ancest...  
1  So I think we could not live if older people d...  
2  So I think we can not live if old people could...  
3  So I think we can not live if old people can n...  
4                          Not for use with a car .   
-----------------------------------------------------------------------------
                                               input  \
0  < start > new and new technology has been intr...   
1  < start > new and new technology has been intr...   
2  < start > new and new technology has been intr...   
3  < start > new and new techno

# POS

In [68]:
# Define function for train POS tagging
def pos_tagging(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Perform POS tagging
    pos_tags = nltk.pos_tag(tokens)
    # Extract the POS tags
    pos_tags_only = [tag[1] for tag in pos_tags]
    # Join the POS tags into a string
    pos_tags_string = " ".join(pos_tags_only)
    return pos_tags_string

In [69]:
# Apply POS tagging to the train text and headlines columns
train['input_POS'] = train['input'].apply(pos_tagging)

# Apply POS tagging to the test text and headlines columns
test['input_POS'] = test['input'].apply(pos_tagging)

In [70]:
# Save the POS-tagged train dataset
train.to_csv('preprocessed_pos.csv', index=False)

# Save the POS-tagged test dataset
test.to_csv('preprocessed_pos_test.csv', index=False)

In [71]:
print(train['input_POS'].head())
print("-----------------------------------------------------------------------------")
print(test['input_POS'].head())

0    JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...
1    JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...
2    JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...
3    JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...
4                        JJ NN NN IN RB JJ NN JJ NN NN
Name: input_POS, dtype: object
-----------------------------------------------------------------------------
0    JJ NN NNP JJ CC JJ NN VBZ VBN VBN TO DT NN JJ ...
1    JJ NN NNP JJ CC JJ NN VBZ VBN VBN TO DT NN JJ ...
2    JJ NN NNP JJ CC JJ NN VBZ VBN VBN TO DT NN JJ ...
3    JJ NN NNP JJ CC JJ NN VBZ VBN VBN TO DT NN JJ ...
4    JJ NN NNP CD JJ NN VBZ IN DT RB JJ NN IN NN NN...
Name: input_POS, dtype: object


# Count Vectorization

In [72]:
# create a document term matrix
from sklearn.feature_extraction.text import CountVectorizer
cv_count = CountVectorizer()

In [73]:
X_count = cv_count.fit(train['input'])
# automatically remove any one character word like a
print(X_count.vocabulary_)
# unique words in corpus
print(cv_count.get_feature_names_out())

{'start': 2055, 'so': 1986, 'think': 2224, 'we': 2418, 'can': 300, 'not': 1477, 'live': 1274, 'if': 1085, 'old': 1510, 'people': 1589, 'could': 491, 'find': 849, 'siences': 1960, 'and': 107, 'tecnologies': 2171, 'they': 2216, 'did': 591, 'developped': 588, 'end': 713, 'for': 874, 'use': 2360, 'car': 305, 'here': 1040, 'was': 2407, 'no': 1466, 'promise': 1724, 'of': 1498, 'morning': 1419, 'except': 771, 'that': 2195, 'looked': 1288, 'up': 2353, 'through': 2239, 'the': 2197, 'trees': 2299, 'saw': 1889, 'how': 1069, 'low': 1303, 'forest': 878, 'had': 996, 'swung': 2133, 'thus': 2244, 'even': 752, 'today': 2253, 'sex': 1939, 'is': 1172, 'considered': 457, 'as': 144, 'least': 1241, 'important': 1103, 'topic': 2267, 'in': 1111, 'many': 1339, 'parts': 1573, 'india': 1121, 'image': 1089, 'you': 2491, 'salf': 1879, 'are': 130, 'wark': 2404, 'factory': 810, 'just': 1194, 'to': 2252, 'do': 623, 'one': 1518, 'thing': 2222, 'like': 1261, 'pot': 1661, 'taire': 2139, 'on': 1516, 'fire': 851, 'will': 

In [74]:
# create a document term matrix
X_count = cv_count.transform(train['input'])
print(X_count.shape)
#document term matrix
print(X_count.toarray())
df = pd.DataFrame(X_count.toarray(),columns=cv_count.get_feature_names_out())
print(df)

(3016, 2500)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
      ability  able  aboue  about  above  absence  academic  acadmic  accept  \
0           0     0      0      0      0        0         0        0       0   
1           0     0      0      0      0        0         0        0       0   
2           0     0      0      0      0        0         0        0       0   
3           0     0      0      0      0        0         0        0       0   
4           0     0      0      0      0        0         0        0       0   
...       ...   ...    ...    ...    ...      ...       ...      ...     ...   
3011        0     0      0      0      0        0         0        0       0   
3012        0     0      0      0      0        0         0        0       0   
3013        0     0      0      0      0        0         0        0       0   
3014        0     0      0      0      0        0         0        

# N-Grams Vecorization

In [75]:
#bigram only
cv_N_Grams = CountVectorizer(ngram_range=(2,2))
X_N_Grams = cv_N_Grams.fit_transform(train['input'])
print(X_N_Grams.shape)
#document term matrix
print(X_N_Grams.toarray())
df = pd.DataFrame(X_N_Grams.toarray(),columns=cv_N_Grams.get_feature_names_out())
df

(3016, 9124)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,ability for,ability to,able to,aboue advance,about adding,about alternative,about any,about commensence,about community,about contracts,...,your teeth,your time,your toys,your views,your way,your work,youth end,youth have,youths who,ypu fact
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# mean bigram and trigram
cv_N_Grams = CountVectorizer(ngram_range=(2,3))
X_N_Grams = cv_N_Grams.fit_transform(train['input'])
print(X_N_Grams.shape)
#document term matrix
print(X_N_Grams.toarray())
df = pd.DataFrame(X_N_Grams.toarray(),columns=cv_N_Grams.get_feature_names_out())
df

(3016, 20704)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Unnamed: 0,ability for,ability for students,ability to,ability to perform,ability to think,able to,able to buy,able to deal,able to do,able to end,...,your work,your work in,your work your,youth end,youth have,youth have less,youths who,youths who acquire,ypu fact,ypu fact end
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Vectorization

In [77]:
import string
from nltk import word_tokenize
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
def clean_text(txt):
    txt_nopunct = "".join([c for c in txt if c not in string.punctuation])
    tokens = word_tokenize(txt_nopunct)
    txt_clean = [word for word in tokens if word not in stopwords]
    tokens_stem = [ps.stem(word) for word in txt_clean]
    return tokens_stem

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec_train = TfidfVectorizer(analyzer=clean_text)
tfidf_vec_train_fit = tfidf_vec_train.fit(train['input'])
X_tfidf = tfidf_vec_train.fit_transform(train['input'])
print(X_tfidf.shape)
df = pd.DataFrame(X_tfidf.toarray(),columns=tfidf_vec_train.get_feature_names_out())
df.head()

(3016, 1877)


Unnamed: 0,abil,abl,abou,absenc,academ,acadm,accept,access,accid,accord,...,yellowston,yet,yong,york,youg,young,younger,youngster,youth,ypu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Engineering

In [79]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# Function to calculate sentence length and complexity
def calculate_sentence_features(text):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    
    # Calculate average sentence length
    total_sentence_length = sum(len(sentence.split()) for sentence in sentences)
    avg_sentence_length = total_sentence_length / len(sentences)
    
    # Calculate type-token ratio
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    words = [word for word in words if word not in stopwords.words('english')]
    types = len(set(words))
    tokens = len(words)
    ttr = types / tokens
    
    return avg_sentence_length, ttr


# Example usage of the function
text = "This is a sample sentence. It has multiple clauses and is of average length."
avg_sentence_length, ttr = calculate_sentence_features(text)
print(avg_sentence_length)
print(ttr)

train['input_len'] = train['input'].apply(lambda x: len(x))
train.head()

train['target_len'] = train['target'].apply(lambda x: len(x))
train.head()


7.0
1.0


Unnamed: 0,input,target,input_POS,input_len,target_len
0,< start > so i think we can not live if old pe...,So I think we would not be alive if our ancest...,JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...,125,94
1,< start > so i think we can not live if old pe...,So I think we could not live if older people d...,JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...,125,88
2,< start > so i think we can not live if old pe...,So I think we can not live if old people could...,JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...,125,108
3,< start > so i think we can not live if old pe...,So I think we can not live if old people can n...,JJ NN NNP RB JJ VBP PRP MD RB VB IN JJ NNS MD ...,125,111
4,< start > for not use car < end >,Not for use with a car .,JJ NN NN IN RB JJ NN JJ NN NN,33,25


# Padding

In [80]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define a list of input text strings
texts = train['input']
# Create a tokenizer and fit on the text data
vocab_size = 10000000
oov_tok = '0' # Out of Vocabulary
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
word_index = tokenizer.word_index
tokenizer.fit_on_texts(texts)
# Convert the text data to a sequence of integer-encoded tokens
sequences = tokenizer.texts_to_sequences(texts)
# Pad the sequences to a maximum length of 20
max_length = 20
input_padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
print(tokenizer.sequences_to_texts(input_padded_sequences)[1])

# Define a list of text strings
texts2 = train['target']
# Create a tokenizer and fit on the text data
tokenizer2 = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
word_index2 = tokenizer2.word_index
tokenizer2.fit_on_texts(texts2)
# Convert the text data to a sequence of integer-encoded tokens
sequences2 = tokenizer2.texts_to_sequences(texts2)
target_padded_sequences = pad_sequences(sequences2, maxlen=max_length, padding='post')
print(tokenizer.sequences_to_texts(target_padded_sequences)[1])

think we can not live if old people could not find siences and tecnologies and they did not developped end
so that example you go this should more agree are bring this almost transportation the concerned 0 0 0 0


# Model

In [81]:
import numpy as np
def decoder_input_data (text):
    # Define a 2D numpy array
    data = np.array(text)
    # Shift the data to the right by one position and replace first column with zeros
    shifted_data = np.roll(data, 1, axis=1)
    shifted_data[:, 0] = 0
    return shifted_data

In [82]:
from keras.layers import Input, LSTM, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split

latent_dim = 256
batch_size = 32 
epochs = 10 
input_dim = 20
output_dim = 20

encoder_input_data = input_padded_sequences
decoder_input_data  = decoder_input_data (input_padded_sequences)
decoder_target_data = target_padded_sequences

# Define the input sequence
encoder_inputs = Input(shape=(None, input_dim))

# Define the encoder LSTM
encoder_lstm = LSTM(latent_dim, return_state=True)

# Get the encoder outputs and states
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)


# Discard the encoder outputs and only keep the states
encoder_states = [state_h, state_c]

# Define the decoder inputs
decoder_inputs = Input(shape=(None, output_dim))

# Define the decoder LSTM
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Get the decoder outputs and states
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Define the output layer
decoder_dense = Dense(output_dim, activation='softmax')

# Apply the output layer to the decoder outputs
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model inputs and outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [83]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\engine\training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\TREIKA\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 232, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model_3' (type Functional).
    
    Input 0 of layer "lstm_6" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 20)
    
    Call arguments received by layer 'model_3' (type Functional):
      • inputs=('tf.Tensor(shape=(None, 20), dtype=int32)', 'tf.Tensor(shape=(None, 20), dtype=int32)')
      • training=True
      • mask=None


# Translation

In [None]:
def translate(sentence, encoder, decoder):
    
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp, padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1,units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
    return result, sentence
    
    dec_input = tf.expand_dims([predicted_id], 0)
return result, sentence