In [None]:
!pip install keras --quiet
!pip install tensorflow --quiet

In [None]:
# Mount Google Drive at /content/drive; a later cell loads the FastText vectors
# from /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
# Download the NLTK 'stopwords' and 'punkt' resources (no-op when already up to date).
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from keras.utils import to_categorical
import numpy as np
from gensim.models import FastText
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the arXiv dataset from the working directory and print its schema
# (column names, non-null counts, dtypes).
df = pd.read_csv('arxiv_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51774 entries, 0 to 51773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     51774 non-null  object
 1   summaries  51774 non-null  object
 2   terms      51774 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [None]:
# Drop the unused 'terms' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps the cell re-runnable: a second run is a no-op, not a KeyError.
df = df.drop(columns=['terms'], errors='ignore')

In [None]:
# Remove exact duplicate rows, then show how many duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

0

In [None]:
# Keep only the first 1000 rows. The explicit .copy() makes this an independent frame, so the
# later `df['titles'] = ...` assignments do not trigger pandas' SettingWithCopyWarning.
df = df.iloc[:1000, :].copy()


In [None]:
# Preview the titles column (pandas truncates the middle of long Series).
# NOTE(review): the saved output is already lower-cased and punctuation-free, which suggests it was
# captured after the preprocessing cell below had run — confirm with a fresh Restart & Run All.
df['titles']

0      survey on semantic stereo matching semantic de...
1      futureai guiding principles and consensus reco...
2      enforcing mutual consistency of hard regions f...
3      parameter decoupling strategy for semisupervis...
4      backgroundforeground segmentation for interior...
                             ...                        
995    deepigeos a deep interactive geodesic framewor...
996    d densely convolutional networks for volumetri...
997    uinet interactive artificial neural networks f...
998           oneshot learning for semantic segmentation
999    exploring and exploiting diversity for image s...
Name: titles, Length: 1000, dtype: object

In [None]:
# NOTE(review): preprocess_text is not defined in any cell visible in this notebook;
# it has to exist in the kernel already for this line to run.
df['titles'] = df['titles'].apply(preprocess_text)

# Fit a word-level vocabulary on the cleaned titles and encode each title as integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['titles'])
sequences = tokenizer.texts_to_sequences(df['titles'])

# Pad every title at the end ('post') up to the length of the longest one.
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Shift by one position: inputs drop the last column, targets drop the first.
input_sequences = padded_sequences[:, :-1]
target_sequences = padded_sequences[:, 1:]

In [None]:
# Inspect the shifted input matrix built in the previous cell
# (NumPy elides the middle rows and columns of large arrays).
input_sequences

array([[  99,   21,   13, ...,    0,    0,    0],
       [ 828,  829,  830, ...,    0,    0,    0],
       [ 509,  219,  143, ...,    0,    0,    0],
       ...,
       [2269,   38,  342, ...,    0,    0,    0],
       [ 319,    5,    2, ...,    0,    0,    0],
       [ 826,    4,  477, ...,    0,    0,    0]], dtype=int32)

In [None]:
import re

def add_end_token(text):
    """Mark sentence/title boundaries in ``text`` with ``<end>``.

    Steps, in order:
      1. Insert `` <end>`` after every run of characters that ends in a period.
      2. Strip surrounding whitespace and glue each marker onto the text
         before it (`` <end>`` becomes ``<end>``).
      3. If the result does not already finish with ``<end>``, append `` <end>``.

    Returns the marked-up string.
    """
    marked = re.sub(r'([^.]*\.)', r'\1 <end>', text)
    compact = marked.strip().replace(' <end>', '<end>')
    return compact if compact.endswith('<end>') else compact + ' <end>'

# Append the end-of-title marker to every title.
df['titles'] = df['titles'].map(add_end_token)


In [None]:
# Same Keras namespace as the other layer imports in this notebook (tensorflow.keras).
from tensorflow.keras.layers import Dropout

# define the input and output data
data = df['titles']

# tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# Vocabulary size is reused for the targets, the embedding matrix and the output layer.
# +1 because row/class 0 is kept for the padding id, which is not in word_index.
vocab_size = len(tokenizer.word_index) + 1
max_sequence_length = max(len(seq) for seq in sequences)

# Build next-word training pairs: every proper prefix of a title predicts the word after it.
prefixes = []
next_words = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        prefixes.append(sequence[:i])
        next_words.append(sequence[i])

# Pad and one-hot encode once for the whole corpus rather than once per sample.
# input_data: (num_pairs, max_sequence_length), left-padded with zeros (pad_sequences default).
# output_data: (num_pairs, vocab_size) one-hot rows, as required by categorical_crossentropy.
input_data = pad_sequences(prefixes, maxlen=max_sequence_length)
output_data = to_categorical(np.array(next_words), num_classes=vocab_size)

# Pretrained word vectors read from the mounted Drive (word2vec text format).
fasttext = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/wiki-news-300d-1M.vec')

# create an embedding matrix for the words in our vocabulary;
# words missing from the pretrained vectors keep an all-zero row
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in fasttext:
        embedding_matrix[i] = fasttext[word]

# define the model: pretrained (but trainable) embeddings -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Train for the full 200 epochs starting at epoch 0. initial_epoch is deliberately not passed:
# it is meant for resuming an already-trained model, and on a freshly built model
# initial_epoch=100 would run only epochs 101-200 and leave the first 100 out of `history`.
history = model.fit(input_data, output_data, epochs=200)

In [None]:
# Evaluation subset: the first 500 titles.
# NOTE(review): these titles are part of the data the model was fitted on, so the numbers below
# are training-set metrics; a held-out split is needed for a true test score.
test_data = df['titles'][0:500]
test_sequences = tokenizer.texts_to_sequences(test_data)

# Build (prefix -> next word) pairs exactly as for training, so the model is scored against the
# TRUE next word. Scoring it against its own model.predict() output always yields accuracy 1.0.
test_prefixes = []
test_next_words = []
for test_sequence in test_sequences:
    for i in range(1, len(test_sequence)):
        test_prefixes.append(test_sequence[:i])
        test_next_words.append(test_sequence[i])

test_input = pad_sequences(test_prefixes, maxlen=max_sequence_length)
test_output = to_categorical(np.array(test_next_words), num_classes=len(tokenizer.word_index) + 1)

# Evaluate the performance of the model on the evaluation pairs
test_loss, test_acc = model.evaluate(test_input, test_output)
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')


Test loss: 0.5311, Test accuracy: 1.0000


In [None]:
from google.colab import files

# Save the trained model and download that same file. The path is defined here so the cell
# does not depend on a `model_path` variable left over from an earlier session.
model_path = 'lstm_model_4.h5'
model.save(model_path)
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
from tensorflow.keras.models import model_from_json

# Export the model architecture (no weights) as JSON.
model_json = model.to_json()
with open("lstm_model_4.json", "w") as json_file:
    json_file.write(model_json)

# Export the fitted tokenizer too: the reload cell reads /content/lstm_tokenizer.json and no
# other cell in this notebook writes it.
# NOTE(review): assumes the working directory is /content (the Colab default) — confirm.
with open("lstm_tokenizer.json", "w") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

# Download JSON files
files.download("lstm_model_4.json")
files.download("lstm_tokenizer.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
# Import from the same namespace the Tokenizer itself was imported from (tensorflow.keras).
# NOTE(review): standalone `keras.preprocessing.text` is believed to be absent in Keras 3 — confirm.
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# tokenizer_from_json takes the JSON text itself, so the file content is passed straight through
# (no parse/re-serialise round trip, and the earlier `data` variable is not overwritten).
with open('/content/lstm_tokenizer.json', 'r') as f:
    tokenizer = tokenizer_from_json(f.read())

# Reload the trained model saved earlier in HDF5 format.
model = tf.keras.models.load_model('/content/lstm_model_4.h5')


In [None]:
def generate_text(model, tokenizer, input_text, max_length=20, input_length=None):
    """Greedily extend ``input_text`` one word at a time with the trained model.

    Args:
        model: object with ``predict(batch)`` returning one score vector per row,
            indexed by word id.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and ``index_word``.
        input_text: seed text to continue.
        max_length: maximum number of whitespace-separated words in the result.
            A seed that is already this long (or longer) is returned unchanged.
        input_length: length the encoded text is left-padded/truncated to before
            prediction. Defaults to ``max_length - 1``.
            NOTE(review): the model was fitted on inputs padded to the training
            ``max_sequence_length``; passing that value here keeps inference
            consistent with training — confirm.

    Returns:
        The seed text followed by the generated words, stripped of surrounding
        whitespace. Generation stops at ``max_length`` words, when the model
        predicts the ``end`` token, or when the predicted id has no word.
    """
    if input_length is None:
        input_length = max_length - 1

    generated_text = input_text
    # `<` rather than `==`: a seed longer than max_length must also terminate.
    while len(generated_text.split()) < max_length:
        # encode the text generated so far and left-pad it to the model input length
        input_sequence = tokenizer.texts_to_sequences([generated_text])[0]
        input_sequence = pad_sequences([input_sequence], maxlen=input_length, padding='pre')
        # verbose=0 suppresses the per-call progress bar printed on every step
        prediction = model.predict(input_sequence, verbose=0)[0]
        predicted_index = int(np.argmax(prediction))
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        # An empty word (e.g. padding id 0) would leave the text, and therefore the next
        # prediction, unchanged forever — treat it as a stop signal like the end token.
        if not predicted_word or predicted_word == 'end':
            break
        generated_text += ' ' + predicted_word
    return generated_text.strip()


In [None]:
# Continue a seed phrase with the reloaded model and print the generated title.
input_text = 'ensemble learning based'
generated_text = generate_text(model=model, tokenizer=tokenizer, input_text=input_text)
print(generated_text)

ensemble learning based on classifier prediction confidence and comprehensive learning particle swarm optimisation for polyp localisation
