In [2]:
!pip install keras --quiet
!pip install tensorflow --quiet

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from keras.utils import to_categorical
import numpy as np
from gensim.models import FastText
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df = pd.read_csv('arxiv_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51774 entries, 0 to 51773
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     51774 non-null  object
 1   summaries  51774 non-null  object
 2   terms      51774 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [6]:
df.drop(['terms'],inplace =True, axis=1)

In [7]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [8]:
df['summaries']

0        Stereo matching is one of the widely used tech...
1        The recent advancements in artificial intellig...
2        In this paper, we proposed a novel mutual cons...
3        Consistency training has proven to be an advan...
4        To ensure safety in automated driving, the cor...
                               ...                        
51767    Diffusion Tensor Imaging (DTI) is a non-invasi...
51768    Single molecule fluorescence microscopy is a p...
51770    We discuss a method for tracking individual mo...
51771    We attempt to set a mathematical foundation of...
51772    Diffusion Tensor Imaging (DTI) allows estimati...
Name: summaries, Length: 38985, dtype: object

In [9]:
df= df.iloc[:1000, :]


In [10]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]|[\d]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)
    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text
df['summaries'] = df['summaries'].apply(preprocess_text)

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['summaries'])
sequences = tokenizer.texts_to_sequences(df['summaries'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Create input and target sequences
input_sequences = padded_sequences[:,:-1]
target_sequences = padded_sequences[:,1:]

In [12]:

# define the input data
data = df['summaries']

# tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# prepare the input and output data
input_data = []
output_data = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        input_data.append(sequence[:i])
        output_data.append(sequence[i])
input_data = pad_sequences(input_data)
output_data = to_categorical(output_data)

fasttext = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/wiki-news-300d-1M.vec')

# create an embedding matrix for the words in our vocabulary
embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in fasttext:
        embedding_matrix[i] = fasttext[word]


# define the model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(LSTM(128))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# fit the model
#model.fit(input_data, output_data, epochs=2)



In [13]:
import tensorflow as tf

# Load the pre-trained model
model = tf.keras.models.load_model('/content/lstm_model.h5')
model.fit(input_data, output_data, initial_epoch=2, epochs=5)


from google.colab import files
model_path = 'lstm_model_2.h5'
model.save(model_path)
files.download(model_path)

Epoch 3/5
Epoch 4/5
Epoch 5/5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
#from google.colab import files

#model_path = 'lstm_model.h5'
#model.save(model_path)
#files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
total_words = df['summaries'].str.split().str.len().sum()
average_words = df['summaries'].str.split().str.len().mean()
print(total_words)
print(average_words)

176448
176.448


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   titles     1000 non-null   object
 1   summaries  1000 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


In [23]:
df['summaries'].sample(5)

391    to assist researchers to identify environmenta...
306    magnetic resonance mr protocols rely on severa...
273    in image segmentation there is often more than...
961    we propose a novel active learning framework c...
17     deep neural networks have been a prevailing te...
Name: summaries, dtype: object

In [None]:

# define a function to generate text using the trained model
def generate_text(model, tokenizer, input_text, num_words):
    for _ in range(num_words):
        # tokenize the input text
        input_sequence = tokenizer.texts_to_sequences([input_text])[0]
        # pad the input sequence
        input_sequence = pad_sequences([input_sequence])
        # make a prediction
        prediction = model.predict(input_sequence)
        # get the index of the predicted word
        predicted_index = np.argmax(prediction)
        # get the predicted word
        predicted_word = tokenizer.index_word[predicted_index]
        # update the input text
        input_text += ' ' + predicted_word
    return input_text


In [27]:
input_text = 'we propose a novel active learning framework'
generated_text = generate_text(model, tokenizer, input_text, 10)
print(generated_text)

# visualize the predictions made by the model
input_sequence = tokenizer.texts_to_sequences([input_text])[0]
input_sequence = pad_sequences([input_sequence])
prediction = model.predict(input_sequence)[0]
predicted_indices = np.argsort(prediction)[::-1][:5]
predicted_words = [tokenizer.index_word[i] for i in predicted_indices]
predicted_probabilities = prediction[predicted_indices]
for word, probability in zip(predicted_words, predicted_probabilities):
    print(f'{word}: {probability:.2f}')

we propose a novel active learning framework to effectively generate the segmentation masks to be segmented into
to: 0.34
that: 0.16
for: 0.15
based: 0.06
which: 0.05


# Titles

In [48]:
df['titles']

0      survey on semantic stereo matching semantic de...
1      futureai guiding principles and consensus reco...
2      enforcing mutual consistency of hard regions f...
3      parameter decoupling strategy for semisupervis...
4      backgroundforeground segmentation for interior...
                             ...                        
995    deepigeos a deep interactive geodesic framewor...
996    d densely convolutional networks for volumetri...
997    uinet interactive artificial neural networks f...
998           oneshot learning for semantic segmentation
999    exploring and exploiting diversity for image s...
Name: titles, Length: 1000, dtype: object

In [31]:
df['titles'] = df['titles'].apply(preprocess_text)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['titles'])
sequences = tokenizer.texts_to_sequences(df['titles'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Create input and target sequences
input_sequences = padded_sequences[:,:-1]
target_sequences = padded_sequences[:,1:]

In [33]:
input_sequences

array([[  99,   21,   13, ...,    0,    0,    0],
       [ 828,  829,  830, ...,    0,    0,    0],
       [ 509,  219,  143, ...,    0,    0,    0],
       ...,
       [2269,   38,  342, ...,    0,    0,    0],
       [ 319,    5,    2, ...,    0,    0,    0],
       [ 826,    4,  477, ...,    0,    0,    0]], dtype=int32)

In [76]:
import re

def add_end_token(text):
    # Add end token after every sentence
    text = re.sub(r'([^.]*\.)', r'\1 <end>', text)
    # Remove trailing spaces and <end> tokens
    text = text.strip().replace(' <end>', '<end>')
    # Add <end> token if missing
    if not text.endswith('<end>'):
        text += ' <end>'
    return text

# Example usage:
df['titles'] = df['titles'].apply(add_end_token)


In [77]:
from keras.layers import Dropout

# define the input and output data
data = df['titles']

# tokenize the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# prepare the input and output data
max_sequence_length = max([len(seq) for seq in sequences])
input_data = []
output_data = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        input_seq = sequence[:i]
        input_seq = pad_sequences([input_seq], maxlen=max_sequence_length)[0]
        output_seq = to_categorical(sequence[i], num_classes=len(tokenizer.word_index) + 1)
        input_data.append(input_seq)
        output_data.append(output_seq)
input_data = np.array(input_data)
output_data = np.array(output_data)

fasttext = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/wiki-news-300d-1M.vec')

# create an embedding matrix for the words in our vocabulary
embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in fasttext:
        embedding_matrix[i] = fasttext[word]

# define the model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=True))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.fit(input_data, output_data, epochs=40)

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, None, 300)         681000    
                                                                 
 lstm_9 (LSTM)               (None, None, 256)         570368    
                                                                 
 dropout_1 (Dropout)         (None, None, 256)         0         
                                                                 
 lstm_10 (LSTM)              (None, 128)               197120    
                                                                 
 dense_7 (Dense)             (None, 2270)              292830    
                                                                 
Total params: 1,741,318
Trainable params: 1,741,318
Non-trainable params: 0
_________________________________________________________________


In [78]:
history=model.fit(input_data, output_data, initial_epoch=100, epochs=200)

Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 

In [79]:
# Load the test data
test_data = df['titles'][0:500]
test_sequences = tokenizer.texts_to_sequences(test_data)
test_input = pad_sequences(test_sequences, maxlen=max_sequence_length)

test_output = model.predict(test_input)

# Evaluate the performance of the model on the test data
test_loss, test_acc = model.evaluate(test_input, test_output)
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')


Test loss: 0.5311, Test accuracy: 1.0000


In [81]:
from google.colab import files

model.save('lstm_model_4.h5')
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [84]:
import json
from tensorflow.keras.models import model_from_json
model_json = model.to_json()
with open("lstm_model_4.json", "w") as json_file:
    json_file.write(model_json)

# Download JSON file
files.download("lstm_model_4.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [83]:
def generate_text(model, tokenizer, input_text, max_length=20):
    # initialize the generated output text with the input text
    generated_text = input_text
    # set the stop condition to False
    stop_condition = False
    while not stop_condition:
        # tokenize the input text
        input_sequence = tokenizer.texts_to_sequences([generated_text])[0]
        # pad the input sequence
        input_sequence = pad_sequences([input_sequence], maxlen=max_length-1, padding='pre')
        # make a prediction
        prediction = model.predict(input_sequence)[0]
        # get the index of the predicted word
        predicted_index = np.argmax(prediction)
        # get the predicted word
        predicted_word = tokenizer.index_word.get(predicted_index, '')
        # check if we've generated the maximum length or found the end token
        if len(generated_text.split()) == max_length or predicted_word == '':
            stop_condition = True
        else:
            # append the predicted word to the generated text
            generated_text += ' ' + predicted_word
    return generated_text.strip()


In [75]:
input_text = 'rethinking the skip connections'
generated_text = generate_text(model, tokenizer, input_text)
print(generated_text)

# visualize the predictions made by the model
input_sequence = tokenizer.texts_to_sequences([input_text])[0]
input_sequence = pad_sequences([input_sequence])
prediction = model.predict(input_sequence)[0]
predicted_indices = np.argsort(prediction)[::-1][:5]
predicted_words = [tokenizer.index_word[i] for i in predicted_indices]
predicted_probabilities = prediction[predicted_indices]
for word, probability in zip(predicted_words, predicted_probabilities):
    print(f'{word}: {probability:.2f}')

rethinking the skip connections in unet from a channelwise perspective with transformer of active contours for image segmentation based on
in: 0.98
with: 0.01
towards: 0.00
for: 0.00
and: 0.00


In [85]:
import json
from google.colab import files


tokenizer_json = tokenizer.to_json()
with open('lstm_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

files.download('lstm_tokenizer.json')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>