In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import plotly.express as px

project_folder = '/content/drive/MyDrive/ProjectTCNER'
data_folder = os.path.join(project_folder, "data")
trainset_file = os.path.join(data_folder, "DBLPTrainset.txt")
testset_file = os.path.join(data_folder, "DBLPTestset.txt")
ground_truth_file = os.path.join(data_folder, "DBLPTestGroundTruth.txt")

In [3]:
dataset_train = pd.read_table(os.path.join(trainset_file), sep="\t", header=None, names=['y', 'X'], index_col=0)
dataset_train

Unnamed: 0,y,X
0,ISCAS,Scalable Serial-parallel Multiplier over GF(2m...
1,SIGGRAPH,Plenoptic sampling.
2,ISCAS,Sensitivity and uniformity of a 0.18micrometer...
3,WWW,A survey of web archive search architectures.
4,ISCAS,Understanding dynamic behavior of mm-wave CML ...
...,...,...
21638,ISCAS,Decoding a Family of Dense Codes using the Sum...
21639,VLDB,CoHadoop: Flexible Data Placement and Its Expl...
21640,ISCAS,Full system simulation with QEMU: An approach ...
21641,INFOCOM,Localization in non-localizable sensor and ad-...


In [4]:
dataset_test = pd.read_table(os.path.join(testset_file), sep="\t", header=None, names=['X'], index_col=0)
dataset_test

Unnamed: 0,X
0,Fast recursive adaptation for nonlinear filters.
1,High-Throughput Data Compressor Designs Using ...
2,Functional Verification of ECL Circuits Includ...
3,Efficient network generation under general pre...
4,Creating the earth as a backdrop in <i>Gravity...
...,...
3368,GEM: A Geometric Algorithm for Scheduling.
3369,On the geographic patterns of a large-scale mo...
3370,NScale: Neighborhood-centric Analytics on Larg...
3371,Sufficient Conditions for Finding Multiple Ope...


In [5]:
dataset_gt = pd.read_table(os.path.join(ground_truth_file), sep="\t", header=None, names=['y'], index_col=0)
dataset_gt

Unnamed: 0,y
0,ISCAS
1,ISCAS
2,ISCAS
3,WWW
4,SIGGRAPH
...,...
3368,ISCAS
3369,INFOCOM
3370,VLDB
3371,ISCAS


In [6]:
dataset_test = pd.merge(dataset_test, dataset_gt, left_index=True, right_index=True)
dataset_test

Unnamed: 0,X,y
0,Fast recursive adaptation for nonlinear filters.,ISCAS
1,High-Throughput Data Compressor Designs Using ...,ISCAS
2,Functional Verification of ECL Circuits Includ...,ISCAS
3,Efficient network generation under general pre...,WWW
4,Creating the earth as a backdrop in <i>Gravity...,SIGGRAPH
...,...,...
3368,GEM: A Geometric Algorithm for Scheduling.,ISCAS
3369,On the geographic patterns of a large-scale mo...,INFOCOM
3370,NScale: Neighborhood-centric Analytics on Larg...,VLDB
3371,Sufficient Conditions for Finding Multiple Ope...,ISCAS


In [8]:
out_folder = os.path.join(project_folder, "out")
os.makedirs(out_folder, exist_ok=True)
SEED = 46532

In [20]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Conv1D,MaxPooling1D
from keras.layers import LSTM,Dropout
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint

np.random.seed(7)
from prettytable import PrettyTable

import warnings
warnings.filterwarnings('ignore')

In [9]:
X_train = dataset_train['X']
y_train = dataset_train['y']

In [10]:
# Test set
y_test = pd.read_table(os.path.join(ground_truth_file), sep="\t", header=None, names=['y'],
                              index_col=0)['y']
X_test = pd.read_table(os.path.join(testset_file), sep="\t", header=None, names=['X'],
                       index_col=0)['X']

In [13]:
X_train,X_cv,y_train,y_cv = train_test_split(X_train,y_train,test_size = 0.2)
print("Shape of train data:", X_train.shape)
print("Shape of Test data:", X_test.shape)
print("Shape of CV data:", X_cv.shape)

Shape of train data: (17314,)
Shape of Test data: (3373,)
Shape of CV data: (4329,)


In [51]:
from keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=5)
y_cv = to_categorical(y_cv, num_classes=5)

In [54]:
print("Shape of training labels:", y_train.shape)
print("Shape of CV labels:", y_cv.shape)

Shape of training labels: (17314, 5)
Shape of CV labels: (4329, 5)


In [15]:
paper_lengths = [len(paper) for paper in X_train]

print("Mean length:", np.mean(paper_lengths))
print("Median length:", np.median(paper_lengths))
print("Maximum length:", np.max(paper_lengths))

Mean length: 67.78728196834932
Median length: 67.0
Maximum length: 214


In [16]:
paper_lengths = [len(paper) for paper in X_test]

print("Mean length:", np.mean(paper_lengths))
print("Median length:", np.median(paper_lengths))
print("Maximum length:", np.max(paper_lengths))

Mean length: 69.20693744441151
Median length: 68.0
Maximum length: 168


In [17]:
paper_lengths = [len(paper) for paper in X_cv]

print("Mean length:", np.mean(paper_lengths))
print("Median length:", np.median(paper_lengths))
print("Maximum length:", np.max(paper_lengths))

Mean length: 67.46431046431046
Median length: 67.0
Maximum length: 157


Word Embedding

In [21]:
!curl -o '/content/glove_embeddings.zip' 'https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  822M  100  822M    0     0  5300k      0  0:02:38  0:02:38 --:--:-- 5118k


In [22]:
!unzip '/content/glove_embeddings.zip'

Archive:  /content/glove_embeddings.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [23]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings_dict[word] = vector
    return embeddings_dict

In [24]:
def preprocess_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

In [25]:
embeddings_index = load_glove_embeddings('glove.6B.100d.txt')

In [27]:
X_train_preprocessed = [preprocess_text(doc) for doc in X_train]
X_test_preprocessed = [preprocess_text(doc) for doc in X_test]
X_cv_preprocessed = [preprocess_text(doc) for doc in X_cv]

In [28]:
def document_vector(doc, embeddings_index):
    words = doc.split()
    valid_words = [word for word in words if word in embeddings_index]
    if not valid_words:
        return np.zeros(100)  # Return a zero vector if no valid words are found
    word_vectors = [embeddings_index[word] for word in valid_words]
    return np.mean(word_vectors, axis=0)

X_train_vect = np.array([document_vector(doc, embeddings_index) for doc in X_train_preprocessed])
X_test_vect = np.array([document_vector(doc, embeddings_index) for doc in X_test_preprocessed])
X_cv_vect = np.array([document_vector(doc, embeddings_index) for doc in X_cv_preprocessed])

In [29]:
print("Shape of train data:", X_train_vect.shape)
print("Shape of Test data:", X_test_vect.shape)
print("Shape of CV data:", X_cv_vect.shape)

Shape of train data: (17314, 100)
Shape of Test data: (3373, 100)
Shape of CV data: (4329, 100)


In [46]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocabulary = tokenizer.word_index

vocab_size = len(vocabulary)+1
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 13247


LSTM

In [59]:
model = Sequential()

embedding_dim = 100
vocab_size = 13247
max_sequence_length = 100
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))

model.add(Dense(units=5, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Checking shape incompatibility
assert X_train_vect.shape[1] == max_sequence_length, "Error: Sequence length mismatch in X_train_vect"
assert X_cv_vect.shape[1] == max_sequence_length, "Error: Sequence length mismatch in X_cv_vect"
assert y_train.shape[1] == 5, "Error: Number of classes mismatch in y_train"
assert y_cv.shape[1] == 5, "Error: Number of classes mismatch in y_cv"

print(model.summary())

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 100, 100)          1324700   
                                                                 
 lstm_17 (LSTM)              (None, 100, 100)          80400     
                                                                 
 dropout_16 (Dropout)        (None, 100, 100)          0         
                                                                 
 lstm_18 (LSTM)              (None, 100)               80400     
                                                                 
 dropout_17 (Dropout)        (None, 100)               0         
                                                                 
 dense_9 (Dense)             (None, 5)                 505       
                                                                 
Total params: 1486005 (5.67 MB)
Trainable params: 1486

In [60]:
history = model.fit(X_train_vect, y_train, epochs=10, batch_size=32, validation_data=(X_cv_vect, y_cv))

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node sequential_9/embedding_9/embedding_lookup defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-60-20ef57db8c62>", line 1, in <cell line: 1>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 590, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/sequential.py", line 398, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 515, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/embedding.py", line 272, in call

indices[31,55] = -1 is not in [0, 13247)
	 [[{{node sequential_9/embedding_9/embedding_lookup}}]] [Op:__inference_train_function_45198]