<a href="https://colab.research.google.com/github/LastPudding/Text_Mining_LSTM/blob/main/TextMiningCoursework2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Concatenate, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
import tensorflow as tf


  # Load the spaCy "en_core_web_sm" English pipeline; the cells below use it only to split text into tokens
nlp = spacy.load("en_core_web_sm")

In [None]:
!ls
#Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


dev.pointer  dev.tup  relations.txt  test.pointer  test.tup  train.pointer  train.tup
dev.sent     dev.txt  sample_data    test.sent	   test.txt  train.sent     train.txt
Num GPUs Available:  1


In [None]:

  #Function to extract Annotations of Sentences
def extract_triplets(file_path):
    """Parse an annotation file into one list of triplets per line.

    Every line may hold several triplets separated by ``|``; the three fields
    of a triplet are separated by ``;`` and are returned, whitespace-stripped,
    as ``(subject, related_entity, relation)`` tuples. Chunks that do not have
    exactly three fields are ignored.

    The result always contains exactly one entry per line of the file: a blank
    line (or a line without any valid triplet) yields an empty list. This keeps
    the annotations index-aligned with `read_document`, which also returns one
    entry per line, so sentence ``i`` and annotation ``i`` cannot drift apart.

    Args:
        file_path: Path of the annotation file to read.

    Returns:
        A list (one item per line) of lists of 3-tuples of strings.
    """
    triplets = []
    with open(file_path, 'r') as file:
        for line in file:
            triplet_list = []
            # An empty line splits into [''], which fails the 3-field check
            # below and therefore produces an empty triplet list.
            for triplet in line.strip().split('|'):
                triplet_parts = [part.strip() for part in triplet.split(';')]
                if len(triplet_parts) == 3:
                    triplet_list.append(tuple(triplet_parts))
            triplets.append(triplet_list)
    return triplets

def read_document(file_path):
    """Return every line of ``file_path`` with surrounding whitespace removed.

    Blank lines are kept as empty strings, so the result has one entry per
    line of the file.
    """
    stripped_lines = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

  # Function to tokenize sentences
def tokenize_sentences(sentences):
    """Tokenize each sentence with the module-level spaCy pipeline ``nlp``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with, for every input sentence, the list of its token texts.
    """
    return [[token.text for token in nlp(sentence)] for sentence in sentences]

In [None]:

  # Extracting sentences and Triplets
# Triplet annotations for the train and test splits
train_annotations = extract_triplets('train.tup')
test_annotations = extract_triplets('test.tup')
# Raw sentences (one string per line) for the same splits
train_sentences = read_document('train.sent')
test_sentences = read_document('test.sent')

# Read the relation names from relations.txt and keep the spaCy tokens of each one
relations = read_document('relations.txt')
unique_relations = {}
for relation in relations:
    unique_relations[relation] = [token.text for token in nlp(relation)]

num_unique_relations = len(unique_relations)

# Map every relation name (spaces replaced by underscores) to its position in unique_relations
relation_to_index = {
    name.replace(" ", "_"): position
    for position, name in enumerate(unique_relations)
}
print(relation_to_index)



{'product/material_produced': 0, 'manufacturer': 1, 'distributed_by': 2, 'industry': 3, 'position_held': 4, 'original_broadcaster': 5, 'owned_by': 6, 'founded_by': 7, 'distribution_format': 8, 'headquarters_location': 9, 'stock_exchange': 10, 'currency': 11, 'parent_organization': 12, 'chief_executive_officer': 13, 'director/manager': 14, 'owner_of': 15, 'operator': 16, 'member_of': 17, 'employer': 18, 'chairperson': 19, 'platform': 20, 'subsidiary': 21, 'legal_form': 22, 'publisher': 23, 'developer': 24, 'brand': 25, 'business_division': 26, 'location_of_formation': 27, 'creator': 28}


In [None]:
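# Prepare input data (earlier draft, left commented out)
# NOTE(review): `encode_relations`, `train_triplets` and `df` are not defined in the
# cells shown above -- define them before re-enabling these lines.
# X_sentences = tokenize_sentences(train_sentences)
# X_triplets = np.array(encode_relations(train_triplets, relation_to_index))

# labels = to_categorical(df['labels'], num_classes=len(df.labels.unique()))
# Define maximum sequence length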


In [None]:
#GPT

def count_tokens(sentences, min_count=None):
    """Count how often each token occurs across all sentences.

    Args:
        sentences: Iterable of sentences, each itself an iterable of tokens.
            Note that a plain string is iterated character by character, so
            sentences must be tokenized before they are passed in.
        min_count: Optional minimum number of occurrences. When given, only
            tokens that occur at least ``min_count`` times are kept.

    Returns:
        A dict mapping token -> count, ordered by descending count (tokens
        with equal counts keep their first-seen order).
    """
    token_counts = {}
    for sentence in sentences:
        for token in sentence:
            token_counts[token] = token_counts.get(token, 0) + 1

    if min_count is not None:
        # ">=" so that a token seen exactly `min_count` times meets the minimum
        token_counts = {
            token: count
            for token, count in token_counts.items()
            if count >= min_count
        }

    # sorted() is stable, so ties stay in insertion (first-seen) order
    return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))

  # Build vocabulary
# `train_sentences` holds raw strings, so each one is split into whitespace
# tokens first (the same tokenization `convert_to_sequences` applies). Passing
# the strings directly would make `count_tokens` count single characters.
vocabulary = count_tokens([sentence.split() for sentence in train_sentences])

# Create word_to_index mapping; indices start at 1 because index 0 is used for
# out-of-vocabulary words (and is the default padding value of pad_sequences)
word_to_index = {word: index for index, word in enumerate(vocabulary, start=1)}



In [None]:

def convert_to_sequences(sentences, annotations, word_to_index, annotation_to_index):
    """Turn sentences and their triplet annotations into model inputs and labels.

    Args:
        sentences: Sequence of sentence strings; each is split on whitespace.
        annotations: Sequence aligned with ``sentences``; item ``i`` is the
            list of ``(subject, related_entity, relation)`` triplets of
            sentence ``i``.
        word_to_index: Dict mapping a word to its integer index. Words missing
            from the dict are encoded as 0.
        annotation_to_index: Dict mapping a relation name to its class index.

    Returns:
        ``(X, y)`` where ``X`` is a list of lists of word indices (one per
        sentence, unpadded) and ``y`` is a list of multi-hot numpy vectors of
        length ``len(annotation_to_index)`` with a 1 for every relation that
        occurs in the sentence's triplets.

    Raises:
        IndexError: If ``annotations`` is shorter than ``sentences``.
    """
    X = []
    y = []
    for i, sentence in enumerate(sentences):
        annotation = annotations[i]
        # Out-of-vocabulary words map to index 0
        X.append([word_to_index.get(word, 0) for word in sentence.split()])

        label = np.zeros(len(annotation_to_index))
        for anno in annotation:
            # The relation is the THIRD field of a triplet; index 1 is the
            # related entity, which never matches a relation name.
            relation = anno[2]
            if relation in annotation_to_index:
                label[annotation_to_index[relation]] = 1
        y.append(label)
    return X, y


#with tf.device('/GPU:0'):
  # Convert training data to sequences
# Hyperparameters
EMBEDDING_DIM = 100  # Dimension of word embeddings
NUM_CLASSES = len(relation_to_index)  # Number of classes for relation extraction
BATCH_SIZE = 32
EPOCHS = 10

# Encode both splits as word-index sequences with multi-hot relation labels.
# np.argmax then reduces every multi-hot row to a single integer class id: the
# lowest-index active relation, or 0 when no relation was marked at all.
X_train, y_train = convert_to_sequences(train_sentences, train_annotations, word_to_index, relation_to_index)
y_train = np.array(y_train).argmax(axis=1)

X_test, y_test = convert_to_sequences(test_sentences, test_annotations, word_to_index, relation_to_index)
y_test = np.array(y_test).argmax(axis=1)

# Longest training sequence; both splits are padded/truncated to this length
MAX_SEQ_LENGTH = max(map(len, X_train))

# Padding sequences to ensure uniform length
X_train = pad_sequences(X_train, maxlen=MAX_SEQ_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_SEQ_LENGTH)



  #Define LSTM model
# Embedding -> bidirectional LSTM -> softmax over the relation classes
model = Sequential()
model.add(Embedding(len(word_to_index) + 1, output_dim=EMBEDDING_DIM, input_shape=(MAX_SEQ_LENGTH,)))
model.add(Bidirectional(LSTM(128, dropout=0.7, recurrent_dropout=0.7)))
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Compile model. The labels are integer class ids (np.argmax output), not
# one-hot rows, so the sparse variant of the cross-entropy is required;
# 'categorical_crossentropy' fails with a shape mismatch between the
# (None, 1) targets and the (None, NUM_CLASSES) predictions.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])





In [None]:
# Train the model; the test split is passed as validation data for per-epoch metrics
model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)
# Feb 25 13:48: the model is not actually training; this seems related to how the labels (y_test) are pre-processed
# NOTE(review): y_train / y_test are integer class ids (np.argmax output), so the compiled
# loss has to accept sparse integer targets -- confirm against the model.compile call.

Epoch 1/10


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/backend.py", line 5573, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 29) are incompatible


In [None]:
# Predict class probabilities for the test split
y_test_predicted = model.predict(X_test)
y_test_predicted = tf.convert_to_tensor(y_test_predicted)
# y_test holds integer class ids (np.argmax output), so the sparse variant of
# the cross-entropy is used; the dense variant expects one-hot rows and raises
# a shape-mismatch ValueError. The result is one loss value per test sample.
loss = tf.keras.losses.sparse_categorical_crossentropy(y_test, y_test_predicted)



  return dispatch_target(*args, **kwargs)


ValueError: Shapes (1068,) and (1068, 1) are incompatible