<a href="https://colab.research.google.com/github/LastPudding/Text_Mining_LSTM/blob/main/TextMiningCoursework2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Concatenate, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
import tensorflow as tf


  # Load the spaCy English pipeline (en_core_web_sm). The cells shown here only
  # iterate over the tokens it produces, i.e. they use it as a tokenizer.
nlp = spacy.load("en_core_web_sm")

In [None]:
!ls
#Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

relations.txt  sample_data  test.sent  test.tup  train.sent  train.tup
Num GPUs Available:  1


In [None]:

#Function to extract Annotations of Sentences
def extract_relations(file_path):
  """Read relation names from a text file (one per line) as normalised keys.

  Every non-blank line is stripped of surrounding whitespace; then spaces
  become '_' and '/' becomes '_or_', e.g. 'director/manager' ->
  'director_or_manager'.

  Args:
    file_path: Path of the relations file.

  Returns:
    List of normalised relation names in file order. Blank lines are skipped
    so they cannot turn into an empty-string relation.
  """
  result = []
  with open(file_path, 'r') as file:
    for each_line in file:
      # strip() also drops a trailing '\r' or stray spaces, which would
      # otherwise be converted to '_' and end up inside the key.
      name = each_line.strip()
      if not name:
        continue
      result.append(name.replace(' ', '_').replace('/', '_or_'))
  return result


def extract_triplets(file_path):
  """Parse an annotation file into one list of relation triplets per line.

  Each non-blank line holds one or more triplets separated by '|'; a triplet
  is written as 'subject ; related_entity ; relation'.

  The relation name is normalised to words joined by single underscores,
  with '/' spelled out as 'or' ('director/manager', 'director_/_manager' and
  'director or manager' all become 'director_or_manager'), which is the form
  extract_relations() produces. Entity text is returned unchanged apart from
  stripping surrounding whitespace.

  Args:
    file_path: Path of the annotation file.

  Returns:
    A list with one entry per non-blank line; each entry is a list of
    (subject, related_entity, relation) tuples. Pieces that do not split
    into exactly three ';'-separated parts are dropped.
  """
  triplets = []
  with open(file_path, 'r') as file:
    for line in file:
      line = line.strip()
      if not line:  # blank lines contribute no entry
        continue
      triplet_list = []
      for triplet in line.split('|'):
        triplet_parts = [part.strip() for part in triplet.split(';')]
        if len(triplet_parts) != 3:
          continue  # malformed triplet
        subject, related_entity, relation = triplet_parts
        # Only the relation is normalised; rewriting '/' across the whole
        # line would also alter entity names that contain a slash.
        relation = '_'.join(relation.replace('/', ' or ').replace('_', ' ').split())
        triplet_list.append((subject, related_entity, relation))
      triplets.append(triplet_list)
  return triplets

def read_document(file_path):
      """Return every line of the file at `file_path`, whitespace-stripped.

      Blank lines are kept (as empty strings), so the result has exactly one
      entry per line of the file.
      """
      stripped_lines = []
      with open(file_path, 'r') as handle:
          for raw_line in handle:
              stripped_lines.append(raw_line.strip())
      return stripped_lines

  # Function to tokenize sentences
def tokenize_sentences(sentences):
      """Tokenize each sentence with the module-level `nlp` pipeline.

      Returns a list parallel to `sentences`; every entry is the list of
      token texts produced for that sentence.
      """
      return [[piece.text for piece in nlp(text)] for text in sentences]

['product_or_material_produced', 'manufacturer', 'distributed_by', 'industry', 'position_held', 'original_broadcaster', 'owned_by', 'founded_by', 'distribution_format', 'headquarters_location', 'stock_exchange', 'currency', 'parent_organization', 'chief_executive_officer', 'director_or_manager', 'owner_of', 'operator', 'member_of', 'employer', 'chairperson', 'platform', 'subsidiary', 'legal_form', 'publisher', 'developer', 'brand', 'business_division', 'location_of_formation', 'creator']


In [None]:

  # Extracting sentences and Triplets
train_annotations = extract_triplets('train.tup')
test_annotations = extract_triplets('test.tup')
train_sentences = read_document('train.sent')
test_sentences = read_document('test.sent')

# Relation inventory from relations.txt, each name paired with its spaCy tokens
relations = extract_relations('relations.txt')
unique_relations = {name: [piece.text for piece in nlp(name)] for name in relations}

num_unique_relations = len(unique_relations)

# Give every relation name a class index, in first-seen order
relation_to_index = {}
for index, relation in enumerate(unique_relations):
    normalised = relation.replace(" ", "_").replace('/', '_or_').replace('\n', '')
    relation_to_index[normalised] = index
print(relation_to_index)

{'product_or_material_produced': 0, 'manufacturer': 1, 'distributed_by': 2, 'industry': 3, 'position_held': 4, 'original_broadcaster': 5, 'owned_by': 6, 'founded_by': 7, 'distribution_format': 8, 'headquarters_location': 9, 'stock_exchange': 10, 'currency': 11, 'parent_organization': 12, 'chief_executive_officer': 13, 'director_or_manager': 14, 'owner_of': 15, 'operator': 16, 'member_of': 17, 'employer': 18, 'chairperson': 19, 'platform': 20, 'subsidiary': 21, 'legal_form': 22, 'publisher': 23, 'developer': 24, 'brand': 25, 'business_division': 26, 'location_of_formation': 27, 'creator': 28}


In [None]:
#Prepare input data
# X_sentences = tokenize_sentences(train_sentences)
# X_triplets = np.array(encode_relations(train_triplets, relation_to_index))

# labels = to_categorical(df['labels'], num_classes=len(df.labels.unique()))
#Define maximum sequence length


In [None]:
#GPT

def count_tokens(sentences, min_count=None):
      """Count how often each token occurs across `sentences`.

      Sentences are tokenized with the module-level `nlp` pipeline and each
      token is keyed by its `str()` form. A fixed set of punctuation tokens
      is not counted.

      Args:
          sentences: Iterable of sentence strings.
          min_count: If given, keep only tokens that occur at least this
              many times.

      Returns:
          dict mapping token text -> count, ordered from most to least
          frequent (ties keep first-seen order).
      """
      ignored = {'.', ',', '(', ')', '-', "%", "'", ':'}
      token_counts = {}

      for sentence in sentences:
          for token in nlp(sentence):
              text = str(token)
              if text in ignored:
                  continue
              token_counts[text] = token_counts.get(text, 0) + 1

      # A token seen exactly `min_count` times meets the minimum, hence >=.
      if min_count is not None:
          token_counts = {text: count for text, count in token_counts.items() if count >= min_count}

      # sorted() is stable, so equally frequent tokens stay in first-seen order.
      return dict(sorted(token_counts.items(), key=lambda item: item[1], reverse=True))

  # Build vocabulary
vocabulary = count_tokens(train_sentences)
print(vocabulary)

# Number the tokens from 1 in vocabulary order; index 0 is not assigned here.
word_to_index = dict(zip(vocabulary, range(1, len(vocabulary) + 1)))
print(word_to_index)



In [None]:

def convert_to_sequences(sentences, annotations, word_to_index, annotation_to_index, tokenizer=None):
      """Turn sentences into word-index sequences and annotations into multi-hot labels.

      Args:
          sentences: List of raw sentence strings.
          annotations: For each sentence (same order), an iterable of
              (subject, related_entity, relation) tuples; only the relation
              (element 2) is used.
          word_to_index: Mapping token text -> integer index.
          annotation_to_index: Mapping relation name -> label column.
          tokenizer: Optional callable mapping a sentence to an iterable of
              tokens (each is converted with str()). Defaults to the
              module-level `nlp` pipeline, the same tokenizer count_tokens()
              uses. Plain whitespace splitting leaves punctuation glued to
              words ("year."), so they would never match a vocabulary key.

      Returns:
          (X, y). X is a list with one list of word indices per sentence,
          using 0 for out-of-vocabulary tokens (0 is also the default pad
          value of pad_sequences). y is a float array of shape
          (len(sentences), len(annotation_to_index)) holding 1.0 in the column
          of every relation annotated for the sentence.

      Raises:
          KeyError: If an annotation names a relation that is missing from
              `annotation_to_index`.
          IndexError: If `annotations` is shorter than `sentences`.
      """
      if tokenizer is None:
          tokenizer = nlp
      skipped = {'.', ',', '(', ')', '-', "%", "'", ':'}

      X = []
      y = np.zeros((len(sentences), len(annotation_to_index)))
      for i, sentence in enumerate(sentences):
          sequence = []
          for token in tokenizer(sentence):
              word = str(token)
              if word in skipped:
                  continue
              sequence.append(word_to_index.get(word, 0))
          X.append(sequence)

          # Multi-hot label: a sentence may carry several relations.
          for each_tup in annotations[i]:
              relation = each_tup[2]
              if relation not in annotation_to_index:
                  raise KeyError(
                      f"relation {relation!r} (sentence {i}) is not in annotation_to_index"
                  )
              y[i, annotation_to_index[relation]] = 1.0
      return X, y


#with tf.device('/GPU:0'):
  # Convert training data to sequences
X_train, y_train = convert_to_sequences(
    train_sentences, train_annotations, word_to_index, relation_to_index
)

# Hyperparameters
EMBEDDING_DIM = 300                       # size of each word-embedding vector
NUM_CLASSES = num_unique_relations        # one output per relation
BATCH_SIZE = 128
EPOCHS = 10
MAX_SEQ_LENGTH = max(map(len, X_train))   # longest training sequence, in tokens

# Index-encode the test split with the same vocabulary and relation index
X_test, y_test = convert_to_sequences(
    test_sentences, test_annotations, word_to_index, relation_to_index
)

# Bring both splits to MAX_SEQ_LENGTH (pad_sequences pads and truncates at the
# front by default)
X_train, X_test = [
    pad_sequences(split, maxlen=MAX_SEQ_LENGTH) for split in (X_train, X_test)
]

In [None]:
#Define LSTM model
optimizer = tf.keras.optimizers.Adam()
model = Sequential()
# +1 because word indices start at 1; index 0 is used for padding / unknown words.
model.add(Embedding(len(word_to_index) + 1, output_dim=EMBEDDING_DIM, input_shape=(MAX_SEQ_LENGTH,)))
# NOTE(review): recurrent_dropout > 0 prevents Keras from using the fused cuDNN
# LSTM kernel, so training on a GPU is considerably slower; 0.7 is also a very
# aggressive dropout rate.
model.add(Bidirectional(LSTM(128, dropout=0.7, recurrent_dropout=0.7)))
model.add(Dense(64, activation='relu'))

# The targets are multi-hot (a sentence may express several relations), so each
# class gets an independent sigmoid. A softmax would force the outputs to sum
# to 1 and could never switch on more than one relation.
model.add(Dense(NUM_CLASSES, activation='sigmoid'))

# Compile model: binary cross-entropy scores every relation as its own yes/no decision.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model
# NOTE(review): the test split doubles as validation data here.
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_test, y_test))
# Feb 25, 13:48: the model is not actually training; this seems related to the preprocessing of y_test.
#         16:46: the predicate names in the .tup files do not match the names in relations.txt.
#   Possible fixes:
#     1. Adapt relations.txt to the .tup files (less work, done once and for all, but modifies the dataset).
#     2. Set up a preprocessing pipeline that converts between them (more work, depends on a 1:1 mapping
#        between relations.txt and the .tup files, keeps the dataset as it is).



Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Score the test split with binary cross-entropy, the loss the model is compiled
# with; y_test is multi-hot, so a categorical (single-label) loss does not apply.
# `loss` holds one value per test sentence.
y_test_predicted = model.predict(X_test)
y_test_predicted = tf.convert_to_tensor(y_test_predicted)
loss = tf.keras.losses.binary_crossentropy(y_test, y_test_predicted)

In [None]:
# Inspect the padded training matrix (one row of word indices per sentence).
X_train

array([[    0,     0,     0, ...,  4501,     4,     0],
       [    0,     0,     0, ..., 11901,     8,     0],
       [    0,     0,     0, ...,   503,    52,     0],
       ...,
       [    0,     0,     0, ...,    45,     3,  1560],
       [    0,     0,     0, ...,     4,   101,     0],
       [    0,     0,     0, ...,    62,     7,     0]], dtype=int32)