# Step 1: Load the Model

In [1]:
from tensorflow.keras.models import load_model

# Load the trained model from the saved Keras archive.
# This notebook only calls model.predict, so the compile state (optimizer,
# loss, metrics) is not needed; compile=False skips restoring it. Restoring
# the optimizer is the likely source of the variable-loading warning that
# was printed when the archive was opened with the default settings.
model = load_model('model_weights.keras', compile=False)

  trackable.load_own_variables(weights_store.get(inner_path))


# Step 2: Input the Data

In [2]:
# Import some necessary libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
import pandas as pd

# Read the NLI training split from CSV into a DataFrame.
training_data = 'training_data/NLI/train.csv'
df_training = pd.read_csv(filepath_or_buffer=training_data)

# Bare trailing expression: the notebook renders the frame as the cell output.
df_training

Unnamed: 0,premise,hypothesis,label
0,"However, Fort Charles was rebuilt as a militar...",Fort Charles was rebuilt as an amusement park ...,0
1,Buchanan's The Democrats and Republicans have...,THe parties will never be similar.,0
2,In order to review an acquisition that is usin...,The auditor only reviews the acquisition itsel...,0
3,Three young people sit outside and engage with...,There is a tin can and string telephone.,0
4,The lucrative tin mines of Kuala Lumpur in the...,The Chinese labor was seen as less costly and ...,1
...,...,...,...
26939,Information in agencies' plans and reports pro...,"Thanks to agencies' plans and reports, over $3...",0
26940,"He is the Mr. Magoo of scientific theory, geni...",He understands everything he can't see.,0
26941,"Over the past 25 years, the Postal Service has...",Classifying mail is important to the function ...,1
26942,Whoever first stepped ashore on Madeira discov...,The British discovered the Canary Islands first.,0


In [4]:
# Read the NLI development (validation) split from CSV into a DataFrame.
validation_data = 'training_data/NLI/dev.csv'
df_validation = pd.read_csv(filepath_or_buffer=validation_data)

In [5]:
# Force every validation hypothesis to a Python string before it is tokenized
# (presumably at least one row is parsed as a non-string value, e.g. NaN for a
# blank field -- TODO confirm against dev.csv).
hypothesis_as_text = df_validation['hypothesis'].astype(str)
df_validation['hypothesis'] = hypothesis_as_text

In [6]:
# Fit the tokenizer on the combined text, then encode and pad the training pairs.
#
# NOTE(review): the fitting corpus ends with df_validation['premise'] twice, so the
# validation hypotheses never reach fit_on_texts. This looks like a slip for
# df_validation['hypothesis'], but the fitted word index has to match whatever the
# loaded model was trained with, so the corpus is reproduced unchanged here.
# Confirm against the training notebook before altering it.
training_texts = [
    *df_training['premise'],
    *df_training['hypothesis'],
    *df_validation['premise'],
    *df_validation['premise'],
]

# Vocabulary is capped by frequency at num_words=20000; rarer words are ignored
# when texts are converted to sequences.
training_tokenizer = Tokenizer(num_words=20000)
training_tokenizer.fit_on_texts(training_texts)

# Encode each training premise and hypothesis as a list of word indices.
training_premise_sequences = training_tokenizer.texts_to_sequences(df_training['premise'])
training_hypothesis_sequences = training_tokenizer.texts_to_sequences(df_training['hypothesis'])

# Token counts per example, reused for the length cap and the averages below.
training_premise_lengths = [len(seq) for seq in training_premise_sequences]
training_hypothesis_lengths = [len(seq) for seq in training_hypothesis_sequences]

# Longest encoded sequence, capped at 500. Informational only: the padding
# below uses a fixed length of 40 rather than this value.
training_max_len = min(
    max(max(training_premise_lengths), max(training_hypothesis_lengths)),
    500,
)
print(np.mean(training_premise_lengths))
print(np.mean(training_hypothesis_lengths))

# Pad or truncate every sequence at the end ('post') to exactly 40 tokens.
training_premise_padded, training_hypothesis_padded = (
    pad_sequences(sequences, maxlen=40, padding='post', truncating='post')
    for sequences in (training_premise_sequences, training_hypothesis_sequences)
)

# Sanity check: padded matrix, number of rows, and row width.
training_row_count = len(training_premise_padded)
training_row_width = len(training_premise_padded[0])
print(training_premise_padded)
print(training_row_count)
print(training_row_width)

18.864348277909738
10.269521971496436
[[  328  2104  1601 ...     0     0     0]
 [10608     1  1782 ...     0     0     0]
 [    6   377     5 ...     0     0     0]
 ...
 [   83     1   434 ...     0     0     0]
 [ 8148   108  3633 ...     0     0     0]
 [   21     8  2768 ...     0     0     0]]
26944
40


In [7]:
# Encode the validation pairs with the tokenizer that was fitted in the training
# cell. No second tokenizer is fitted here, so validation word indices come from
# the same vocabulary as the training inputs.
validation_premise_sequences = training_tokenizer.texts_to_sequences(df_validation['premise'])
validation_hypothesis_sequences = training_tokenizer.texts_to_sequences(df_validation['hypothesis'])

# Token counts per example, reused for the length cap and the averages below.
validation_premise_lengths = [len(seq) for seq in validation_premise_sequences]
validation_hypothesis_lengths = [len(seq) for seq in validation_hypothesis_sequences]

# Longest encoded sequence, capped at 500. Informational only: the padding
# below uses a fixed length of 40 rather than this value.
validation_max_len = min(
    max(max(validation_premise_lengths), max(validation_hypothesis_lengths)),
    500,
)
print(np.mean(validation_premise_lengths))
print(np.mean(validation_hypothesis_lengths))

# Pad or truncate every sequence at the end ('post') to exactly 40 tokens,
# the same length used for the training inputs.
validation_premise_padded, validation_hypothesis_padded = (
    pad_sequences(sequences, maxlen=40, padding='post', truncating='post')
    for sequences in (validation_premise_sequences, validation_hypothesis_sequences)
)

# Sanity check: padded matrix, number of rows, and row width.
validation_row_count = len(validation_premise_padded)
validation_row_width = len(validation_premise_padded[0])
print(validation_premise_padded)
print(validation_row_count)
print(validation_row_width)

18.724506456879915
10.176488051061304
[[ 6445 16141     0 ...     0     0     0]
 [   24    91  2072 ...     0     0     0]
 [  874    38   474 ...     0     0     0]
 ...
 [ 1566   472     4 ...     0     0     0]
 [   21     5  4408 ...     0     0     0]
 [   71  1298     5 ...     0     0     0]]
6737
40


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [9]:
# Extract the label column of each split as an integer NumPy array.
# The explicit int cast covers label columns that were not parsed as integers.
training_labels = df_training['label'].astype(int).to_numpy()
validation_labels = df_validation['label'].astype(int).to_numpy()

# Show each label array followed by its length (training first, then validation).
for label_array in (training_labels, validation_labels):
    print(label_array)
    print(len(label_array))

[0 0 0 ... 1 0 0]
26944
[0 0 1 ... 1 0 1]
6737


# Step 3: Add Test Data

In [10]:
# Read the NLI test split from CSV into a DataFrame.
test_data = 'test_data/NLI/test.csv'
df_test = pd.read_csv(filepath_or_buffer=test_data)

In [11]:
# Encode the test pairs with the tokenizer fitted in the training cell so the
# word indices come from the same vocabulary as the training inputs.
#
# Both text columns are cast to str first (df_test itself is not modified):
# a blank CSV field is read as a float NaN, which is not valid tokenizer input.
# The validation hypotheses receive the same cast earlier in the notebook;
# for columns that already hold only strings the cast changes nothing.
test_premise_texts = df_test['premise'].astype(str)
test_hypothesis_texts = df_test['hypothesis'].astype(str)

test_premise_sequences = training_tokenizer.texts_to_sequences(test_premise_texts)
test_hypothesis_sequences = training_tokenizer.texts_to_sequences(test_hypothesis_texts)

# Token counts per example, reused for the length cap and the averages below.
test_premise_lengths = [len(seq) for seq in test_premise_sequences]
test_hypothesis_lengths = [len(seq) for seq in test_hypothesis_sequences]

# Longest encoded sequence, capped at 500. Informational only: the padding
# below uses a fixed length of 40 rather than this value.
test_max_len = min(max(max(test_premise_lengths), max(test_hypothesis_lengths)), 500)
print(np.mean(test_premise_lengths))
print(np.mean(test_hypothesis_lengths))

# Pad or truncate every sequence at the end ('post') to exactly 40 tokens,
# the same length used for the training and validation inputs.
test_premise_padded = pad_sequences(test_premise_sequences, maxlen=40, padding='post', truncating='post')
test_hypothesis_padded = pad_sequences(test_hypothesis_sequences, maxlen=40, padding='post', truncating='post')

# Sanity check: padded matrix, number of rows, and row width.
print(test_premise_padded)
print(len(test_premise_padded))
print(len(test_premise_padded[0]))

13.79588128407026
6.902786190187765
[[ 291  236  253 ...    0    0    0]
 [   2 2024  138 ...    0    0    0]
 [ 139   47    6 ...    0    0    0]
 ...
 [  13   48  106 ...    0    0    0]
 [   2  799    4 ...    0    0    0]
 [   2   64  236 ...    0    0    0]]
3302
40


# Step 4: Generate the Prediction File

In [13]:
# Score the padded test pairs with the loaded model and save the predicted classes.
#
# NOTE(review): the inputs are passed as [hypothesis, premise]. This order has to
# match the input order the model was built with -- confirm against the training
# notebook.
predictions = model.predict([test_hypothesis_padded, test_premise_padded])

# Raw model output: one row of per-class scores for each test example.
print(predictions)

# Predicted class = index of the highest score in each row.
predicted_labels = np.argmax(predictions, axis=1)
data_with_predictions = pd.DataFrame({'Predictions': predicted_labels})

# Write a single 'Predictions' column to CSV, without the DataFrame index.
data_with_predictions.to_csv('Group_47_B.csv', index=False)

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[[0.37312186 0.6268781 ]
 [0.50631595 0.49368402]
 [0.24171114 0.75828886]
 ...
 [0.39328644 0.6067135 ]
 [0.5653148  0.43468523]
 [0.5212724  0.47872755]]
