In [23]:
import json
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer

def _load_json_values(path):
    """Return the top-level values of the JSON object stored at ``path`` as a list."""
    # The context manager closes the file even when json.load raises,
    # which the previous open()/close() pairs did not guarantee.
    with open(path, 'r', encoding='utf-8') as fh:
        return list(json.load(fh).values())


# Each split file holds one JSON object; only its values (the examples) are kept.
train_data = _load_json_values(r'ATE_train.json')
test_data = _load_json_values(r'ATE_test.json')
val_data = _load_json_values(r'ATE_val.json')

# Raw sentence of every example, in train -> test -> val order.
texts = [example['text'] for example in train_data + test_data + val_data]

# NOTE: this rebinds the imported `Tokenizer` class name to an instance.
# The vocabulary is fitted on all three splits (test and val included),
# with the original casing kept (lower=False).
Tokenizer = Tokenizer(lower=False)
Tokenizer.fit_on_texts(texts)

# Integer-id sequence for every sentence collected above.
sequences = Tokenizer.texts_to_sequences(texts)

# Collect every distinct tag used in any split, plus the longest sentence
# length measured in space-separated tokens.
label_set = set()
max_sentence_length = 0

for example in train_data + test_data + val_data:
    example_labels = example['labels']
    # Measured once per example instead of once per tag. Examples without any
    # tags are still skipped, exactly as before, so the resulting maximum is
    # unchanged.
    if example_labels:
        max_sentence_length = max(max_sentence_length, len(example['text'].split(' ')))
    label_set.update(example_labels)

# Fallback class for tags outside the fitted set. It has to be spelled
# 'UNKNOWN' because that is the literal substituted for unseen tags before
# encoding; the earlier 'UNKNNOWN' spelling made that fallback an unseen label.
label_set.add('UNKNOWN')
labels = sorted(label_set)

from sklearn.preprocessing import LabelEncoder

# Map each tag string to an integer class id; fit() returns the encoder itself.
label_encoder = LabelEncoder().fit(labels)

# The result is discarded (this is not the cell's last expression); the call
# only confirms that every collected tag can be encoded.
label_encoder.transform(labels)

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def preprocess_data(data, Tokenizer, LabelEncoder, maxlen=None):
    """Convert raw examples into padded token ids and one-hot tag targets.

    Args:
        data: sequence of dicts, each with a 'text' string and a 'labels'
            list of tag strings.
        Tokenizer: fitted tokenizer instance exposing ``texts_to_sequences``
            (the parameter name shadows the Keras class of the same name).
        LabelEncoder: fitted encoder instance exposing ``classes_`` and
            ``transform`` (likewise shadows the scikit-learn class).
        maxlen: width every sequence is padded/truncated to. When omitted,
            the module-level ``max_sentence_length`` is used, which is the
            previous behaviour.

    Returns:
        ``(x, y)`` where ``x`` is the padded token-id array returned by
        ``pad_sequences`` and ``y`` is the one-hot array returned by
        ``to_categorical`` with ``len(LabelEncoder.classes_)`` classes.
    """
    if maxlen is None:
        # Read at call time so the notebook-level value is honoured.
        maxlen = max_sentence_length

    texts = [item['text'] for item in data]

    x = Tokenizer.texts_to_sequences(texts)
    x = pad_sequences(x, maxlen=maxlen, padding='post')

    # Tags the encoder has never seen are replaced by the 'UNKNOWN' fallback class.
    y = [[label if label in LabelEncoder.classes_ else 'UNKNOWN' for label in item['labels']] for item in data]
    y = [LabelEncoder.transform(sublist).tolist() for sublist in y]
    # NOTE(review): padding uses pad_sequences' default fill value 0, which is
    # also a valid encoded class id, so padded positions are one-hot encoded as
    # that class. Confirm this is what the saved models expect.
    y = pad_sequences(y, maxlen=maxlen, padding='post')
    y = to_categorical(y, num_classes=len(LabelEncoder.classes_))

    return x, y

# Preprocess the three splits in train -> val -> test order with the shared
# tokenizer instance and label encoder.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = [
    preprocess_data(split, Tokenizer, label_encoder)
    for split in (train_data, val_data, test_data)
]


In [24]:
# Display name "<architecture> , <embedding>" -> saved model file name.
# Architectures are numbered 1..4 in the file names; the embedding name is
# lower-cased there.
task2_models_dict = {
    f"{arch} , {embedding}": f"t2_model{number}_{embedding.lower()}.pkl"
    for number, arch in enumerate(("RNN", "LSTM", "GRU", "BiLSTM"), start=1)
    for embedding in ("Word2Vec", "GloVe", "FastText")
}

In [25]:
from tensorflow.keras.models import load_model
from sklearn.metrics import f1_score

# One summary line per saved model: test-set metric reported by evaluate()
# plus macro F1 computed over every position of every (padded) sequence.
results = []

# Ground-truth class ids do not depend on the model, so decode them once.
y_true_test = y_test.argmax(axis=-1)

print("Loading models")
for model_type, model_name in task2_models_dict.items():
    model = load_model(model_name)

    # The former special case for "model5 , ..." entries was removed: no key
    # of task2_models_dict matches it, so that branch could never execute.

    # evaluate() returns [loss, metric_1, ...]; index 1 is reported as the
    # accuracy -- assumes accuracy is the first compiled metric, TODO confirm.
    test_metrics = model.evaluate(x_test, y_test, verbose=0)

    # Per-position predicted class id = argmax over the class axis.
    y_pred_test = model.predict(x_test).argmax(axis=-1)
    f1_test = f1_score(y_true_test.flatten(), y_pred_test.flatten(), average='macro')
    results.append(f"{model_type} - Test accuracy: {test_metrics[1]} - Macro F1: {f1_test}")





Loading models


In [26]:
# Header first, then one line per collected result string.
report_lines = ["RESULTS FOR TASK-2", *map(str, results)]
print("\n".join(report_lines))


RESULTS FOR TASK-2
RNN , Word2Vec - Test accuracy: 0.9655818343162537 - Macro F1: 0.6850093399061189
RNN , GloVe - Test accuracy: 0.9670143723487854 - Macro F1: 0.7329829577304127
RNN , FastText - Test accuracy: 0.9655818343162537 - Macro F1: 0.6549143332712156
LSTM , Word2Vec - Test accuracy: 0.9648104906082153 - Macro F1: 0.643698920050038
LSTM , GloVe - Test accuracy: 0.9673817157745361 - Macro F1: 0.7018021313370433
LSTM , FastText - Test accuracy: 0.9643696546554565 - Macro F1: 0.6305542724022618
GRU , Word2Vec - Test accuracy: 0.9655084013938904 - Macro F1: 0.6738732150781567
GRU , GloVe - Test accuracy: 0.9683000445365906 - Macro F1: 0.7031082598559598
GRU , FastText - Test accuracy: 0.9644798636436462 - Macro F1: 0.6539074956500986
BiLSTM , Word2Vec - Test accuracy: 0.9673817157745361 - Macro F1: 0.7124432257976269
BiLSTM , GloVe - Test accuracy: 0.9691815972328186 - Macro F1: 0.7317567211659775
BiLSTM , FastText - Test accuracy: 0.9681898355484009 - Macro F1: 0.698630955810230

ASPECT TERM EXTRACTION

In [27]:
def _aspect_terms(tokens, tags):
    """Return the aspect terms spelled out by a B/I/O tag sequence.

    A 'B' tag starts a new term, an 'I' tag extends the term currently being
    built (and is ignored when there is none), and an 'O' tag ends it. Any
    other tag value is ignored and leaves an open term untouched. Words of a
    term are joined with single spaces.

    Raises:
        IndexError: if a 'B' tag (or an 'I' tag extending a term) sits at a
            position for which ``tokens`` has no word.
    """
    terms = []
    current = None  # words of the term being built; None when no term is open

    for position, tag in enumerate(tags):
        if tag == 'B':
            # A new term begins: flush the one in progress first.
            if current:
                terms.append(' '.join(current))
            current = [tokens[position]]
        elif tag == 'I' and current:
            current.append(tokens[position])
        elif tag == 'O':
            if current:
                terms.append(' '.join(current))
            current = None

    # A term that runs to the end of the sentence is still open here.
    if current:
        terms.append(' '.join(current))

    return terms


# Flat list (despite the name) of every gold aspect term in the test split, in
# document order. Doing the work inside the helper keeps scratch names out of
# the notebook namespace; the old loop rebound the module-level `labels`.
entities_dict = [
    term
    for item in test_data
    for term in _aspect_terms(item['text'].split(), item['labels'])
]

entities_dict


['Boot time',
 'tech support',
 'Set up',
 'Windows 8',
 'touchscreen functions',
 'internal speakers',
 'use',
 'Works',
 'apple OS',
 'features',
 'log on',
 'WiFi connection',
 'battery life',
 'delete key',
 'interneting',
 'priced',
 'track pad',
 'graphics',
 'mountain lion',
 'build',
 'durability',
 'battery life',
 'works',
 'Windows 8',
 'baterry',
 'size',
 'weight',
 'performance',
 'speed',
 'screen',
 'price',
 'Hardware performance',
 'works',
 'set up',
 'Keyboard',
 'Windows 8',
 'setup',
 'configure',
 'Windows 8',
 'usb ports',
 'features',
 'screen',
 'keyboard',
 'Performance',
 'quality',
 'performance',
 'OS',
 'portability',
 'portable computing',
 'MS Office 2011 for Mac',
 'performance',
 'look',
 'performance',
 'lit up keys',
 'screen display',
 'Mountain Lion OS',
 'Microsoft Windows',
 'OSX',
 'Microsoft Office',
 'graphics',
 'colors',
 'Built-in apps',
 'operating system',
 'size',
 'SquareTrade 3-Year Computer Accidental Protection Warranty',
 'AppleCar