In [18]:
from os import environ
environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM, Dense, Lambda, Activation, Multiply, Concatenate, RepeatVector, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the dataset
df = pd.read_csv('Laptop_Train_v2.csv', encoding='latin1').dropna()
df = df[['text', 'term', 'polarity']]
df.drop(df[df['polarity'] == 'conflict'].index, inplace = True)
# Preprocessing: Tokenization, stopword removal, and lowercase conversion
stopwords = nltk.corpus.stopwords.words('english')
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def preprocess_text(text):
    tokens = tokenizer.tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(filtered_tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

In [3]:
df

Unnamed: 0,text,term,polarity,processed_text
0,I charge it at night and skip taking the cord ...,cord,neutral,charge night skip taking cord good battery life
1,I charge it at night and skip taking the cord ...,battery life,positive,charge night skip taking cord good battery life
3,The tech guy then said the service center does...,service center,negative,tech guy said service center 1 1 exchange dire...
4,The tech guy then said the service center does...,"""sales"" team",negative,tech guy said service center 1 1 exchange dire...
5,The tech guy then said the service center does...,tech guy,neutral,tech guy said service center 1 1 exchange dire...
...,...,...,...,...
3898,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,neutral,also use paralles run virtual machines windows...
3899,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,also use paralles run virtual machines windows...
3900,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,also use paralles run virtual machines windows...
3906,"How Toshiba handles the repair seems to vary, ...",repair,positive,toshiba handles repair seems vary folks indica...


In [4]:
max_len = 128
text_tokenizer = Tokenizer(num_words=10000)
text_tokenizer.fit_on_texts(df['processed_text'])
text_sequences = text_tokenizer.texts_to_sequences(df['processed_text'])
text_sequences = pad_sequences(text_sequences, maxlen=max_len)
print(df.dtypes)


text              object
term              object
polarity          object
processed_text    object
dtype: object


In [5]:
# Tokenize the text and aspect terms

aspect_tokenizer = Tokenizer(num_words=100)
aspect_tokenizer.fit_on_texts(df['term'].astype(str))
aspect_sequences = aspect_tokenizer.texts_to_sequences(df['term'])
aspect_sequences = pad_sequences(aspect_sequences, maxlen=1)

In [6]:
df['polarity']

0        neutral
1       positive
3       negative
4       negative
5        neutral
          ...   
3898     neutral
3899     neutral
3900     neutral
3906    positive
3907     neutral
Name: polarity, Length: 2313, dtype: object

In [25]:
# Split the data into training and testing sets
train_size = int(len(df) * 0.8)
train_text = text_sequences[:train_size]
train_aspect = aspect_sequences[:train_size]
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(np.array(df['polarity']).reshape(-1,1))
train_labels = encoder.transform(np.array(df['polarity'][:train_size]).reshape(-1,1))
test_text = text_sequences[train_size:]
test_aspect = aspect_sequences[train_size:]
test_labels = encoder.transform(np.array(df['polarity'][train_size:]).reshape(-1,1))

In [26]:
# Define the model architecture
text_input = Input(shape=(max_len,))
aspect_input = Input(shape=(1,))
embedding = Embedding(input_dim=10000, output_dim=128, input_length=max_len)(text_input)
dropout = SpatialDropout1D(0.2)(embedding)
lstm = LSTM(128, return_sequences=True)(dropout)
attention = Dense(1, activation='tanh')(lstm)
attention = Flatten()(attention)
attention = Reshape((128, 1))(attention)  # Reshape the attention tensor
attention = Multiply()([attention, RepeatVector(max_len)(aspect_input)])
attention = Activation('softmax')(attention)
context = Multiply()([lstm, attention])
context = Lambda(lambda x: K.sum(x, axis=1))(context)
merged = Concatenate(axis=1)([context, aspect_input])
output = Dense(3, activation='softmax')(merged)


In [27]:
train_labels

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [36]:

model = Model(inputs=[text_input, aspect_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit([train_text, train_aspect], train_labels, validation_data=([test_text, test_aspect], test_labels), epochs=15, batch_size=200)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate([test_text, test_aspect], test_labels, verbose=0)
print('Test accuracy:', test_acc)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test accuracy: 0.5961123108863831


In [37]:
sentances = ["The battery is good. But the processor is bit slow than what i expected.","The battery is good. But the performance is very slow than what i expected."]
aspects = ['battery', 'processor']
sentance_tokenized = text_tokenizer.texts_to_sequences(sentances) # list of tokenized sentences
Aspect_X_train_tokenized = aspect_tokenizer.texts_to_sequences(aspects) # list of tokenized sentences

text_X_train_padded = pad_sequences(sentance_tokenized, maxlen=max_len)
aspect_X_train_padded = pad_sequences(Aspect_X_train_tokenized, maxlen=1)

sample = [text_X_train_padded, aspect_X_train_padded]

In [38]:
encoder.inverse_transform(model.predict(sample))

array([['negative'],
       ['negative']], dtype=object)