In [1]:
# import libraries
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import torch

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Flatten
from keras.models import Model

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/ivankwok/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ivankwok/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ivankwok/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ivankwok/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/ivankwok/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloadi

In [2]:
df = pd.read_csv('amazon.csv')

In [3]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

elif torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    print('MPS device:', mps_device)
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

MPS device: mps


In [4]:
#dfReview for review product name and product description
dfReview = df[['review_title']]
dfReview

Unnamed: 0,review_title
0,"Satisfied,Charging is really fast,Value for mo..."
1,"A Good Braided Cable for Your Type C Device,Go..."
2,"Good speed for earlier versions,Good Product,W..."
3,"Good product,Good one,Nice,Really nice product..."
4,"As good as original,Decent,Good one for second..."
...,...
1460,"Received the product without spanner,Excellent..."
1461,"ok,everything was good couldn't return bcoz I ..."
1462,"very good,Work but front melt after 2 month,Go..."
1463,"Fan Speed is slow,Good quality,Good product,go..."


In [5]:
#replace capital letters and remove punctuation
for column in range(1):
  dfReview.iloc[:,column] = dfReview.iloc[:,column].apply(lambda x: x.lower())
  dfReview.iloc[:,column] = dfReview.iloc[:,column].apply((lambda x: re.sub(r"http\S+"," ",x)))
  dfReview.iloc[:,column] = dfReview.iloc[:,column].apply((lambda x: re.sub('[^a-zA-z0-9\s]'," ",x)))


In [6]:
# change the dfReview to list
review_title = dfReview.values.tolist()

# unwrap the list
review_title = [string[0] for string in review_title]

In [7]:
import string
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter

# Tokenize texts, build vocabulary and find maximum sentence length
def encode(tokenized_texts, word2idx, max_len):


    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to max_len
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(token, word2idx['<unk>']) for token in tokenized_sent]
        input_ids.append(input_id)
    
    return np.array(input_ids)


def tokenize(texts, min_freq=1):
    max_len = 0
    tokenized_texts = []
    word2idx = {}

    # Add <pad> and <unk> tokens to the vocabulary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    word2idx['<BOS>'] = 2
    word2idx['<EOS>'] = 3

    # Tokenize the texts
    for sent in texts:
        # Convert to lowercase and remove punctuation
        sent = sent.lower()
        sent = ''.join(c for c in sent if c not in string.punctuation)

        tokenized_sent = ['<BOS>'] + word_tokenize(sent) + ['<EOS>']
        tokenized_texts.append(tokenized_sent)
        max_len = max(max_len, len(tokenized_sent))

    # Count token frequencies
    token_freq = Counter(token for tokens in tokenized_texts for token in tokens)

    # Building our vocab from the corpus starting from index 4
    idx = 4
    for token, freq in token_freq.items():
        if freq >= min_freq and token not in word2idx:
            word2idx[token] = idx
            idx += 1

    vocab_size = len(word2idx)

    return tokenized_texts, word2idx, vocab_size, max_len

In [8]:
# size of the vocabulary

print("Tokenizing...\n")
tokenized_texts, word2idx, vocab_size, max_len = tokenize(review_title)
input_ids = encode(tokenized_texts, word2idx, max_len)

print("Vocabulary size:", len(word2idx))

Tokenizing...

Vocabulary size: 3698


Q1-2

In [9]:
%%time
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

fastText exists.
CPU times: user 84 µs, sys: 58 µs, total: 142 µs
Wall time: 118 µs


In [10]:
from tqdm import tqdm_notebook

def load_pretrained_vectors(word2idx, fname):


    print("Loading pretrained vectors...")
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())

    # Initilize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    # Load pretrained vectors
    count = 0
    for line in tqdm_notebook(fin):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx:
            count += 1
            
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [11]:
# Load pretrained vectors
embeddings = load_pretrained_vectors(word2idx, "fastText/crawl-300d-2M.vec")
embeddings = torch.tensor(embeddings)

Loading pretrained vectors...


0it [00:00, ?it/s]

There are 3424 / 3698 pretrained vectors found.


In [12]:
# size of the embedding matrix

print(f"size of the embedding matrix: {embeddings.shape}")

size of the embedding matrix: torch.Size([3698, 300])


In [13]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=64):

    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels =\
    tuple(torch.tensor(data) for data in
          [train_inputs, val_inputs, train_labels, val_labels])

    # Specify batch_size
    batch_size = 50

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Embedding, LSTM, concatenate, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [109]:
#preprocess numeric inputs

#rating count
df['rating_count'] = df['rating_count'].replace(',','',regex=True).astype('float64')
#actual price
df['actual_price'] = df['actual_price'].replace(',','',regex=True)
df['actual_price'] = df['actual_price'].replace('₹','',regex=True).astype('float64')
#discounted price
df['discounted_price'] = df['discounted_price'].replace(',','',regex=True)
df['discounted_price'] = df['discounted_price'].replace('₹','',regex=True).astype('float64')


#concatenate numeric values into arrays
actual_price = df[['actual_price']].to_numpy()
rating_count = df[['rating_count']].to_numpy()
int_output = df['discounted_price'].to_numpy()
rating = df['rating'].to_numpy()

In [110]:
# train a scalar and normalize the data attributes
act_price_scaler = StandardScaler()
act_price_inputs = act_price_scaler.fit_transform(actual_price)


In [111]:
rating_scaler = StandardScaler()
rating_inputs = rating_scaler.fit_transform(rating_count)

In [112]:
output_scaler = StandardScaler()
int_output = output_scaler.fit_transform(int_output.reshape(-1,1))

In [99]:
#int_inputs = np.concatenate((act_price_inputs, rating_inputs), axis=1)
int_inputs = act_price_inputs

In [100]:

# Combine both textual and numerical features
X = [review_title, int_inputs]
y = int_output
# Split the data into train and test sets
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(input_ids, int_inputs, y, test_size=0.2, random_state=42)








In [101]:
import keras
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization
from keras.layers import concatenate
from keras.models import Model

max_sequence_length = input_ids.shape[1]  # The length of the input sequences
vocab_size = embeddings.shape[0]  # The size of the vocabulary
embedding_dim = embeddings.shape[1]  # The dimension of the word embeddings
'''
# Define the textual input branch
input_text = Input(shape=(max_len,))
embedding = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length, weights=[embeddings], trainable=False)(input_text)
lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding)

# Define the numerical input branch
input_num = Input(shape=(2,))
dense_num = Dense(32, activation='relu')(input_num)
dense_num = BatchNormalization()(dense_num)
dense_num = Dropout(0.2)(dense_num)

# Merge the two branches and add more layers
merged = concatenate([lstm, dense_num])
merged = Dense(64, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(0.5)(merged)

# Add the output layer
output = Dense(1, activation='linear')(merged)

# Build and compile the model
model = Model(inputs=[input_text, input_num], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()
'''

# Define the model
input_text = Input(shape=(max_len,))
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_len, weights=[embeddings], trainable=False)(input_text)
lstm_layer = LSTM(32)(embedding_layer)

input_num = Input(shape=(1,))
dense_num = Dense(16, activation='relu')(input_num)

merged = concatenate([lstm_layer, dense_num])
output = Dense(1, activation='tanh')(merged)

model = Model(inputs=[input_text, input_num], outputs=output)

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit([X_train_text, X_train_num], y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x367f8e5e0>

In [102]:
# Evaluate the model
model.evaluate([X_test_text, X_test_num], y_test)



0.5482922196388245

In [124]:
# import the necessary packages


import numpy as np
import pandas as pd
import torch

# make a simple predictor function
def predict(text, actual_price):
    # Convert the text to input ids
    # Convert to lowercase and remove punctuation
    sent = text.lower()
    sent = ''.join(c for c in sent if c not in string.punctuation)
    
    sent = ['<BOS>'] + word_tokenize(sent) + ['<EOS>']
    print(sent)
    sent += ['<pad>'] * (max_len - len(sent))

    input_ids = [word2idx[word] if word in word2idx else word2idx['<unk>'] for word in sent]
    pred_int_input = input_scaler.transform(np.array([[actual_price]]))

    input_ids = np.array(input_ids).reshape(1, -1)
    # Predict the discounted price
    
    pred = model.predict([input_ids, pred_int_input])
    print(pred)
    # Convert the discounted price to the original scale
    discounted_price = output_scaler.inverse_transform(pred.reshape(-1, 1))
    return discounted_price[0][0]

actual_price = 600
#rating_count = 400000
text = "Dont like."

predicted_discount_price = predict(text, actual_price)

print(f"Review title: {text}")
print(f"Actual price: {actual_price}")
#print(f"Rating count: {rating_count}")


print(f"Predicted discounted price: {predicted_discount_price:.2f}")

['<BOS>', 'dont', 'like', '<EOS>']
[[-0.38366398]]
Review title: Dont like.
Actual price: 600
Predicted discounted price: 461.94
