# **Importing Libraries & Downloading Data**

In [1]:
import opendatasets as od
# Pull the Amazon reviews dataset from Kaggle into the working directory.
DATASET_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
od.download(DATASET_URL)

Skipping, found downloaded files in ".\amazonreviews" (use force=True to force download)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers,optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re
import keras
%matplotlib inline
import os
# Show which files are present in the dataset folder.
downloaded_files = os.listdir("amazonreviews")
print(downloaded_files)

['test.ft.txt.bz2', 'train.ft.txt.bz2']


# **Checking for CUDA / GPU Availability**

In [3]:
# Enumerate every physical device (CPU and any accelerators) TensorFlow can see.
visible_devices = tf.config.list_physical_devices()
visible_devices

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [4]:
# Report whether this TensorFlow build was compiled with CUDA support.
cuda_build = tf.test.is_built_with_cuda()
cuda_build

False

# **Reading data from File**

In [5]:
def getdata(file):
    """Read a bz2-compressed review file into a label array and a text list.

    Every non-blank line is decoded with the default codec and sliced at fixed
    offsets: character 9 is the label digit and everything from character 11
    onwards is the review text (e.g. ``__label__2 some review`` -- format
    assumed from the fixed offsets; confirm against the data).

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.

    Returns:
        A ``(labels, texts)`` tuple. ``labels`` is an integer NumPy array
        holding the label digit minus one; ``texts`` is a list of the
        whitespace-stripped review strings, in file order.
    """
    labels = []
    texts = []
    # The context manager closes the archive even if a line fails to parse.
    with bz2.BZ2File(file) as archive:
        for raw_line in archive:
            line = raw_line.decode()
            if not line.strip():
                # A blank (e.g. trailing) line has no label to index into.
                continue
            labels.append(int(line[9]) - 1)
            texts.append(line[11:].strip())
    # dtype=int keeps an empty file from producing a float array.
    return np.array(labels, dtype=int), texts
# Both CSVs are loaded the same way and given the same three column names.
# NOTE(review): read_csv consumes the first row as a header before the rename;
# if these files have no header row, that first record is dropped -- confirm.
def _read_reviews(csv_path):
    frame = pd.read_csv(csv_path)
    frame.columns = ['sentiment', 'title', 'text']
    return frame

test = _read_reviews('resources/test.csv')
train = _read_reviews('resources/train.csv')

In [68]:
# Collapse the star rating into three sentiment classes:
# 1-2 stars -> 1, 3 stars -> 2, 4-5 stars -> 3.
STARS_TO_SENTIMENT = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}

train = pd.read_csv('resources/dataset.csv')
train.columns = ['stars', 'title', 'text']
# To leave out the middle class, filter first: train = train[train['stars'] != 3]
train['sentiment'] = train['stars'].replace(STARS_TO_SENTIMENT)

In [69]:
# Merge title and body into a single text field per review.
# Missing titles/bodies are treated as empty strings: concatenating a string
# with NaN yields NaN, which the astype(str) below would turn into the literal
# "nan" and silently discard the rest of that review.
# NOTE: 'text' is overwritten in place, so re-running this cell prepends the
# title a second time.
test['text'] = test['title'].fillna('').astype(str) + " " + test['text'].fillna('').astype(str)
train['text'] = train['title'].fillna('').astype(str) + " " + train['text'].fillna('').astype(str)

# One-hot encode the 1..3 sentiment classes as indices 0..2.
# (Alternatives previously tried here: a 5-class one-hot on 'stars', or plain
# integer labels from 'sentiment' - 1.)
train_labels = keras.utils.to_categorical(train['sentiment'] - 1, num_classes=3)
train_texts = train['text'].astype(str).values

In [70]:
# Keep only the first 500,000 reviews together with their labels.
MAX_TRAIN_SAMPLES = 500_000
train_texts, train_labels = train_texts[:MAX_TRAIN_SAMPLES], train_labels[:MAX_TRAIN_SAMPLES]

# **Text Preprocessing**

In [71]:
# Compiled once at definition time instead of once per review.
_NON_WORD = re.compile(r"['\W']")
_NON_ASCII_ALNUM = re.compile(r"[^a-z0-9\s]")


def normalise(texts):
    """Lower-case each text and blank out everything except a-z, 0-9 and whitespace.

    Two substitutions are applied in order, each replacing a match with a
    single space: first every non-word character (and apostrophe), then any
    remaining character that is not a lower-case ASCII letter, a digit or
    whitespace (underscores, accented letters, ...).

    The second pattern previously read ``0-1``, which removed the digits 2-9
    while keeping 0 and 1; it now keeps all ten digits.

    Args:
        texts: Iterable of ``str``.

    Returns:
        NumPy array of the cleaned strings, in input order.
    """
    return np.array([
        _NON_ASCII_ALNUM.sub(r" ", _NON_WORD.sub(r" ", text.lower()))
        for text in texts
    ])

# Replace the raw training strings with their normalised form; the validation
# texts are split off from this array in the next section.
train_texts=normalise(train_texts)
# Disabled: no visible cell defines test_texts at this point.
# test_texts=normalise(test_texts)

# **Train & Validation Split**
### 20% for validation

In [72]:
from sklearn.model_selection import train_test_split

# Hold out a fixed fraction of the training pool for validation; the seed
# makes the shuffle repeatable.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 0

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

## **Using top 12000 words as features**

In [73]:
# tokenizer=Tokenizer(num_words=12000)
# tokenizer.fit_on_texts(train_texts)
# train_texts = tokenizer.texts_to_sequences(train_texts)
# val_texts = tokenizer.texts_to_sequences(val_texts)
# test_texts = tokenizer.texts_to_sequences(test_texts)

In [74]:
# mlen = max(len(train_ex) for train_ex in train_texts)
# train_texts = pad_sequences(train_texts, maxlen=mlen)
# val_texts = pad_sequences(val_texts, maxlen=mlen)
# test_texts = pad_sequences(test_texts, maxlen=mlen)

In [75]:
# Import from the public keras.layers namespace rather than the private
# keras.src package.
from keras.layers import TextVectorization

# output_sequence_length is counted in tokens, so size it from the longest
# review measured in whitespace-separated words. len(text) would measure
# characters and pad every sequence far beyond the longest real review.
mlen = max(len(train_ex.split()) for train_ex in train_texts)
vectorize_layer = TextVectorization(max_tokens=12_000, output_sequence_length=mlen)
# Learn the vocabulary from the training split only.
vectorize_layer.adapt(train_texts)

# **Convolutional Neural Net Model (CNN)**

In [91]:
from keras.src.layers import Embedding, BatchNormalization, MaxPooling1D, GlobalMaxPooling1D, Activation, Dense, Dropout, Input, Conv1D, Flatten
from keras import Sequential

def build_model(n_classes=3):
    """Build and compile the functional-API 1-D CNN text classifier.

    Raw strings go through the module-level ``vectorize_layer``, a 64-dim
    embedding, three Conv1D stages (the first two followed by batch
    normalisation and max pooling, the last by global max pooling), a
    100-unit dense layer and a softmax output.

    Args:
        n_classes: Number of output classes. Defaults to 3, the value that
            was previously hard-coded, so existing ``build_model()`` calls
            behave as before; mirrors the parameter of ``build_model2``.

    Returns:
        A compiled Keras model (RMSprop optimizer, categorical cross-entropy
        loss, accuracy metric) that expects one-hot labels.
    """
    sequences = layers.Input(shape=(1,), dtype='string')
    token_ids = vectorize_layer(sequences)
    # 12000 matches the max_tokens the vectorizer was created with.
    embedded = layers.Embedding(12000, 64)(token_ids)

    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)

    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)

    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    # Global pooling already yields a 2-D tensor; Flatten is kept so the layer
    # stack stays parallel to build_model2.
    x = layers.Flatten()(x)

    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(n_classes, activation='softmax')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
    return model

def build_model2(n_classes):
    """Assemble and compile the Sequential 1-D CNN text classifier.

    The stack is: string input -> module-level ``vectorize_layer`` -> 64-dim
    embedding -> three Conv1D/BatchNorm/ReLU stages (max pooling after the
    first two, global max pooling after the third) -> Flatten -> Dense(100)
    -> softmax over ``n_classes``.

    Args:
        n_classes: Width of the softmax output layer.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    def conv_stage(kernel_size, pooling_cls, *pooling_args):
        # Conv -> BatchNorm -> ReLU -> pooling, instantiated in that order.
        return [
            Conv1D(64, kernel_size),
            BatchNormalization(),
            Activation('relu'),
            pooling_cls(*pooling_args),
        ]

    stack = [
        Input(shape=(1,), dtype='string'),
        vectorize_layer,
        Embedding(12_000, 64),
    ]
    stack += conv_stage(3, MaxPooling1D, 3)
    stack += conv_stage(5, MaxPooling1D, 5)
    stack += conv_stage(5, GlobalMaxPooling1D)
    stack += [
        Flatten(),
        Dense(100, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ]

    model = Sequential(stack)
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    return model

In [92]:
# Train on the first GPU when one is visible; otherwise fall back to the CPU
# rather than pinning ops to a '/GPU:0' device that may not exist (the device
# listing earlier in this notebook shows a CPU only).
device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
with tf.device(device_name):
    model = build_model2(3)
    # Keep the History object so the loss/accuracy curves can be inspected later.
    history = model.fit(
        train_texts,
        train_labels,
        batch_size=128,
        epochs=2,
        validation_data=(val_texts, val_labels),
    )

Epoch 1/2
Epoch 2/2

KeyboardInterrupt: 

In [None]:
# Build the evaluation inputs from the `test` frame: no visible cell defines
# test_texts / test_labels, so the original lines could not run on a fresh kernel.
test_texts = normalise(test['text'].astype(str).values)
# NOTE(review): assumes test.csv 'sentiment' uses the same 1..3 coding as the
# training labels -- TODO confirm before trusting these scores.
test_labels = test['sentiment'].values - 1

preds = model.predict(test_texts)
# The model emits one softmax probability per class, so take the arg-max class
# instead of thresholding each column at 0.5 (a binary-classifier idiom).
pred_classes = preds.argmax(axis=1)

print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, pred_classes)))
# Multi-class F1 needs an averaging mode; macro weights every class equally.
print('F1 score: {:0.4}'.format(f1_score(test_labels, pred_classes, average='macro')))
# Multi-class ROC AUC is computed one-vs-rest from the class probabilities.
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds, multi_class='ovr')))

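# **Therefore we get an accuracy of about 94-95%** (not reproduced by the interrupted training run shown above; re-run the notebook end to end to confirm)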