In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS
import spacy
import matplotlib.pyplot as plt

from functions import getdata
from classes import amazon_dataset, amazon_dataset_run

In [None]:
# Name of the fine-tuned checkpoint directory to load.
# Kept under its own name so it does not share the `model` variable with the
# loaded network: re-running this cell must not overwrite the loaded model
# with a string.
model_name = 'model1'

# Path to the checkpoint directory, relative to this notebook
model_path = f'../models/{model_name}'

In [None]:
# Load the BERT sequence-classification model from the checkpoint at `model_path`
model = BertForSequenceClassification.from_pretrained(model_path)

In [None]:
# Optional alternative: scrape the reviews from the web instead of reading CSVs.
# Uncomment the lines below to use it.
# options = webdriver.ChromeOptions()
# driver_path = ChromeDriverManager().install()
#
# reviews = getdata('https://www.amazon.com/product-reviews/B0828BJGD2/',options,driver_path)
#
# df = pd.Series(reviews)

# Load the review CSV files; fill in the file paths before running this cell
bose = pd.read_csv("")
sony = pd.read_csv("")
yuandidu = pd.read_csv("")

# Optional: to use all the data, the three frames can be concatenated.
# Uncomment the lines below to do so.
# df = pd.concat([bose,sony,yuandidu])
# print(len(bose), len(sony), len(yuandidu),len(df),len(bose)+len(sony)+len(yuandidu))
#
# print(df.info())
# reviews = df[['text','rating']]
#
# bose_reviews = reviews[reviews['text'].str.contains('bose', case=False)]
# print(len(bose_reviews))

# Keep only the review text and its star rating for the product being analysed
df = sony[['text','rating']]

# Review texts only. A review with no text is read by pandas as NaN (a float),
# which makes the later " ".join(reviews) raise TypeError, so drop those rows
# and make sure every remaining entry is a str.
reviews = df['text'].dropna().astype(str)

In [None]:
# Display the review texts (the notebook renders the cell's last expression)
reviews

In [None]:
# Remove stop words and generate a word cloud from the remaining text for visualisation
stopwords = set(STOPWORDS)

# Build the cloud from every review rather than only the first one.
# Missing entries (NaN) are skipped so that str.join only ever sees strings.
wordcloud_text = " ".join(str(review) for review in reviews if pd.notna(review))
wordcloud = WordCloud(stopwords=stopwords, background_color='white', max_words=100, width=800, height=400).generate(wordcloud_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Join all review texts into a single space-separated string, ready for text extraction.
# NOTE(review): str.join raises TypeError if any entry is not a str (e.g. a NaN
# from a review with no text) — confirm `reviews` holds only strings.
joined_text = " ".join(reviews)

In [None]:
def extract_relevant_paragraphs(text, keywords):
    """Return the sentences of ``text`` that mention at least one keyword.

    ``text`` is cut into sentences at every ``'.'`` character. A sentence is
    kept when any keyword occurs in it as a case-insensitive substring.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of the matching sentences, in their original order, each with
        leading and trailing whitespace stripped.
    """
    def mentions_keyword(fragment):
        # Case-insensitive substring test against each keyword in turn
        lowered = fragment.lower()
        for term in keywords:
            if term.lower() in lowered:
                return True
        return False

    return [
        fragment.strip()
        for fragment in text.split('.')
        if mentions_keyword(fragment)
    ]

# Collect every sentence that mentions one of the Bluetooth keywords; the same
# list is what gets passed on to the model as `reviews_filtered`.
reviews_filtered = bluetooth_paragraphs = extract_relevant_paragraphs(
    joined_text,
    ["Bluetooth","A2DP"],
)

# Show the matches and how many there are
print("Bluetooth Discussion:", bluetooth_paragraphs)
print(len(bluetooth_paragraphs))

In [None]:
# Get the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the reviews with the tokenizer.
# The tokenizer is called directly: this is the documented replacement for the
# deprecated batch_encode_plus and takes the same arguments for a list of texts.
# Inputs are padded to the longest item in the batch and truncated at 128 tokens.
reviews_encoded = tokenizer(
    reviews_filtered,
    add_special_tokens=True,
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=128,
    return_attention_mask=True,
)

In [None]:
# Wrap the encoded reviews in the project's dataset class for inference;
# no labels are passed (labels=None).
# NOTE(review): the prediction loop reads batch['input_ids'] and
# batch['attention_mask'], so amazon_dataset_run presumably yields dicts with
# those keys — confirm in classes.py.
dataset = amazon_dataset_run(reviews_encoded, labels=None)

In [None]:
# Feed the model one review at a time, in the original order.
# Shuffling is only useful for training; at inference it would just break the
# correspondence between the predictions and the entries of the dataset.
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using: {device}')

# Move the model's parameters onto the selected device
model.to(device)

In [None]:
preds = []

# Put the model in evaluation mode once, before the loop
model.eval()

# Run model and make predictions
for batch in tqdm(dataloader):
    # No gradients are needed for inference
    with torch.no_grad():
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        output = model(inputs, attention_mask=attention_mask)

        # Predicted class = index of the largest logit
        logits = output.logits
        predictions = torch.argmax(logits, dim=-1)

        # A tensor on a CUDA device cannot be converted to NumPy directly,
        # so copy it to host memory first (a no-op when already on the CPU)
        preds.append(predictions.cpu().numpy())

In [None]:
# Print the mean score
if preds:
    # Flatten the per-batch arrays first so the result is the mean over all
    # predictions, whatever the batch size
    print(np.concatenate(preds).mean())
else:
    # Nothing was predicted (e.g. no sentence matched the keywords);
    # avoid dividing by zero
    print("No predictions to average.")