In [None]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, BertTokenizer, DistilBertForSequenceClassification
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS
import spacy
import matplotlib.pyplot as plt

from functions import getdata
from classes import amazon_dataset, amazon_dataset_run

In [None]:
# Select the checkpoint to evaluate. The name is kept in its own variable
# (model_name) so that `model` only ever refers to the loaded model object;
# re-running this cell can then never replace a loaded model with a string.
model_name = 'distilbert-base-uncased_2e-05_32_0.3'

# Path handed to from_pretrained() when the model is loaded.
model_path = f'../modelsV2/dis/{model_name}'

In [None]:
# Load model
# Reads the DistilBERT sequence-classification checkpoint stored at model_path
# and binds the resulting model object to `model`.
model = DistilBertForSequenceClassification.from_pretrained(model_path)

In [None]:
# Optional: scrape the reviews from the web instead of reading CSV exports.
# options = webdriver.ChromeOptions()
# driver_path = ChromeDriverManager().install()
#
# reviews = getdata('https://www.amazon.com/product-reviews/B0828BJGD2/', options, driver_path)
#
# df = pd.Series(reviews)

# CSV exports, one per product. Fill in these paths before running the cell.
BOSE_CSV = ""
SONY_CSV = ""
YUANDIDU_CSV = ""

bose = pd.read_csv(BOSE_CSV)
sony = pd.read_csv(SONY_CSV)
yuandidu = pd.read_csv(YUANDIDU_CSV)

# Optional: to analyse every product together, concatenate the frames.
# df = pd.concat([bose, sony, yuandidu])
# print(len(bose), len(sony), len(yuandidu), len(df), len(bose) + len(sony) + len(yuandidu))
#
# print(df.info())
# reviews = df[['text', 'rating']]
#
# bose_reviews = reviews[reviews['text'].str.contains('bose', case=False)]
# print(len(bose_reviews))

# Analyse the Sony reviews only: keep the review text and the star rating.
df = sony[['text', 'rating']]

# Rows with no review text come back from read_csv as NaN (a float), which
# makes any later ' '.join(reviews) raise TypeError. Drop them and make sure
# every remaining entry is a str. `df` itself is left untouched so the rating
# histogram still covers every row.
reviews = df['text'].dropna().astype(str)

In [None]:
# Plot the distribution of the 'rating' column as a 5-bin histogram.
df['rating'].hist(bins=5)

In [None]:
# Display the review texts as a quick sanity check of what was loaded.
reviews

In [None]:
# Get stopwords and create the word cloud.
stopwords = set(STOPWORDS)

# Build the cloud from every review rather than only the first one, so it
# reflects the whole data set. Missing reviews (NaN) are skipped because
# str.join only accepts strings.
all_review_text = " ".join(reviews.dropna().astype(str))
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='black',
    max_words=100,
    width=800,
    height=400,
).generate(all_review_text)

In [None]:
# Draw the word cloud on a 10x5 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Concatenate every review into a single space-separated string.
# NOTE(review): joined_text is not referenced by any other cell visible here.
joined_text = " ".join(reviews)

In [None]:
# Tokenizer used to encode the review sentences before they are fed to the model.
# NOTE(review): this loads the 'bert-base-uncased' tokenizer while the model is
# a DistilBERT checkpoint; presumably both share the same vocabulary — confirm,
# or load the tokenizer that was saved alongside the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using: {device}')

# Move the model's parameters onto the selected device.
model.to(device)

In [None]:
# Get the sentences that mention any of the target keywords
def extract_relevant_paragraphs(text, keywords, max_instances=50):
    """Split review text into sentences and keep those that mention a keyword.

    All entries of ``text`` are joined with spaces and the result is split on
    '.'. Only the first ``max_instances`` fragments are inspected.

    Args:
        text: Iterable of review strings. Non-string entries (for example the
            NaN pandas uses for missing values) are ignored.
        keywords: List of keywords to look for. Matching is a case-insensitive
            substring test, so a short keyword such as 'ANC' also matches
            words like 'range'. If the first keyword is exactly 'total',
            every sentence is kept.
        max_instances: Maximum number of '.'-separated fragments to inspect
            (default 50, kept low for testing speed). Pass None to inspect
            all of them.

    Returns:
        List of matching sentences with surrounding whitespace stripped, in
        their original order. Blank fragments are never returned.
    """
    # Join only real strings: a NaN (float) entry would make str.join raise.
    combined = ' '.join(piece for piece in text if isinstance(piece, str))
    instances = combined.split('.')
    if max_instances is not None:
        instances = instances[:max_instances]

    # Guard against an empty keyword list before looking at keywords[0].
    keep_all = bool(keywords) and keywords[0] == 'total'
    # Lower-case the keywords once instead of once per sentence.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    relevant_instances = []
    for instance in instances:
        sentence = instance.strip()
        # Splitting on '.' leaves empty fragments (e.g. after the final
        # full stop); they carry no sentiment, so skip them.
        if not sentence:
            continue
        lowered_instance = instance.lower()
        if keep_all or any(keyword in lowered_instance for keyword in lowered_keywords):
            relevant_instances.append(sentence)

    return relevant_instances

features = ['total', 'bluetooth', 'ANC']
survey_dict = {}

# Minimum number of matching sentences needed before a feature is scored.
MIN_DATAPOINTS = 5

# Inference only: switch the model to evaluation mode once, up front,
# instead of on every batch.
model.eval()

# Get relevant sentences for each target feature and create model sentiment predictions
for feature in features:
    relevant_instances = extract_relevant_paragraphs(reviews, [feature])
    print(len(relevant_instances))

    # Only continue if there are enough datapoints
    if len(relevant_instances) >= MIN_DATAPOINTS:

        reviews_encoded = tokenizer.batch_encode_plus(
            relevant_instances,
            add_special_tokens=True,
            truncation=True,
            padding=True,
            return_tensors='pt',
            max_length=128,
            return_attention_mask=True,
        )
        dataset = amazon_dataset_run(reviews_encoded, labels=None)
        # shuffle=False keeps the predictions in the same order as
        # relevant_instances, so each one is printed next to its own sentence.
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

        preds = []

        # No gradients are needed for inference.
        with torch.no_grad():
            for batch in tqdm(dataloader):
                inputs = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                output = model(inputs, attention_mask=attention_mask)

                # Predicted class index = position of the largest logit.
                predictions = torch.argmax(output.logits, dim=-1)

                # Move to the CPU before converting: .numpy() raises on CUDA
                # tensors. Storing plain ints also works for any batch size.
                preds.extend(predictions.cpu().tolist())

        for pred, instance in zip(preds, relevant_instances):
            print(pred, instance)

        # Mean predicted class index over all sentences for this feature.
        mean_sentiment = sum(preds) / len(preds)
        print(mean_sentiment)

        # Add results to dictionary
        survey_dict[feature] = mean_sentiment
    else:
        # Label as not enough datapoints if there are fewer points than MIN_DATAPOINTS
        survey_dict[feature] = f'Not enough data: {len(relevant_instances)} datapoints found'

In [None]:
# Show the raw per-feature results, then tabulate them: one row per feature,
# with a single 'Sentiment' column.
print(survey_dict)
results_df = pd.Series(survey_dict, name='Sentiment').to_frame()

In [None]:
#results_df.to_csv('C:/Users/UKGC/Sensian Research Ltd/Overall Data Sharing - Documents/Python Scripts/Sentiment analysis/bose_results.csv')