In [None]:
!pip install transformers
!pip install torch
!pip install spacy
!pip install accelerate
!python -m spacy download en_core_web_sm


In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import spacy

# Load spacy model for sentence splitting
nlp = spacy.load('en_core_web_sm')


In [3]:
# Load the dataset
data = pd.read_csv('./Balanced_Reviews_Dataset_with_Minimum_100_Reviews_per_Class.csv')

# Basic data cleaning
data.dropna(subset=['content', 'rating'], inplace=True)
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')

# Define a function to categorize ratings into sentiment classes
def categorize_sentiment(rating):
    if rating in [4, 5]:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

# Apply the function to create a new 'sentiment' column
data['sentiment'] = data['rating'].apply(categorize_sentiment)

# Map sentiments to numeric labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['sentiment'])

# Display the first few rows of the dataset
data.head()


Unnamed: 0,id,product,title,content,date,author,rating,url,sentiment,label
0,749,Intel i3-10105F 3.7 GHz Upto 3.7 GHz LGA 1200 ...,wow,retailnet trusted seller buy without doubt,"Feb, 2023",sandE .,5,https://www.flipkart.com/intel-i3-10105f-3-7-g...,Positive,2
1,508,Intel Core i7-13700K Gaming Desktop Processor ...,great cpu,super fast problem installing game light produ...,"Sep, 2023",silly goose,5,https://www.amazon.com/Intel-i7-13700K-Desktop...,Positive,2
2,556,Intel Core i5-13600KF Desktop Processor 14 cor...,great performance,issue added system year cpu world better game ...,"May, 2024",Clinton Peterson,5,https://www.amazon.com/Intel-i5-13600KF-Deskto...,Positive,2
3,465,Intel Core i7-13700K Gaming Desktop Processor ...,super fast run hot,super fast stable cpu note run hot higher volt...,"Nov, 2023",Mikhail E Merkurieff,5,https://www.amazon.com/Intel-i7-13700K-Desktop...,Positive,2
4,144,Intel Core i7-12700KF Gaming Desktop Processor...,perfect,work well rtx minecraft shaders turned max,"Feb, 2024",Aaron tyus,5,https://www.amazon.com/Intel-i7-12700KF-Deskto...,Positive,2


In [4]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create data loaders
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SentimentDataset(
        texts=df.content.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=4)

df_train, df_val = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])

train_data_loader = create_data_loader(df_train, tokenizer, max_len=160, batch_size=16)
val_data_loader = create_data_loader(df_val, tokenizer, max_len=160, batch_size=16)


In [5]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_loader.dataset,
    eval_dataset=val_data_loader.dataset
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,1.1709,1.126186
2,1.0706,1.056784
3,1.0207,0.978817
4,0.8117,0.856717
5,0.6999,0.607545
6,0.5056,0.608404
7,0.3502,0.431088
8,0.1752,0.463315
9,0.1477,0.437219
10,0.0595,0.515403


TrainOutput(global_step=250, training_loss=0.615006183385849, metrics={'train_runtime': 163.2184, 'train_samples_per_second': 24.507, 'train_steps_per_second': 1.532, 'total_flos': 328891772160000.0, 'train_loss': 0.615006183385849, 'epoch': 10.0})

In [6]:
# Evaluate the model
trainer.evaluate()


{'eval_loss': 0.5154025554656982,
 'eval_runtime': 0.7979,
 'eval_samples_per_second': 125.33,
 'eval_steps_per_second': 8.773,
 'epoch': 10.0}

In [7]:
improvement_suggestions = {
    'heating': 'Consider improving the cooling system or using materials that dissipate heat better.',
    'overheat': 'Improve thermal management to prevent overheating issues.',
    'noisy': 'Look into better sound insulation or quieter components.',
    'loud': 'Reduce the noise levels by using quieter components or better insulation.',
    'battery': 'Optimize battery usage or consider using higher capacity batteries.',
    'drain': 'Improve battery efficiency to reduce quick draining.',
    'weight': 'Explore using lighter materials for construction.',
    'heavy': 'Consider using lightweight materials to make the product easier to handle.',
    'price': 'Consider offering discounts or adding value through bundling services or products.',
    'expensive': 'Review the pricing strategy to make the product more affordable.',
    'performance': 'Enhance performance through software updates or hardware improvements.',
    'slow': 'Improve processing speed to enhance performance.',
    'lag': 'Optimize the system to reduce lag and improve user experience.',
    'design': 'Refine the design for better aesthetics and functionality.',
    'ergonomics': 'Improve ergonomic design to ensure comfort and ease of use.',
    'connectivity': 'Enhance connectivity options for better user experience.',
    'wifi': 'Improve WiFi reception and stability.',
    'bluetooth': 'Ensure Bluetooth connections are stable and have a good range.',
    'display': 'Upgrade the display for better resolution and color accuracy.',
    'brightness': 'Increase screen brightness for better visibility in various lighting conditions.',
    'contrast': 'Improve screen contrast for better image quality.',
    'durability': 'Use more durable materials to increase the product’s lifespan.',
    'fragile': 'Strengthen the build to make the product less prone to damage.',
    'reliability': 'Enhance reliability through better quality control and testing.',
    'bugs': 'Address software bugs through updates and patches.',
    'support': 'Improve customer support for better user satisfaction.',
    'warranty': 'Extend warranty options to provide more security to the users.',
    'features': 'Add more useful features to enhance product value.',
    'compatibility': 'Ensure compatibility with a wide range of devices and platforms.',
    'setup': 'Simplify the setup process to make it more user-friendly.',
    'manual': 'Provide a more detailed and clear user manual.',
    'instructions': 'Ensure instructions are clear and easy to follow.',
    'updates': 'Regularly provide software updates to fix issues and add new features.',
    'security': 'Enhance security features to protect user data and privacy.'
}


In [8]:

def highlight_sentiment(text, tokenizer, model):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    pos_aspects = []
    neg_aspects = []
    neu_aspects = []
    improvement_list = []

    total_sentences = 0
    positive_count = 0
    negative_count = 0
    neutral_count = 0

    for sent in doc.sents:
        total_sentences += 1
        inputs = tokenizer(sent.text, return_tensors='pt').to(device)
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment = torch.argmax(probs).item()

        if sentiment == 2:
            pos_aspects.append(sent.text.strip())
            positive_count += 1
        elif sentiment == 0:
            neg_aspects.append(sent.text.strip())
            negative_count += 1
            for word in sent.text.split():
                word_lower = word.lower()
                if word_lower in improvement_suggestions:
                    improvement_list.append(improvement_suggestions[word_lower])
        else:
            neu_aspects.append(sent.text.strip())
            neutral_count += 1

    positive_score = positive_count / total_sentences
    negative_score = negative_count / total_sentences
    neutral_score = neutral_count / total_sentences

    if positive_score > 0.6:
        overall_sentiment = "Mostly Positive"
    elif positive_score > 0.3:
        overall_sentiment = "Partially Positive"
    elif negative_score > 0.6:
        overall_sentiment = "Mostly Negative"
    elif negative_score > 0.3:
        overall_sentiment = "Partially Negative"
    else:
        overall_sentiment = "Neutral"

    return {
        'Positive': ' '.join(pos_aspects),
        'Negative': ' '.join(neg_aspects),
        'Neutral': ' '.join(neu_aspects),
        'Overall Sentiment': overall_sentiment,
        'Scores': {
            'Positive': positive_score,
            'Negative': negative_score,
            'Neutral': neutral_score
        },
        'Improvements': list(set(improvement_list))
    }

def predict_and_explain(text, tokenizer, model):
    sentiment = highlight_sentiment(text, tokenizer, model)
    return sentiment

# Example usage
review = "I recently purchased this processor and I am very impressed with its performance. It handles multitasking very efficiently, and my system has become much faster. The installation was straightforward, and the cooling system works very well, keeping the temperatures low. However, I have noticed that it tends to get quite noisy when under heavy load, which is a bit annoying. Additionally, the price is a bit high compared to similar products, but overall, I am satisfied with this purchase."
highlighted_review = predict_and_explain(review, tokenizer, model)

print(f"Highlighted Review: {highlighted_review}")

Highlighted Review: {'Positive': 'I recently purchased this processor and I am very impressed with its performance. It handles multitasking very efficiently, and my system has become much faster. The installation was straightforward, and the cooling system works very well, keeping the temperatures low. Additionally, the price is a bit high compared to similar products, but overall, I am satisfied with this purchase.', 'Negative': '', 'Neutral': 'However, I have noticed that it tends to get quite noisy when under heavy load, which is a bit annoying.', 'Overall Sentiment': 'Mostly Positive', 'Scores': {'Positive': 0.8, 'Negative': 0.0, 'Neutral': 0.2}, 'Improvements': []}


In [10]:
review = """
I recently purchased this processor and I am very impressed with its performance. It handles multitasking very efficiently, and my system has become much faster. The installation was straightforward, and the cooling system works very well, keeping the temperatures low. However, I have noticed that it tends to get quite noisy when under heavy load, which is a bit annoying. Additionally, the price is a bit high compared to similar products, but overall, I am satisfied with this purchase.
"""

highlighted_review = predict_and_explain(review, tokenizer, model)

print(f"Highlighted Review: {highlighted_review}")


Highlighted Review: {'Positive': 'I recently purchased this processor and I am very impressed with its performance. It handles multitasking very efficiently, and my system has become much faster. The installation was straightforward, and the cooling system works very well, keeping the temperatures low. Additionally, the price is a bit high compared to similar products, but overall, I am satisfied with this purchase.', 'Negative': 'However, I have noticed that it tends to get quite noisy when under heavy load, which is a bit annoying.', 'Neutral': '', 'Overall Sentiment': 'Mostly Positive', 'Scores': {'Positive': 0.8, 'Negative': 0.2, 'Neutral': 0.0}, 'Improvements': ['Look into better sound insulation or quieter components.']}


In [9]:
from transformers import BertTokenizer, BertForSequenceClassification

# Assuming 'model' is your trained BERT model and 'tokenizer' is the corresponding tokenizer
model_name = "/content/"

# Save the model
model.save_pretrained(model_name)

# Save the tokenizer
tokenizer.save_pretrained(model_name)

print(f"Model and tokenizer saved in {model_name}")


Model and tokenizer saved in /content/


In [25]:
df = pd.DataFrame(data)
import seaborn as sns
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to('cuda')

# Convert data to DataFrame
df = pd.DataFrame(data)

# Initialize lists to store true labels and predictions
true_labels = []
predictions = []

# Loop through each row in the DataFrame
for idx, row in df.iterrows():
    text = f"{row['title']} {row['content']}"
    sentiment_prediction = evaluate_sentiment(text)
    
    # Assuming 'rating' column represents labels (you should replace with actual labels from your data)
    true_labels.append(row['rating'])  # Use 'rating' or another appropriate column for true labels
    predictions.append(sentiment_prediction[0])  # Assuming evaluate_sentiment returns a list with one prediction

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
conf_matrix = confusion_matrix(true_labels, predictions)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Neutral', 'Positive'], yticklabels=['Negative', 'Neutral', 'Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)