# Sentences with unknown sentiment

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used further down: the English stop-word list and
# the 'punkt' tokenizer data.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

# Example sentences used throughout the notebook (one string per sentence).
new_sentences = [
    # product / service feedback
    "I love this product. It's amazing!",
    "The customer service was terrible.",
    # everyday remarks
    "The weather is beautiful today.",
    "This book is so boring.",
    "I'm not sure how I feel about this movie.",
]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Pre-processing the text

**Tokenization**

In [9]:
# Split every sentence into a list of word and punctuation tokens
tokenized_sentences = list(map(word_tokenize, new_sentences))

# Show only the first two tokenized sentences as a sample
for position, tokens in enumerate(tokenized_sentences[:2], start=1):
    print(f"Tokenized Sentence {position}: {tokens}")

Tokenized Sentence 1: ['I', 'love', 'this', 'product', '.', 'It', "'s", 'amazing', '!']
Tokenized Sentence 2: ['The', 'customer', 'service', 'was', 'terrible', '.']


**Removing stop words/punctuation marks**

In [10]:
# Keep only alphanumeric tokens that are not English stop words, lower-cased
stop_words = set(stopwords.words('english'))

preprocessed_sentences = []
for tokens in tokenized_sentences:
    kept_words = []
    for token in tokens:
        lowered = token.lower()
        # isalnum() is checked on the raw token; it drops punctuation and
        # tokens such as "'s" that contain non-alphanumeric characters
        if token.isalnum() and lowered not in stop_words:
            kept_words.append(lowered)
    preprocessed_sentences.append(kept_words)

# Show only the first two preprocessed sentences as a sample
for position, kept_words in enumerate(preprocessed_sentences[:2], start=1):
    print(f"Preprocessed Sentence {position}: {' '.join(kept_words)}")

Preprocessed Sentence 1: love product amazing
Preprocessed Sentence 2: customer service terrible


**Model training**

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Fine-tune a BERT sequence classifier on the preprocessed sentences.

# Define the number of training epochs
epochs = 3

# Seed PyTorch before the model is created so the newly initialized classifier
# head (and any dropout applied in train mode) is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 labels: positive and negative

# Convert data to input format.
# Each entry of `preprocessed_sentences` is a list of words. Join it back into
# a single string so the tokenizer performs its own WordPiece splitting instead
# of treating the word list as ready-made tokens.
texts = [" ".join(words) for words in preprocessed_sentences]

# Encode the whole batch in one call. truncation=True guarantees every row is
# exactly max_length long, so the result is a rectangular tensor.
encoded = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=64,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor([0, 1, 0, 1, 0])  # 0 for positive, 1 for negative (example labels)

# Fail early with a readable message if the hard-coded labels drift out of sync
# with the sentence list.
if len(labels) != len(texts):
    raise ValueError(
        f"Expected one label per sentence, got {len(labels)} labels for {len(texts)} sentences"
    )

# Create a DataLoader for batch processing
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=4)

# Define optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# is passed explicitly because PyTorch's default differs from the transformers
# implementation's default of 0.0.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0)
# NOTE: loss_fn is not used by the loop below; the model returns its own loss
# when `labels` is passed. The name is kept in case other cells rely on it.
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(epochs):
    # Batch variables get their own names so the full-dataset tensors
    # (input_ids, attention_masks, labels) are not overwritten.
    for batch_input_ids, batch_attention_masks, batch_labels in dataloader:
        optimizer.zero_grad()
        output = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = output.loss
        loss.backward()
        optimizer.step()


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch

# Evaluate the fine-tuned model and report its accuracy.
#
# This cell relies on `model` and `dataloader` from the training cell. The
# tokenization / stop-word preprocessing is not repeated here because
# evaluation only consumes `dataloader`.
#
# NOTE(review): `dataloader` wraps the same sentences the model was trained on,
# so the number printed below is training accuracy, not a held-out estimate.

# Evaluation
model.eval()
predictions = []
with torch.no_grad():
    # Batch variables get their own names so the full-dataset tensors from the
    # training cell are not overwritten.
    for batch_input_ids, batch_attention_masks, _ in dataloader:
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        predictions.extend(predicted_labels.tolist())

# Predictions come from argmax over the two logits, which for two classes is
# the same decision as thresholding the class probability at 0.5. `threshold`
# itself is not read anywhere in this cell; it is kept only so the name stays
# defined.
threshold = 0.5
sentiment_labels = ["positive" if pred == 0 else "negative" for pred in predictions]

# Ground truth labels for the sentences, in the same order as the dataset
ground_truth_labels = ["positive", "negative", "positive", "negative", "positive"]

# zip() would silently drop unmatched items, so check the lengths explicitly.
if len(sentiment_labels) != len(ground_truth_labels):
    raise ValueError(
        f"Got {len(sentiment_labels)} predictions for {len(ground_truth_labels)} ground truth labels"
    )
if not ground_truth_labels:
    raise ValueError("Cannot compute accuracy without any labelled sentences")

# Compare predicted sentiment labels to ground truth labels
correct_predictions = [1 if predicted == truth else 0 for predicted, truth in zip(sentiment_labels, ground_truth_labels)]

# Calculate accuracy
accuracy = sum(correct_predictions) / len(correct_predictions)
print(f"Accuracy: {accuracy:.2%}")

# Precision, recall and F1-score can be computed from the same
# `sentiment_labels` / `ground_truth_labels` pairs if needed.


Accuracy: 80.00%
