In [1]:
import pandas as pd

# Read the training split from disk; the bare name on the last line makes
# the notebook render the resulting frame as this cell's output.
df_train_sentiment = pd.read_csv(filepath_or_buffer="all_train_sentiment.csv")
df_train_sentiment

Unnamed: 0,description,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1.0
1,Homelessness (or Houselessness as George Carli...,1.0
2,Brilliant over-acting by Lesley Ann Warren. Be...,1.0
3,This is easily the most underrated film inn th...,1.0
4,This is not the typical Mel Brooks film. It wa...,1.0
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0.0
24996,This is the kind of movie that my enemies cont...,0.0
24997,I saw 'Descent' last night at the Stockholm Fi...,0.0
24998,Some films that you pick up for a pound turn o...,0.0


In [2]:
# Read the held-out split from disk; the bare name on the last line makes
# the notebook render the resulting frame as this cell's output.
df_test_sentiment = pd.read_csv(filepath_or_buffer="all_test_sentiment.csv")
df_test_sentiment

Unnamed: 0,description,sentiment
0,I went and saw this movie last night after bei...,1.0
1,Actor turned director Bill Paxton follows up h...,1.0
2,As a recreational golfer with some knowledge o...,1.0
3,"I saw this film in a sneak preview, and it is ...",1.0
4,Bill Paxton has taken the true story of the 19...,1.0
...,...,...
24995,I occasionally let my kids watch this garbage ...,0.0
24996,When all we have anymore is pretty much realit...,0.0
24997,The basic genre is a thriller intercut with an...,0.0
24998,Four things intrigued me as to this film - fir...,0.0


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per index.

    Args:
        data: Table exposing ``'description'`` (text) and ``'sentiment'``
            (numeric label) columns that support positional ``.iloc`` access.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``
            whose result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (first row of the
            tokenizer output, i.e. without the leading batch dimension) and
            ``'labels'`` (0-d float32 tensor).
        """
        # Read the two columns directly rather than materialising the whole row:
        # a row of a mixed-dtype frame is rebuilt as an object Series on every
        # access, which is wasted work repeated for each sample.
        text = self.data['description'].iloc[idx]
        label = self.data['sentiment'].iloc[idx]
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0],
            # Pin the dtype so the label is float32 whatever the column dtype is
            # (an integer-labelled frame would otherwise produce int64 labels).
            'labels': torch.tensor(float(label), dtype=torch.float32),
        }

In [5]:
# Training configuration read by the cells below
max_length = 128       # length passed to the tokenizer for padding/truncation
batch_size = 100       # samples per DataLoader batch
epochs = 5             # number of passes over the training loader
learning_rate = 1e-4   # step size handed to the optimizer

In [6]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and a sequence-classification model configured
# with a single output (num_labels=1), then move the model onto the device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Wrap each split in a CustomDataset, then batch it.
# Only the training loader shuffles; the test loader keeps row order.
train_dataset = CustomDataset(data=df_train_sentiment, tokenizer=tokenizer, max_length=max_length)
test_dataset = CustomDataset(data=df_test_sentiment, tokenizer=tokenizer, max_length=max_length)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Loss criterion and optimizer.
# reduction='none' keeps one loss value per element; the training loop sums them itself.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [9]:
# Training loop
# Every epoch makes one optimisation pass over train_loader and then scores the
# model on test_loader (reported below as "Validation").
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Add a trailing dimension and cast to float so the targets can be
        # compared element-wise with the logits by loss_fn.
        labels = batch['labels'].unsqueeze(1).float().to(device)  # or labels = batch['labels'].view(-1,1).float().to(device)
        # The labels are intentionally NOT passed to the model. Passing them makes
        # the model compute its own built-in loss on every step, which was never
        # used: the loss that is backpropagated is the one computed by loss_fn below.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # loss_fn is unreduced, so this is the loss summed over the batch.
        loss = loss_fn(logits, labels).sum()
        print(f'Epoch {epoch+1}/{epochs}, Batch {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}')
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch summed losses (not a per-sample mean).
    average_train_loss = total_loss/len(train_loader)

    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Training Loss: {average_train_loss:.4f}')

    # Validation
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].unsqueeze(1).float().to(device)  # or labels = batch['labels'].view(-1,1).float().to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels).sum()
            test_loss += loss.item()

            # Compute binary predictions: sigmoid probability above 0.5 counts as class 1
            predicted = (torch.sigmoid(logits)>0.5).float()
            total += labels.size(0)
            correct += (predicted==labels).sum().item()

    # Mean of the per-batch summed losses, and the fraction of correct predictions.
    average_test_loss = test_loss/len(test_loader)
    test_accuracy = correct/total

    print(f'Validation Loss: {average_test_loss:.4f}')
    print(f'Validation Accuracy: {test_accuracy:.4f}\n')

Epoch 1/5, Batch 1/250, Loss: 71.6219
Epoch 1/5, Batch 2/250, Loss: 70.9527
Epoch 1/5, Batch 3/250, Loss: 69.6151
Epoch 1/5, Batch 4/250, Loss: 71.2939
Epoch 1/5, Batch 5/250, Loss: 68.7957
Epoch 1/5, Batch 6/250, Loss: 66.9245
Epoch 1/5, Batch 7/250, Loss: 68.0160
Epoch 1/5, Batch 8/250, Loss: 63.3291
Epoch 1/5, Batch 9/250, Loss: 59.2905
Epoch 1/5, Batch 10/250, Loss: 54.3646
Epoch 1/5, Batch 11/250, Loss: 50.4664
Epoch 1/5, Batch 12/250, Loss: 44.4880
Epoch 1/5, Batch 13/250, Loss: 79.1987
Epoch 1/5, Batch 14/250, Loss: 47.9829
Epoch 1/5, Batch 15/250, Loss: 102.4607
Epoch 1/5, Batch 16/250, Loss: 50.1724
Epoch 1/5, Batch 17/250, Loss: 44.3480
Epoch 1/5, Batch 18/250, Loss: 52.0231
Epoch 1/5, Batch 19/250, Loss: 60.6892
Epoch 1/5, Batch 20/250, Loss: 43.4770
Epoch 1/5, Batch 21/250, Loss: 44.0933
Epoch 1/5, Batch 22/250, Loss: 45.9877
Epoch 1/5, Batch 23/250, Loss: 49.4623
Epoch 1/5, Batch 24/250, Loss: 41.2771
Epoch 1/5, Batch 25/250, Loss: 40.2140
Epoch 1/5, Batch 26/250, Loss: 39

In [10]:
# Separate each frame into its text column (inputs) and its label column (targets)
X_train, y_train = df_train_sentiment["description"], df_train_sentiment["sentiment"]
X_test, y_test = df_test_sentiment["description"], df_test_sentiment["sentiment"]

In [11]:
def predict_sentiment(X, batch_size=32):
    """Predict a binary label for every text in ``X``.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``max_length``.

    Args:
        X: Iterable of texts (e.g. a list or a pandas Series).
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        list of floats, one per input text in order: ``1.0`` when the sigmoid
        of the model's logit exceeds 0.5, otherwise ``0.0``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(X)
    res = []
    # Inference must not depend on whatever mode the model was left in: switch to
    # eval mode for the duration of the call and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for prediction; disabling autograd avoids
        # recording a graph for every forward pass.
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                chunk = texts[start:start + batch_size]
                # Tokenize the whole chunk at once; every row is padded/truncated to max_length
                inputs = tokenizer(chunk, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt').to(device)
                logits = model(**inputs).logits
                # Threshold the probabilities and flatten to one label (1 or 0) per text
                res.extend((torch.sigmoid(logits) > 0.5).float().reshape(-1).tolist())
    finally:
        model.train(was_training)
    return res

In [12]:
# Predict a label for every training text; these are compared with y_train below
y_train_pred = predict_sentiment(X_train)

In [13]:
# Predict a label for every test text; these are compared with y_test below
y_test_pred = predict_sentiment(X_test)

In [14]:
from sklearn.metrics import classification_report

In [15]:
# Classification report for the training split: true labels vs. predicted labels
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     12500
         1.0       0.98      1.00      0.99     12500

    accuracy                           0.99     25000
   macro avg       0.99      0.99      0.99     25000
weighted avg       0.99      0.99      0.99     25000



In [16]:
# Classification report for the test split: true labels vs. predicted labels
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.81      0.86     12500
         1.0       0.83      0.92      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [17]:
# Define function to perform sentiment analysis
def get_sentiment(text):
    """Classify a single text as ``'positive'`` or ``'negative'``.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``max_length``.

    Args:
        text: The text to classify.

    Returns:
        ``'positive'`` when the sigmoid of the model's logit exceeds 0.5,
        otherwise ``'negative'``.
    """
    # Tokenize input text
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt').to(device)
    # Use model for inference: force eval mode and disable autograd so the result
    # does not depend on the mode the model was left in and no graph is recorded.
    # The previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)
    # Reduce the single thresholded value to a plain Python bool before branching
    is_positive = bool((torch.sigmoid(logits) > 0.5).item())
    # Return predicted sentiment label
    return 'positive' if is_positive else 'negative'

In [18]:
# Example usage: classify one sentence with get_sentiment and print the label
text = "I love this product! It's amazing!"
sentiment = get_sentiment(text)
print(sentiment) # prints the label without quotes, e.g. positive

positive
