In [None]:
# Install the Hugging Face `transformers` package that the cells below import from.
!pip install transformers



In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizerFast
import pandas as pd

class MovieReviewDataset(Dataset):
    """Map-style dataset that tokenizes one movie review per item.

    Args:
        reviews: Indexable collection of review texts; each entry is passed
            through ``str()`` before tokenization.
        targets: Indexable collection of labels aligned with ``reviews``;
            each entry is converted to a ``torch.long`` tensor.
        tokenizer: Object exposing ``encode_plus`` (this notebook passes a
            ``BertTokenizerFast``).
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and bundle it with its label.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened to 1-D tensors) and ``targets``
            (``torch.long`` tensor built from the label).
        """
        review = str(self.reviews[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding='max_length',
            # Truncation is requested explicitly; without it the tokenizer
            # warns on every call that `max_length` was given but truncation
            # was not activated.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # return_tensors='pt' adds a leading batch axis; flatten() drops it
            # so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    """Build a ``DataLoader`` over the ``review`` / ``sentiment`` columns of ``df``.

    Args:
        df: DataFrame providing ``review`` and ``sentiment`` columns.
        tokenizer: Tokenizer handed to ``MovieReviewDataset``.
        max_len: Maximum token length handed to ``MovieReviewDataset``.
        batch_size: Number of items per batch.

    Returns:
        A ``DataLoader`` using 4 worker processes.
    """
    # Reviews are cast to str and both columns become plain NumPy arrays,
    # so the dataset indexes them by position.
    review_texts = df.review.astype(str).to_numpy()
    labels = df.sentiment.to_numpy()

    dataset = MovieReviewDataset(review_texts, labels, tokenizer, max_len)
    return DataLoader(dataset, batch_size=batch_size, num_workers=4)

# CSV location and parsing options
file_path = 'rk.csv'
encoding = 'utf-8'  # change if the file is stored in another encoding
delimiter = ','     # field separator used by the CSV file

# Load the reviews
df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter)

# Binary labels: 'positive' -> 1, every other value -> 0
df.sentiment = df.sentiment.eq('positive').astype('int64')

# Tokenizer for the uncased BERT base checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Batching settings, then the loader itself
batch_size = 32
max_len = 128
data_loader = create_data_loader(df, tokenizer, max_len, batch_size)


(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

class MovieReviewDataset(Dataset):
    """Dataset yielding one tokenized movie review and its label per index.

    Args:
        reviews: Indexable collection of review texts (each is cast with
            ``str()`` before being tokenized).
        targets: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Object providing an ``encode_plus`` method (a
            ``BertTokenizerFast`` in this notebook).
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return how many reviews the dataset holds."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Encode the review at position ``idx``.

        Returns:
            dict with the raw ``review_text``, 1-D ``input_ids`` and
            ``attention_mask`` tensors, and ``targets`` as a ``torch.long``
            tensor.
        """
        review = str(self.reviews[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Current spelling of the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Ask for truncation explicitly so the tokenizer does not emit
            # its "Truncation was not explicitly activated" warning per item.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # Drop the batch axis added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    """Turn a review DataFrame into a batched ``DataLoader``.

    Args:
        df: DataFrame with ``review`` and ``sentiment`` columns.
        tokenizer: Tokenizer passed through to ``MovieReviewDataset``.
        max_len: Maximum token length passed through to ``MovieReviewDataset``.
        batch_size: Items per batch.

    Returns:
        ``DataLoader`` over a ``MovieReviewDataset``, using 4 workers.
    """
    dataset_kwargs = {
        'reviews': df.review.astype(str).to_numpy(),  # text column as str array
        'targets': df.sentiment.to_numpy(),           # label column as array
        'tokenizer': tokenizer,
        'max_len': max_len,
    }
    dataset = MovieReviewDataset(**dataset_kwargs)

    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
    return loader

def get_predictions(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and collect texts, predictions, labels.

    Args:
        model: Model whose forward call returns an object with ``logits``.
        data_loader: Loader yielding dict batches with ``input_ids``,
            ``attention_mask``, ``targets`` and ``review_text``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(review_texts, predictions, targets)`` of three lists, filled
        in iteration order; predictions and targets hold NumPy scalars.
    """
    model = model.eval()
    texts_seen, predicted_labels, true_labels = [], [], []

    # Inference only: gradient tracking is switched off for the whole loop.
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            logits = model(ids, attention_mask=mask).logits
            # Index of the largest logit per row is the predicted class.
            batch_prediction = torch.max(logits, dim=1).indices

            texts_seen += list(batch['review_text'])
            predicted_labels += list(batch_prediction.cpu().numpy())
            true_labels += list(labels.cpu().numpy())

    return texts_seen, predicted_labels, true_labels



# Read the labelled reviews
df = pd.read_csv('rk.csv')

# Encode sentiment as integers: 'positive' -> 1, anything else -> 0
df.sentiment = df.sentiment.eq('positive').astype('int64')

# Tokenizer matching the checkpoint loaded below
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Batching settings and a loader over the whole DataFrame
batch_size = 32
max_len = 128
val_data_loader = create_data_loader(df, tokenizer, max_len, batch_size)

# Load the checkpoint with a sequence-classification head.
# NOTE: no training happens in this cell, so the accuracy printed below
# describes the model exactly as loaded.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Collect predictions for every row served by the loader
val_review_texts, val_predictions, val_targets = get_predictions(
    model, val_data_loader, device
)

# Fraction of rows whose predicted label equals the true label
val_accuracy = accuracy_score(val_targets, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 0/752 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you 

Validation Accuracy: 0.4999





In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

# Function to create data loader
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Build a ``DataLoader`` from the ``review`` / ``sentiment`` columns of ``df``.

    Args:
        df: DataFrame with ``review`` and ``sentiment`` columns.
        tokenizer: Tokenizer passed to ``MovieReviewDataset``.
        max_len: Maximum token length passed to ``MovieReviewDataset``.
        batch_size: Items per batch.
        shuffle: Forwarded to ``DataLoader``; defaults to ``True``.

    Returns:
        ``DataLoader`` with 4 workers over the constructed dataset.
    """
    texts = df.review.astype(str).to_numpy()
    labels = df.sentiment.to_numpy()
    dataset = MovieReviewDataset(texts, labels, tokenizer, max_len)

    loader_options = {
        'batch_size': batch_size,
        'num_workers': 4,
        'shuffle': shuffle,
    }
    return DataLoader(dataset, **loader_options)

# Function to fine-tune and train the model
def train(model, train_dataloader, val_dataloader, device, epochs=3):
    """Fine-tune ``model`` and print validation accuracy after every epoch.

    Args:
        model: Sequence-classification model whose forward call returns an
            object with a ``logits`` attribute. It is not moved by this
            function, so it should already be on ``device``.
        train_dataloader: Loader yielding dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        val_dataloader: Loader handed to ``evaluate`` at the end of each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        None. Progress bars and per-epoch accuracy are written to stdout.
    """
    # torch.optim.AdamW is used instead of the AdamW class exported by
    # `transformers`, which that library has deprecated in favour of the
    # PyTorch implementation. eps and weight_decay are set explicitly to keep
    # the values the deprecated class applied by default (1e-6 and 0.0), since
    # PyTorch's own defaults differ.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
    )

    # Linear decay of the learning rate to zero over all optimizer steps,
    # with no warmup phase.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the start of every epoch.
        model.train()
        for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}/{epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['targets'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            # The schedule is advanced once per batch, matching total_steps.
            scheduler.step()

        # Evaluate on validation set
        val_accuracy = evaluate(model, val_dataloader, device)
        print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}')

# Function to evaluate the model
def evaluate(model, data_loader, device):
    """Return the accuracy of ``model`` over all batches of ``data_loader``.

    Args:
        model: Model whose forward call returns an object with ``logits``.
        data_loader: Loader yielding dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Accuracy as computed by ``accuracy_score`` on the collected labels.
    """
    model = model.eval()
    y_pred, y_true = [], []

    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            logits = model(ids, attention_mask=mask).logits
            # Predicted class = position of the largest logit in each row.
            batch_pred = torch.max(logits, dim=1).indices

            y_pred.extend(batch_pred.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    return accuracy_score(y_true, y_pred)

# Load your data
file_path = 'rk.csv'
encoding = 'utf-8'
delimiter = ','
df = pd.read_csv(file_path, encoding=encoding, delimiter=delimiter)
df.sentiment = df.sentiment.apply(lambda x: 1 if x == 'positive' else 0)

# Hold out a validation split. Building both loaders from the same rows would
# make the reported "validation" accuracy a measure of how well the model fits
# its own training data rather than how well it generalizes.
val_fraction = 0.2
split_seed = 42  # fixed seed so the split is reproducible across runs
train_df = df.sample(frac=1 - val_fraction, random_state=split_seed)
# Every row not drawn for training becomes validation data; this relies on the
# default unique row index produced by read_csv above.
val_df = df.drop(train_df.index)

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Create data loaders (training batches are shuffled, validation ones are not)
batch_size = 32
max_len = 128
train_data_loader = create_data_loader(train_df, tokenizer, max_len, batch_size)
val_data_loader = create_data_loader(val_df, tokenizer, max_len, batch_size, shuffle=False)

# Initialize your BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune and train the model
train(model, train_data_loader, val_data_loader, device, epochs=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3:   0%|          | 0/1563 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the token

Epoch 1/3, Validation Accuracy: 0.9406


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch 2/3, Validation Accuracy: 0.9748


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Epoch 3/3, Validation Accuracy: 0.9884



