In [None]:
!pip install transformers datasets torch scikit-learn
#!pip install numpy==1.23.4
!pip install numpy==1.26.4



In [None]:
import pandas as pd
# Read the sampled IMDB reviews spreadsheet and preview its first rows.
dataset_path = "/content/IMDB_Dataset_sample.xlsx"
data = pd.read_excel(dataset_path)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Convert sentiment strings to numerical labels (1 = positive, 0 = negative).
# Values that are already 0/1 pass through unchanged, so re-running this cell
# does not turn the column into NaN the way a plain dict `.map` would.
label_to_id = {"positive": 1, "negative": 0}
sentiment_ids = data["sentiment"].map(lambda value: label_to_id.get(value, value))

# Fail loudly on anything that is not a known label instead of silently
# training on NaN / unexpected values.
unknown_mask = ~sentiment_ids.isin([0, 1])
if unknown_mask.any():
    unknown_values = sorted({str(value) for value in data.loc[unknown_mask, "sentiment"]})
    raise ValueError(f"Unexpected sentiment values: {unknown_values}")
data["sentiment"] = sentiment_ids.astype("int64")

# Split the data into train and test sets (80/20, fixed seed for reproducibility)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format.
# The shuffled DataFrame index is carried along as the `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Show the training split's summary (feature names and row count).
train_dataset

Dataset({
    features: ['review', 'sentiment', '__index_level_0__'],
    num_rows: 891
})

In [None]:
# Show the test split's summary (feature names and row count).
test_dataset

Dataset({
    features: ['review', 'sentiment', '__index_level_0__'],
    num_rows: 223
})

In [None]:
# Peek at the first five training examples (slicing returns a dict of column -> values).
train_dataset[:5]

{'review': ['This film was pretty good. I am not too big a fan of baseball, but this is a movie that was made to help understand the meaning of love, determination, heart, etc.<br /><br />Danny Glover, Joseph Gordon-Levitt, Brenda Fricker, Christopher Lloyd, Tony Danza, and Milton Davis Jr. are brought in with a variety of talented actors and understanding of the sport. The plot was believable, and I love the message. William Dear and the guys put together a great movie.<br /><br />Most sports films revolve around true stories or events, and they often do not work well. But this film hits a 10 on the perfectness scale, even though there were a few minor mistakes here and there.<br /><br />10/10',
  'How do you take a cast of experienced, well-known actors, and put together such a stupid movie? Nimrod Antel has the answer: Armored. Six co-workers at an armored car business decide to steal a large shipment of cash themselves. But, just as they get to first base with their plans, everythi

In [None]:
from transformers import BertTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``review`` field of a batch of examples.

    Each review is truncated and padded to a fixed length of 128 tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["review"], **encoding_options)


# Run the tokenizer over both splits in batched mode, then show the training split
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)
train_dataset

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'sentiment', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 891
})

In [None]:
# Drop the raw text and the leftover pandas index; neither is passed to the model
unused_columns = ["review", "__index_level_0__"]
train_dataset = train_dataset.remove_columns(unused_columns)
test_dataset = test_dataset.remove_columns(unused_columns)
train_dataset

Dataset({
    features: ['sentiment', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 891
})

In [None]:
# Rename the target column from "sentiment" to "labels" in both splits
train_dataset, test_dataset = (
    split.rename_column("sentiment", "labels")
    for split in (train_dataset, test_dataset)
)

In [None]:
# Have both splits return PyTorch tensors for the selected columns
torch_columns = ['input_ids', 'attention_mask', 'labels']
for dataset_split in (train_dataset, test_dataset):
    dataset_split.set_format(type='torch', columns=torch_columns)
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 891
})

In [None]:
import numpy as np
# Peek at the first five formatted examples; columns now come back as torch tensors.
train_dataset[:5]

{'labels': tensor([1, 0, 0, 0, 1]),
 'input_ids': tensor([[  101,  2023,  2143,  2001,  3492,  2204,  1012,  1045,  2572,  2025,
           2205,  2502,  1037,  5470,  1997,  3598,  1010,  2021,  2023,  2003,
           1037,  3185,  2008,  2001,  2081,  2000,  2393,  3305,  1996,  3574,
           1997,  2293,  1010,  9128,  1010,  2540,  1010,  4385,  1012,  1026,
           7987,  1013,  1028,  1026,  7987,  1013,  1028,  6266, 20012,  1010,
           3312,  5146,  1011, 11902,  4779,  1010, 15507, 10424,  6799,  2121,
           1010,  5696,  6746,  1010,  4116,  4907,  4143,  1010,  1998,  9660,
           4482,  3781,  1012,  2024,  2716,  1999,  2007,  1037,  3528,  1997,
          10904,  5889,  1998,  4824,  1997,  1996,  4368,  1012,  1996,  5436,
           2001, 19337,  2666, 12423,  1010,  1998,  1045,  2293,  1996,  4471,
           1012,  2520,  6203,  1998,  1996,  4364,  2404,  2362,  1037,  2307,
           3185,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013

In [None]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a newly initialised 2-class head (binary sentiment)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments.
# Checkpointing note: the previous `save_steps=1000` never triggered because the
# whole run is only a few hundred optimizer steps, so no checkpoint was written
# and the final weights were always those of the last epoch. Saving once per
# epoch and reloading the best checkpoint keeps the epoch with the lowest
# validation loss instead.
training_args = TrainingArguments(
    output_dir="./results",             # directory where checkpoints are written
    eval_strategy="epoch",              # evaluate on the eval set after every epoch
    save_strategy="epoch",              # checkpoint after every epoch (must match eval_strategy)
    load_best_model_at_end=True,        # restore the best checkpoint when training finishes
    metric_for_best_model="eval_loss",  # "best" = lowest validation loss
    greater_is_better=False,
    save_total_limit=1,                 # limit disk usage; the best checkpoint is always retained
    learning_rate=2e-5,                 # optimizer learning rate
    per_device_train_batch_size=8,      # training batch size per device (GPU/CPU)
    per_device_eval_batch_size=8,       # evaluation batch size per device (GPU/CPU)
    num_train_epochs=3,                 # number of passes over the training set
    weight_decay=0.01,                  # weight-decay regularization
    logging_dir="./logs",               # directory where logs are written
    logging_steps=10,                   # log the training loss every 10 steps
)
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: The evaluation result handed over by ``Trainer``; it unpacks
            into ``(logits, labels)`` where ``logits`` has one score per class.

    Returns:
        dict: ``{"accuracy": <fraction of correctly classified examples>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predicted_ids == labels))}


# Without `compute_metrics` the Trainer only reports the loss during evaluation;
# passing it adds `eval_accuracy` to every evaluation result.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer

<transformers.trainer.Trainer at 0x79faac7f9f10>

In [None]:
# Fine-tune the model; the per-epoch loss table and the final TrainOutput are displayed below.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3453,0.356693
2,0.2768,0.358328
3,0.0863,0.393548


TrainOutput(global_step=336, training_loss=0.3198170621125471, metrics={'train_runtime': 81.4235, 'train_samples_per_second': 32.828, 'train_steps_per_second': 4.127, 'total_flos': 175823962744320.0, 'train_loss': 0.3198170621125471, 'epoch': 3.0})

In [None]:
# Evaluate on the eval dataset given to the Trainer and print the resulting metrics dict.
results = trainer.evaluate()
print(results)


{'eval_loss': 0.39354848861694336, 'eval_runtime': 1.6054, 'eval_samples_per_second': 138.908, 'eval_steps_per_second': 17.441, 'epoch': 3.0}


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
model_dir = "./sentiment_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reload the fine-tuned model and tokenizer from the directory they were saved to
saved_model_dir = "./sentiment_model"
model = BertForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)

In [None]:
import numpy as np

# Step-by-step inference on a single review, printing each intermediate result.
review = "I really not like this movie but The story was good!"
inputs = tokenizer(
        review,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )

print(inputs)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # make sure dropout is disabled for inference
inputs = {key: value.to(device) for key, value in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    print(outputs)

# Take the argmax over the class dimension so the result is one class id per
# input row; without `dim` the index refers to the flattened logits and is only
# meaningful for a batch of exactly one review.
torch.argmax(outputs.logits, dim=-1)

{'input_ids': tensor([[ 101, 1045, 2428, 2025, 2066, 2023, 3185, 2021, 1996, 2466, 2001, 2204,
          999,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-2.6301,  1.7865]], device='cuda:0'), hidden_states=None, attentions=None)


tensor(1, device='cuda:0')

In [None]:
def predict_sentiment(review, model, tokenizer, max_length=256, device=None):
    """Predict the sentiment of one review or of a batch of reviews.

    Args:
        review: A single review string, or a list/tuple of review strings.
        model: Sequence-classification model whose output exposes ``.logits``;
            class index 1 is read as positive, anything else as negative.
        tokenizer: Tokenizer matching ``model``; it is called with
            ``return_tensors="pt"`` and must return a mapping of tensors.
        max_length: Maximum number of tokens kept per review before truncation
            (defaults to 256, the value previously hard-coded here).
        device: Device to run on. ``None`` keeps the previous behaviour:
            CUDA when available, otherwise CPU.

    Returns:
        ``"positive"`` or ``"negative"`` when ``review`` is a string; a list of
        those labels (one per review, in order) when it is a list/tuple.
    """
    is_batch = isinstance(review, (list, tuple))
    texts = list(review) if is_batch else review
    if is_batch and not texts:
        # Nothing to classify; avoid calling the tokenizer with an empty batch.
        return []

    # Tokenize the input review(s); padding aligns the lengths within a batch
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )

    # Put the model and the input tensors on the same device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference must run in eval mode, otherwise dropout makes predictions
    # non-deterministic when a model still in training mode is passed in.
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform prediction: one class id per input row
    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

    # Map class ids to sentiment labels
    labels = ["positive" if class_id == 1 else "negative" for class_id in predictions.tolist()]
    return labels if is_batch else labels[0]


In [None]:
# Classify a sample review with predict_sentiment and print the review next to its predicted label.
review = "I really loved this movie."
sentiment = predict_sentiment(review, model, tokenizer)
print(f"Review: {review}")
print(f"Predicted Sentiment: {sentiment}")

Review: I really loved this movie.
Predicted Sentiment: positive


In [None]:
# Classify a second sample review (negated phrasing) and print it next to its predicted label.
review = "I really didnt loved this movie.The story was not  good."
sentiment = predict_sentiment(review, model, tokenizer)
print(f"Review: {review}")
print(f"Predicted Sentiment: {sentiment}")

Review: I really didnt loved this movie.The story was not  good.
Predicted Sentiment: negative
