## Install dependencies

In [None]:
%pip install datasets
%pip install pandas
%pip install transformers
%pip install torch
%pip install transformers

Set up the pipeline

In [2]:
from transformers import pipeline
import torch
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the sentiment-analysis pipeline on the detected device. Passing `device`
# (rather than a hard-coded GPU index 0) keeps this cell runnable on machines
# without CUDA.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    batch_size=4,
    device=device,
    max_length=512,
    truncation=True,
)


  from .autonotebook import tqdm as notebook_tqdm


Set up the test dataset, clean it, and concatenate the text columns

In [3]:
from transformers.pipelines.pt_utils import KeyDataset
import datasets

# Load the "test" split of amazon_polarity, drop its "label" column, and add a
# "text" column holding the title and the content joined by a single space.
test_dataset = (
    datasets.load_dataset("amazon_polarity", split="test")
    .remove_columns("label")
    .map(lambda row: {"text": row["title"] + " " + row["content"]})
)

print(test_dataset)


Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 400000
})


Perform sentiment analysis on the test dataset

In [4]:

# map_dataset appends the raw pipeline output of every processed batch here.
test_results = []

# Switch the dataset's output format to torch, forwarding `device`.
test_dataset = test_dataset.with_format(type="torch", device=device)

def map_dataset(examples):
    """Run the sentiment pipeline over one batch of rows.

    The values under ``examples["text"]`` are passed to ``sentiment_pipeline``.
    The pipeline output is appended (one entry per call) to the module-level
    ``test_results`` list and also returned under the ``"label"`` key.
    """
    predictions = sentiment_pipeline(
        examples["text"],
        batch_size=4,
        truncation=True,
    )
    test_results.append(predictions)
    return {"label": predictions}


# Show the first row before inference for reference.
print(test_dataset[0])

# Feed the dataset through map_dataset in batches of 128 rows; the returned
# "label" entry becomes a new column of the resulting dataset.
encoded_test_dataset = test_dataset.map(
    function=map_dataset,
    batched=True,
    batch_size=128,
)

# Show the same row again, now with the prediction attached.
print(encoded_test_dataset[0])


{'title': 'Great CD', 'content': 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"', 'text': 'Great CD My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play

Map: 100%|██████████| 400000/400000 [35:26<00:00, 188.09 examples/s]

{'title': 'Great CD', 'content': 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"', 'text': 'Great CD My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play




Clean the final test dataset

In [6]:
def label_sentiment(sent):
    """Translate a star-rating prediction into a coarse sentiment name.

    Args:
        sent: Mapping whose ``'label'`` entry is the predicted rating string.

    Returns:
        ``'Perfect'`` for ``'5 stars'``, ``'Satisfied'`` for ``'4 stars'``,
        ``'Neutral'`` for ``'3 stars'``, ``'Negative'`` for ``'2 stars'`` or
        ``'1 star'``, and ``'Unknown'`` for any other value.
    """
    rating = sent['label']
    # Checked in order with ==, mirroring an if/elif chain.
    rating_names = (
        ('5 stars', 'Perfect'),
        ('4 stars', 'Satisfied'),
        ('3 stars', 'Neutral'),
        ('2 stars', 'Negative'),
        ('1 star', 'Negative'),
    )
    for stars, name in rating_names:
        if rating == stars:
            return name
    return 'Unknown'
    
def map_dataset_new_label(examples):
    """Build the output row for one example from its pipeline prediction.

    Args:
        examples: A single row with ``"title"``, ``"content"``, ``"text"`` and a
            ``"label"`` mapping that holds the predicted ``"label"`` string and
            a ``"score"`` object exposing ``.item()``.

    Returns:
        A dict with the three text fields copied through, plus ``"sentiment"``
        (the name returned by ``label_sentiment``) and ``"score"`` (the value
        returned by ``.item()``).
    """
    prediction = examples["label"]
    # Map the predicted star rating to its sentiment name.
    sentiment_name = label_sentiment(prediction)
    # Unwrap the score object into the value returned by .item().
    confidence = prediction["score"].item()
    return {
        "title": examples["title"],
        "content": examples["content"],
        "text": examples["text"],
        "sentiment": sentiment_name,
        "score": confidence,
    }

# Add the "sentiment" and "score" columns row by row, then show the first row.
encoded_test_dataset = encoded_test_dataset.map(function=map_dataset_new_label)
print(encoded_test_dataset[0])

Map: 100%|██████████| 400000/400000 [02:21<00:00, 2820.09 examples/s]

{'title': 'Great CD', 'content': 'My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"', 'text': 'Great CD My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play




Save the test dataset

In [7]:
# Materialise the labelled test dataset as a pandas DataFrame ...
test_df = encoded_test_dataset.to_pandas()

# ... and write it to a CSV file, leaving out the DataFrame index.
test_df.to_csv(path_or_buf="encoded_test_dataset.csv", index=False)

Train dataset: repeat the same process

In [10]:
from transformers.pipelines.pt_utils import KeyDataset
import datasets

# Load the "train" split of amazon_polarity, drop its "label" column, and add a
# "text" column holding the title and the content joined by a single space.
dataset = (
    datasets.load_dataset("amazon_polarity", split="train")
    .remove_columns("label")
    .map(lambda row: {"text": row["title"] + " " + row["content"]})
)

# Printing the dataset reports its features and row count, so no separate
# len() call is needed.
print(dataset)


Map: 100%|██████████| 3600000/3600000 [01:42<00:00, 35027.97 examples/s]

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 3600000
})





In [11]:
# map_dataset appends the raw pipeline output of every processed batch here.
results = []

# Switch the dataset's output format to torch, forwarding `device`.
dataset = dataset.with_format(type="torch", device=device)

def map_dataset(examples):
    """Run the sentiment pipeline over one batch of rows.

    The values under ``examples["text"]`` are passed to ``sentiment_pipeline``.
    The pipeline output is appended (one entry per call) to the module-level
    ``results`` list and also returned under the ``"label"`` key.
    """
    predictions = sentiment_pipeline(
        examples["text"],
        batch_size=4,
        truncation=True,
    )
    results.append(predictions)
    return {"label": predictions}

# Show the first row before inference for reference.
print(dataset[0])

# Feed the dataset through map_dataset in batches (default batch size); the
# returned "label" entry becomes a new column of the resulting dataset.
encoded_dataset = dataset.map(
    function=map_dataset,
    batched=True,
)

# Show the same row again, now with the prediction attached.
print(encoded_dataset[0])

{'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'text': 'Stuning even for the non-gamer This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}


Map: 100%|██████████| 3600000/3600000 [5:19:41<00:00, 187.68 examples/s]  

{'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'text': 'Stuning even for the non-gamer This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'label': {'label': '5 stars', 'score': tensor(0.7845, device='cuda:0')}}





In [12]:
def label_sentiment(sent):
    """Translate a star-rating prediction into a coarse sentiment name.

    Args:
        sent: Mapping whose ``'label'`` entry is the predicted rating string.

    Returns:
        ``'Perfect'`` for ``'5 stars'``, ``'Satisfied'`` for ``'4 stars'``,
        ``'Neutral'`` for ``'3 stars'``, ``'Negative'`` for ``'2 stars'`` or
        ``'1 star'``, and ``'Unknown'`` for any other value.
    """
    rating = sent['label']
    # Checked in order with ==, mirroring an if/elif chain.
    rating_names = (
        ('5 stars', 'Perfect'),
        ('4 stars', 'Satisfied'),
        ('3 stars', 'Neutral'),
        ('2 stars', 'Negative'),
        ('1 star', 'Negative'),
    )
    for stars, name in rating_names:
        if rating == stars:
            return name
    return 'Unknown'
    
def map_dataset_new_label(examples):
    """Build the output row for one example from its pipeline prediction.

    Args:
        examples: A single row with ``"title"``, ``"content"``, ``"text"`` and a
            ``"label"`` mapping that holds the predicted ``"label"`` string and
            a ``"score"`` object exposing ``.item()``.

    Returns:
        A dict with the three text fields copied through, plus ``"sentiment"``
        (the name returned by ``label_sentiment``) and ``"score"`` (the value
        returned by ``.item()``).
    """
    prediction = examples["label"]
    # Map the predicted star rating to its sentiment name.
    sentiment_name = label_sentiment(prediction)
    # Unwrap the score object into the value returned by .item().
    confidence = prediction["score"].item()
    return {
        "title": examples["title"],
        "content": examples["content"],
        "text": examples["text"],
        "sentiment": sentiment_name,
        "score": confidence,
    }

# Add the "sentiment" and "score" columns row by row, then show the first row.
encoded_dataset = encoded_dataset.map(function=map_dataset_new_label)
print(encoded_dataset[0])

Map: 100%|██████████| 3600000/3600000 [21:44<00:00, 2760.50 examples/s]

{'title': 'Stuning even for the non-gamer', 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'text': 'Stuning even for the non-gamer This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^', 'label': {'label': '5 stars', 'score': tensor(0.7845, device='cuda:0')}, 'sentiment': 'Perfect', 'score': tens




In [13]:
# Materialise the labelled train dataset as a pandas DataFrame ...
df = encoded_dataset.to_pandas()

# ... and write it to a CSV file, leaving out the DataFrame index.
df.to_csv(path_or_buf="encoded_dataset.csv", index=False)