### Dependências

In [24]:
pip install transformers torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Imports

In [25]:
import os
import tarfile
import pandas as pd
from transformers import pipeline

### Dataset

In [26]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_dir = "datasets"
os.makedirs(dataset_dir, exist_ok=True)

file_name = os.path.join(dataset_dir, "aclImdb_v1.tar.gz")
if not os.path.exists(file_name):
    os.system(f"wget {url} -O {file_name}")

In [27]:
def read_imdb_data_from_tarfile(tar_file, dataset_type):
    data = {'review': [], 'sentiment': []}
    with tarfile.open(tar_file, "r:gz") as tar:
        for member in tar.getmembers():
            if f'aclImdb/{dataset_type}/' in member.name and member.name.endswith('.txt'):
                label = 'pos' if '/pos/' in member.name else 'neg'
                f = tar.extractfile(member)
                if f:
                    review = f.read().decode('utf-8')
                    review = '\n'.join([line for line in review.split('\n') if not line.startswith('http')])
                    if review.strip():
                        data['review'].append(review)
                        data['sentiment'].append(label)
    return pd.DataFrame(data)

# train_data = read_imdb_data_from_tarfile(file_name, 'train')
test_data = read_imdb_data_from_tarfile(file_name, 'test')

In [28]:
test_data

Unnamed: 0,review,sentiment
0,I love sci-fi and am willing to put up with a ...,neg
1,"Worth the entertainment value of a rental, esp...",neg
2,its a totally average film with a few semi-alr...,neg
3,STAR RATING: ***** Saturday Night **** Friday ...,neg
4,"First off let me say, If you haven't enjoyed a...",neg
...,...,...
24994,Just got around to seeing Monster Man yesterda...,pos
24995,I got this as part of a competition prize. I w...,pos
24996,I got Monster Man in a box set of three films ...,pos
24997,"Five minutes in, i started to feel how naff th...",pos


In [29]:
test_data['sentiment'].value_counts()

sentiment
neg    12500
pos    12499
Name: count, dtype: int64

## Modelo

In [30]:
test_size = 1000

### Pipeline

In [31]:
classifier = pipeline("sentiment-analysis", framework="pt")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [32]:
def preprocess_and_chunk_reviews(reviews, max_seq_length=512):
    processed_reviews = []
    for review in reviews:
        chunks = [review[i:i+max_seq_length] for i in range(0, len(review), max_seq_length)]
        processed_reviews.extend(chunks)
    
    return processed_reviews

def classify_reviews(reviews, classifier, max_seq_length=512):

    processed_reviews = preprocess_and_chunk_reviews(reviews, max_seq_length)    
    all_predictions = []

    for review_chunk in processed_reviews:
        predictions_chunk = classifier(review_chunk)        
        all_predictions.extend(predictions_chunk)
    
    return all_predictions


In [33]:
predictions = classify_reviews(test_data['review'][:test_size].tolist(), classifier)
predictions

[{'label': 'NEGATIVE', 'score': 0.9305744767189026},
 {'label': 'NEGATIVE', 'score': 0.9997639060020447},
 {'label': 'NEGATIVE', 'score': 0.999687671661377},
 {'label': 'POSITIVE', 'score': 0.9814736247062683},
 {'label': 'POSITIVE', 'score': 0.9839194416999817},
 {'label': 'NEGATIVE', 'score': 0.9982080459594727},
 {'label': 'NEGATIVE', 'score': 0.9994907379150391},
 {'label': 'NEGATIVE', 'score': 0.9996466636657715},
 {'label': 'NEGATIVE', 'score': 0.9836370348930359},
 {'label': 'POSITIVE', 'score': 0.9987999200820923},
 {'label': 'NEGATIVE', 'score': 0.9994495511054993},
 {'label': 'NEGATIVE', 'score': 0.9992795586585999},
 {'label': 'NEGATIVE', 'score': 0.9711073637008667},
 {'label': 'POSITIVE', 'score': 0.9882192611694336},
 {'label': 'POSITIVE', 'score': 0.9997252821922302},
 {'label': 'NEGATIVE', 'score': 0.9975519776344299},
 {'label': 'NEGATIVE', 'score': 0.9983616471290588},
 {'label': 'POSITIVE', 'score': 0.9998654127120972},
 {'label': 'NEGATIVE', 'score': 0.9992132186889

In [34]:
predicted_labels = [prediction['label'] for prediction in predictions]
predicted_labels[:10]

['NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE']

In [35]:
label_map = {'neg': 'NEGATIVE', 'pos': 'POSITIVE'}
test_data['sentiment'] = test_data['sentiment'][:test_size].map(label_map)
true_labels = test_data['sentiment'][:test_size].tolist()
true_labels[:10]

['NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE']

In [36]:
correct_predictions = 0
total_predictions = test_size
for pred, label in zip(predicted_labels, true_labels):
    if pred == label:
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
accuracy

0.7