<a href="https://colab.research.google.com/github/LarissaRandow/IMBD/blob/main/IMBD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch pandas tqdm

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvzf aclImdb_v1.tar.gz

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Load the IMDB reviews stored under a split directory into a DataFrame
def load_imdb_data(data_dir, sample_size=None):
    """Read the labelled reviews under ``data_dir`` into a DataFrame.

    Args:
        data_dir: Directory containing ``pos`` and ``neg`` sub-directories,
            each holding one UTF-8 text file per review.
        sample_size: Optional number of rows to draw at random with a fixed
            seed (42). ``None`` or any other falsy value returns every review.

    Returns:
        pandas.DataFrame with columns ``text`` (file contents) and ``label``
        (1 for ``pos``, 0 for ``neg``). Rows are ordered ``pos`` first, then
        ``neg``, each sorted by file name, unless a sample is drawn, in which
        case the sampled rows are re-indexed from 0.

    Raises:
        FileNotFoundError: If the ``pos`` or ``neg`` directory does not exist.
        ValueError: If ``sample_size`` is larger than the number of reviews
            (raised by ``DataFrame.sample``).
    """
    data = {'text': [], 'label': []}
    for label in ['pos', 'neg']:
        labeled_dir = os.path.join(data_dir, label)
        label_id = 1 if label == 'pos' else 0
        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting fixes the row order, which is what makes the seeded sample
        # below pick the same reviews on every machine.
        for review in sorted(os.listdir(labeled_dir)):
            with open(os.path.join(labeled_dir, review), 'r', encoding='utf-8') as file:
                data['text'].append(file.read())
            data['label'].append(label_id)
    df = pd.DataFrame(data)
    if sample_size:
        df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    return df

# Root directory of the extracted dataset and the number of reviews sampled per split
data_dir = 'aclImdb'
sample_size = 500

# Load the same-sized sample from the train and test splits (train first, then test)
train_data, test_data = [
    load_imdb_data(os.path.join(data_dir, split_name), sample_size=sample_size)
    for split_name in ('train', 'test')
]

# Sentiment-analysis pipeline with an explicitly named model, plus the matching tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_pipeline = pipeline('sentiment-analysis', model=model_name)

# Shorten a text to a maximum number of tokens
def truncate_text(text, tokenizer, max_length=512):
    """Return ``text`` after an encode/decode round trip capped at ``max_length``.

    Args:
        text: Input string.
        tokenizer: Object exposing ``encode(text, truncation=..., max_length=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        max_length: Length limit passed to ``encode`` (default 512).

    Returns:
        The string produced by decoding the truncated ids with special
        tokens skipped.
    """
    return tokenizer.decode(
        tokenizer.encode(text, truncation=True, max_length=max_length),
        skip_special_tokens=True,
    )

# Sentiment analysis over the reviews
def analyze_sentiment(data):
    """Run the sentiment pipeline on every review in ``data['text']``.

    Args:
        data: DataFrame (or mapping) whose ``text`` entry is an iterable of
            review strings.

    Returns:
        A list with one prediction per review, in input order. Each item is
        the first element of the pipeline's result for that review.
    """
    sentiments = []
    for text in tqdm(data['text'], desc='Analyzing Sentiments'):
        # Let the pipeline truncate during its own tokenization. Decoding
        # truncated ids back to text and tokenizing that text again costs a
        # second tokenization pass, is lossy, and does not guarantee the
        # re-tokenized input stays within the length limit.
        # 512 matches truncate_text's default max_length.
        result = sentiment_pipeline(text, truncation=True, max_length=512)
        sentiments.append(result[0])
    return sentiments

# Score both splits (train first, then test) and keep the raw predictions in a 'sentiment' column
for split_frame in (train_data, test_data):
    split_frame['sentiment'] = analyze_sentiment(split_frame)

# Flatten the raw predictions into separate label and score columns
def transform_sentiments(data):
    """Replace the ``sentiment`` column with ``sentiment_label`` and ``sentiment_score``.

    Each ``sentiment`` entry is indexed by ``'label'`` and ``'score'``.
    ``sentiment_label`` is 1 when the label equals ``'POSITIVE'`` and 0
    otherwise; ``sentiment_score`` is the ``'score'`` value.

    Args:
        data: DataFrame with a ``sentiment`` column. It is modified in place.

    Returns:
        None.
    """
    def to_binary(prediction):
        if prediction['label'] == 'POSITIVE':
            return 1
        return 0

    def to_score(prediction):
        return prediction['score']

    predictions = data['sentiment']
    data['sentiment_label'] = predictions.apply(to_binary)
    data['sentiment_score'] = predictions.apply(to_score)
    # The raw predictions are no longer needed once both columns exist
    data.drop(columns=['sentiment'], inplace=True)

# Expand the raw predictions of both splits (train first, then test); each frame is modified in place
for split_frame in (train_data, test_data):
    transform_sentiments(split_frame)

# Model evaluation
from sklearn.metrics import accuracy_score, classification_report

# Accuracy of the predicted labels against the dataset labels, per split
train_accuracy = accuracy_score(y_true=train_data['label'], y_pred=train_data['sentiment_label'])
test_accuracy = accuracy_score(y_true=test_data['label'], y_pred=test_data['sentiment_label'])

# Summary lines, followed by the per-class report for the test split
print(
    f'Train Accuracy: {train_accuracy:.4f}',
    f'Test Accuracy: {test_accuracy:.4f}',
    'Classification Report (Test Data):',
    sep='\n',
)
print(classification_report(y_true=test_data['label'], y_pred=test_data['sentiment_label']))


Analyzing Sentiments: 100%|██████████| 500/500 [03:42<00:00,  2.24it/s]
Analyzing Sentiments: 100%|██████████| 500/500 [03:47<00:00,  2.20it/s]

Train Accuracy: 0.8720
Test Accuracy: 0.8940
Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       234
           1       0.94      0.86      0.90       266

    accuracy                           0.89       500
   macro avg       0.90      0.90      0.89       500
weighted avg       0.90      0.89      0.89       500




