## Questão 5

In [61]:
import nltk
import pandas as pd
import re
import spacy

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Carregar dataset
reviews_train = load_files("../db/aclImdb_v1/aclImdb/train", categories=['pos', 'neg'], encoding='utf-8')
texts, labels = reviews_train.data, reviews_train.target

df = pd.DataFrame({'text': texts, 'label': labels})
df_test = df.sample(n=5000, random_state=42).reset_index(drop=True)

df.head()

[nltk_data] Downloading package punkt to /home/luan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,label
0,"Zero Day leads you to think, even re-think why...",1
1,Words can't describe how bad this movie is. I ...,0
2,Everyone plays their part pretty well in this ...,1
3,There are a lot of highly talented filmmakers/...,0
4,I've just had the evidence that confirmed my s...,0


In [62]:
# A. Remova as stopwords das resenhas do dataset.

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = re.findall(r'\b\w\w+\b', text.lower())
    return ' '.join([t for t in tokens if t not in stop_words])

df['no_stopwords'] = df['text'].apply(remove_stopwords)

df.head()

Unnamed: 0,text,label,no_stopwords
0,"Zero Day leads you to think, even re-think why...",1,zero day leads think even think two boys young...
1,Words can't describe how bad this movie is. I ...,0,words describe bad movie explain writing see g...
2,Everyone plays their part pretty well in this ...,1,everyone plays part pretty well little nice mo...
3,There are a lot of highly talented filmmakers/...,0,lot highly talented filmmakers actors germany ...
4,I've just had the evidence that confirmed my s...,0,evidence confirmed suspicions bunch kids 14 22...


In [63]:
# B. Realize o stemming das expressões contidas nas features resultantes da operação realizada no item (a).
stemmer = PorterStemmer()

def apply_stemming(text):
    tokens = re.findall(r'\b\w\w+\b', text.lower())
    return ' '.join([stemmer.stem(t) for t in tokens])

df['stemmed'] = df['no_stopwords'].apply(apply_stemming)

df.head()

Unnamed: 0,text,label,no_stopwords,stemmed
0,"Zero Day leads you to think, even re-think why...",1,zero day leads think even think two boys young...,zero day lead think even think two boy young m...
1,Words can't describe how bad this movie is. I ...,0,words describe bad movie explain writing see g...,word describ bad movi explain write see get gr...
2,Everyone plays their part pretty well in this ...,1,everyone plays part pretty well little nice mo...,everyon play part pretti well littl nice movi ...
3,There are a lot of highly talented filmmakers/...,0,lot highly talented filmmakers actors germany ...,lot highli talent filmmak actor germani none a...
4,I've just had the evidence that confirmed my s...,0,evidence confirmed suspicions bunch kids 14 22...,evid confirm suspicion bunch kid 14 22 put dvd...


In [64]:
# C. Realize a lemmatization das expressões contidas nas features resultantes da operação realizada no item (a)

# Carrega o modelo de linguagem do spaCy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Tokenização compatível com CountVectorizer
token_pattern = re.compile(r"(?u)\b\w\w+\b")

def lemmatize_text(text):
    tokens = token_pattern.findall(text.lower())
    doc = nlp(" ".join(tokens))  # cria um único texto com os tokens válidos
    return " ".join([token.lemma_ for token in doc])

df['lemmatized'] = df['no_stopwords'].apply(lemmatize_text)

df.head()

Unnamed: 0,text,label,no_stopwords,stemmed,lemmatized
0,"Zero Day leads you to think, even re-think why...",1,zero day leads think even think two boys young...,zero day lead think even think two boy young m...,zero day lead think even think two boy young m...
1,Words can't describe how bad this movie is. I ...,0,words describe bad movie explain writing see g...,word describ bad movi explain write see get gr...,word describe bad movie explain writing see ge...
2,Everyone plays their part pretty well in this ...,1,everyone plays part pretty well little nice mo...,everyon play part pretti well littl nice movi ...,everyone play part pretty well little nice mov...
3,There are a lot of highly talented filmmakers/...,0,lot highly talented filmmakers actors germany ...,lot highli talent filmmak actor germani none a...,lot highly talented filmmaker actor germany no...
4,I've just had the evidence that confirmed my s...,0,evidence confirmed suspicions bunch kids 14 22...,evid confirm suspicion bunch kid 14 22 put dvd...,evidence confirm suspicion bunch kid 14 22 put...


In [65]:
# D. Utilize a técnica de Bag-of-Words para vetorizar as resenhas do dataset resultante das operações do item (b). Apresente os resultados obtidos.
vectorizer_stem = CountVectorizer(min_df=5)
X_stem = vectorizer_stem.fit_transform(df['stemmed'])
y = df['label'].values

print("X_stem shape:", X_stem.shape)

X_stem shape: (25000, 18543)


In [66]:
# E. Utilize a técnica de Bag-of-n-Grams, identificando bigrams, para vetorizar as resenhas do dataset resultante das operações do item (c). Apresente os resultados obtidos.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=5)
X_lemma_bigram = bigram_vectorizer.fit_transform(df['lemmatized'])

print("X_lemma_bigram shape:", X_lemma_bigram.shape)

X_lemma_bigram shape: (25000, 74716)


## Questão 6

In [67]:
# A. Redimensione os dados do dataset pelo método term frequency–inverse document frequency (tf-idf). Apresente os resultados obtidos.

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def load_reviews(data_path):
    reviews = []
    labels = []
    
    for label in ['pos', 'neg']:
        folder_path = os.path.join(data_path, 'train', label)
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                reviews.append(file.read())
                labels.append(1 if label == 'pos' else 0)
    
    return reviews, labels

data_path = "../db/aclImdb_v1/aclImdb"

reviews, labels = load_reviews(data_path)

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Aplicar TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=5)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Tamanho da matriz de treino: {X_train_tfidf.shape}")
print(f"Tamanho da matriz de teste: {X_test_tfidf.shape}")

Tamanho da matriz de treino: (20000, 24469)
Tamanho da matriz de teste: (5000, 24469)


In [70]:
# B. Crie um modelo de classificação baseado em regressão logística sobre a base redimensionada no item (a) e avalie os resultados obtidos.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Treinar o modelo
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Prever no conjunto de teste
y_pred = model.predict(X_test_tfidf)

# Avaliar
print(f"Acurácia no teste: {accuracy_score(y_test, y_pred):.3f}\n")
print(classification_report(y_test, y_pred))

Acurácia no teste: 0.885

              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2485
           1       0.88      0.89      0.89      2515

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

