### Ignorar:

(mestrado guto):
**Propostas da Prof. Karin:**
- Treinar um modelo de classificação e tentar atingir o Tone, parecido com o que tem no artigo 
    - Gap Classificação: 
        1. Desempenho Ruim do Classificador (treinar "melhor")
        2. Desempenho ruim do fine-tuning por LLM
- Implementar um método (fine tuning / eng. de prompt) que faça a tarefa de reescrita (empática/polite)
    - Pode ter um gap aqui
- Treinar um modelo pra identificar motivos das reviews (Agrupamentos)

In [1]:
# %pip install pandas
# %pip install matplotlib
# %pip install --upgrade numpy
# %pip install seaborn
# %pip install scikit-learn
# %pip install tensorflow
# %pip install ipywidgets
# %pip install --upgrade gensim

%matplotlib inline

from IPython.display import clear_output

# clear_output()

---

### Inicio | Imports

In [None]:
from sklearn.model_selection import train_test_split
from string import punctuation
from keras.layers import TextVectorization
from gensim import utils
import gensim.models

import matplotlib.pyplot as plt
import seaborn as sns
import re # Regular Expression
import pandas as pd
import numpy as np

### Dataframe:

In [2]:
path = 'PolitenessDataset-FULL.csv'

PolitenessDF = pd.read_csv(path)
print('Data Stats:', PolitenessDF.describe())
PolitenessDF.head(10)

Data Stats:               Tone
count  2500.000000
mean      2.867200
std       0.948854
min       1.000000
25%       2.000000
50%       3.000000
75%       3.000000
max       5.000000


Unnamed: 0,Venue,Review ID,review,Tone,Review URL
0,ShitMyReviewerSay,,"It is early in the year, but difficult to imag...",2,
1,ShitMyReviewerSay,,You do not use the empirical data for the anal...,2,
2,ShitMyReviewerSay,,I understand that Wikipedia is not the best so...,3,
3,ShitMyReviewerSay,,Reviewer #1: 'The project can hardly be descri...,3,
4,ShitMyReviewerSay,,The figures are dishonest and not all that use...,2,
5,ShitMyReviewerSay,,Find your inner nerdâ€”it must be a big part o...,1,
6,ShitMyReviewerSay,,[entire review] 'Research method is very impor...,4,
7,ShitMyReviewerSay,,Some papers are a pleasure to read. This is no...,2,
8,ShitMyReviewerSay,,"Sorry guys, I'm throwing in the towel.",1,
9,ShitMyReviewerSay,,Nobody in their right mind would ever suggest ...,2,


In [3]:
# codificação
test = PolitenessDF.copy()

# codificação
test['CodeVenue'] = test['Venue'].astype('category').cat.codes

# train test split
X_train, X_test, y_train, y_test = train_test_split(test[['review','Tone']], test['Tone'], test_size=0.2, random_state=42)

max_length = X_train['review'].str.len().max()
print(f"The maximum length of characters is: {max_length}")

# Lengths of each review
lengths = X_train['review'].dropna().astype(str).str.len()

The maximum length of characters is: 552.0


In [None]:
# Some visuals

# Plotting the distribution of the sources
PolitenessDF['Venue'].value_counts().plot(title='Sources Freq', kind='bar')
plt.show()

# Tone distribution
plt.figure(figsize=(10, 6))
sns.histplot(PolitenessDF['Tone'], bins=5, fill=True, color='blue', edgecolor='black')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(ticks=[1, 2, 3, 4, 5])
plt.title("Histogram of Politeness Tones", fontsize=16)
plt.xlabel("Tone", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.show()

# Plot tone per venue, stacked
PolitenessDF.groupby(['Venue', 'Tone']).size().unstack().plot(kind='bar', stacked=True, title='Tone per Venue')
plt.show()

# Plot histogram of review lengths
plt.figure(figsize=(10, 6))
sns.histplot(lengths, bins=30, kde=True, color='blue', alpha=0.7)  # Seaborn for enhanced visuals
plt.title("Histogram of Review Lengths", fontsize=16)
plt.xlabel("Review Length (characters)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()



### Pre-Processing

In [4]:
print('Special Characters:', punctuation)

# stop_words = stopwords.words('portuguese')
def preprocess(x: str):
    new_x = x.replace(r'"',' ')
    for c in punctuation:
        new_x = new_x.replace(c,' ')
    pattern = r"""
    [^\w\s]|         # Remove punctuation
    http\S+|         # Remove links
    @\w+|            # Remove mentions
    #\S+|            # Remove hashtags
    \b\w*\d\w*\b|    # Remove words containing numbers
    \s+              # Normalize spaces
    """    
    new_x = re.sub(pattern, ' ', new_x, flags=re.VERBOSE) #removendo pontuação do texto
    return new_x.lower().strip()

df_train = X_train.dropna().copy()
df_test = X_test.dropna().copy()

## Pré-processar datasets de treino e teste
## Dados de treino
df_train['review_original'] = df_train['review']
df_train['review'] = df_train['review'].apply(preprocess)

## Dados de teste
df_test['review_original'] = df_test['review']
df_test['review'] = df_test['review'].apply(preprocess)

Special Characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
## Samples (só para ver o processo mesmo)
df_train[df_train['Tone'] == 5].sample(1)['review_original'].values[0]

'This inclusion criteria is not needed due to the fact to be in a master program they would be legal adults. Please remove and adjust anywhere that this shows up.'

In [16]:
## classe para montar o dataset
class PreProcess:
    def __init__(self, docs):
            self.lista_text = docs
    def __iter__(self):
        for line in self.lista_text:
            # assume there's one document per line, tokens separated by whitespace:
            yield utils.simple_preprocess(line) # este método tokeniza e faz algum preprocessamento
            # https://tedboy.github.io/nlps/generated/generated/gensim.utils.simple_preprocess.html

sentences = PreProcess(df_train['review'].values)
# assim treina o modelo usando as configurações padrão e estas especificadas aqui
ModelWord2Vec = gensim.models.Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, epochs=20, sg=1)

ModelWord2Vec

<gensim.models.word2vec.Word2Vec at 0x18add605eb0>

In [17]:
# adapta:
MAX_LENGHT = 552
vectorizer = TextVectorization(
    max_tokens=20000, #vocabulário maximo
    output_sequence_length=MAX_LENGHT,
    )

vectorizer.adapt(df_train['review'].dropna().astype(str).to_list())
voc = vectorizer.get_vocabulary()
print('Vocabulário novo:',len(voc))
clean_voc = [str(word) for word in voc] 

# agora vemos os tokens do dataset começando pelos mais frequentes:
print(f'Vocabulário tem {len(voc)} tokens. Os primeiros 5 tokens são:')
voc[:5]

Vocabulário novo: 4857
Vocabulário tem 4857 tokens. Os primeiros 5 tokens são:


['', '[UNK]', 'the', 'is', 'of']

In [18]:
len(clean_voc)

4857

In [19]:
# um dicionario com todas as palavras do vocabulario e seus indices de 0 a len(voc)-1
word_index = dict(zip(clean_voc, range(len(clean_voc))))
word_index

{'': 0,
 '[UNK]': 1,
 'the': 2,
 'is': 3,
 'of': 4,
 'to': 5,
 'a': 6,
 'and': 7,
 'in': 8,
 'this': 9,
 'i': 10,
 'paper': 11,
 'it': 12,
 'that': 13,
 'not': 14,
 'for': 15,
 'be': 16,
 'are': 17,
 'on': 18,
 'authors': 19,
 'with': 20,
 'have': 21,
 'as': 22,
 'but': 23,
 'would': 24,
 'an': 25,
 'there': 26,
 'or': 27,
 'work': 28,
 'you': 29,
 'more': 30,
 'by': 31,
 'some': 32,
 'results': 33,
 'very': 34,
 'can': 35,
 'from': 36,
 'my': 37,
 'what': 38,
 'they': 39,
 'manuscript': 40,
 'well': 41,
 'if': 42,
 'which': 43,
 'should': 44,
 'no': 45,
 'proposed': 46,
 'also': 47,
 'do': 48,
 'all': 49,
 'like': 50,
 'interesting': 51,
 'was': 52,
 'data': 53,
 'at': 54,
 'model': 55,
 'comments': 56,
 'so': 57,
 'does': 58,
 'has': 59,
 't': 60,
 'see': 61,
 'how': 62,
 'one': 63,
 'method': 64,
 'clear': 65,
 'seems': 66,
 'your': 67,
 'could': 68,
 'written': 69,
 'these': 70,
 'their': 71,
 'its': 72,
 'than': 73,
 'other': 74,
 'however': 75,
 'problem': 76,
 'why': 77,
 'secti

In [20]:
### HYPERPARAMETERS (from notebooks)
POLITENESS_LEVELS = 5
EPOCHS = 30
MAXLEN = 768 # Since SciBERT returns 768 embeddings vector
LSTM_UNITS = 256
is_BiLSTM = True # Flag to automate other pre-processing for With or Without BiLSTM variants
VOCAB_LEN = 1853
EMBEDDING_DIMENSION = 768

import pickle

# LOAD EMBEDS DATASET
def loadPickle(name):
    start_path = '../PolitePEER/'
    if is_BiLSTM:
        LOAD_PATH = start_path+'Tokennized_Processed_X_train-BiLSTM.csv'
        train_embeds = pd.read_csv(LOAD_PATH)
        
        LOAD_PATH = start_path+'Tokennized_Processed_X_test-BiLSTM.csv'
        test_embeds = pd.read_csv(LOAD_PATH)
        
        LOAD_PATH = start_path+'Tokennized_Processed_X_val-BiLSTM.csv'
        val_embeds = pd.read_csv(LOAD_PATH)
        
    else:
        LOAD_PATH = start_path+name+'_train.pickle'
        with open(LOAD_PATH, 'rb') as handle:
            train_embeds = pickle.load(handle)
            handle.close()

        LOAD_PATH = start_path+name+'_test.pickle'
        with open(LOAD_PATH, 'rb') as handle:
            test_embeds = pickle.load(handle)
            handle.close()

        LOAD_PATH = start_path+name+'_val.pickle'
        with open(LOAD_PATH, 'rb') as handle:
            val_embeds = pickle.load(handle)
            handle.close()

    y_train = pd.read_csv(start_path+'y_train.csv')
    y_val = pd.read_csv(start_path+'y_val.csv')
    y_test = pd.read_csv(start_path+'y_test.csv')

    print('\n***** LOADED '+ name+' *****\n')
    print(f'TRAIN SHAPE : {train_embeds.shape}\nTEST SHAPE : {test_embeds.shape}\nVAL SHAPE : {val_embeds.shape}\nY-TRAIN SHAPE : {y_train.shape}\nY-TEST SHAPE : {y_test.shape}\nY-VAL SHAPE : {y_val.shape}')

    return train_embeds, test_embeds, val_embeds, y_train, y_test, y_val

# /kaggle/input/iitpolitenesslevels/SCIBERT_train.pickle

In [21]:
name = ''
train_embeds, test_embeds, val_embeds, y_train, y_test, y_val = loadPickle(name)


***** LOADED  *****

TRAIN SHAPE : (4556, 768)
TEST SHAPE : (855, 768)
VAL SHAPE : (284, 768)
Y-TRAIN SHAPE : (4556, 5)
Y-TEST SHAPE : (855, 5)
Y-VAL SHAPE : (284, 5)


<span style="color:#e687f1;">**TODO**</span>: continuar daqui

GPT suggestions, embeddings:

Summary of Options:
- BERT (Hugging Face): Contextual, state-of-the-art embeddings but computationally heavier.
- GloVe/FastText: Simpler, pre-trained embeddings that are non-contextual but lightweight.


### Embeddings

In [7]:
from transformers import pipeline, AutoTokenizer, AutoModel

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

sentences = df_train['review'].tolist()

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

In [9]:
import torch

with torch.no_grad():
    outputs = model(**inputs)
    # Extract the CLS token embeddings
    sentence_embeddings = outputs.last_hidden_state[:, 0, :]

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

labels = df_train['Tone'].to_list()

# Example: Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings.numpy(), labels, test_size=0.2, random_state=42)


# Initialize and train the classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.29      0.19      0.23        32
           2       0.34      0.36      0.35        87
           3       0.61      0.68      0.64       194
           4       0.45      0.38      0.41        73
           5       0.36      0.29      0.32        14

    accuracy                           0.50       400
   macro avg       0.41      0.38      0.39       400
weighted avg       0.49      0.50      0.49       400



## BiLSTM

In [24]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

# Encode labels to integers
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, labels, test_size=0.2, random_state=42)

# Custom Dataset class
class PolitenessDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Create DataLoaders
train_dataset = PolitenessDataset(X_train, y_train)
test_dataset = PolitenessDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define BiLSTM model
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=n_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x.unsqueeze(1))  # Add sequence dimension
        lstm_out = self.dropout(lstm_out[:, -1, :])  # Use the last hidden state

        # Fully connected layer
        logits = self.fc(lstm_out)
        return logits

# Model parameters
input_dim = 768  # Dimension of embeddings
hidden_dim = 128  # Number of hidden units
output_dim = len(np.unique(labels))  # Number of politeness levels
n_layers = 2  # Number of LSTM layers
dropout = 0.3  # Dropout rate

# Initialize model, loss, and optimizer
model = BiLSTMClassifier(input_dim, hidden_dim, output_dim, n_layers, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            # Forward pass
            outputs = model(embeddings)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, device, num_epochs=10)

# Evaluation loop
def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            # Forward pass
            outputs = model(embeddings)
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model
evaluate_model(model, test_loader, device)



  self.embeddings = torch.tensor(embeddings, dtype=torch.float32)


Epoch [1/10], Loss: 1.3079
Epoch [2/10], Loss: 1.1273
Epoch [3/10], Loss: 1.0356
Epoch [4/10], Loss: 0.9749
Epoch [5/10], Loss: 0.9299
Epoch [6/10], Loss: 0.8586
Epoch [7/10], Loss: 0.8154
Epoch [8/10], Loss: 0.7239
Epoch [9/10], Loss: 0.6499
Epoch [10/10], Loss: 0.5679
Accuracy: 48.75%
