In [21]:
# Importar bibliotecas necessárias
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk


## Pré-processamento

### Lendo o arquivo

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Ler o arquivo CSV
df = pd.read_csv("data.csv", names=['data', 'label'])

### Removendo linhas com valores nulos ou vazios

In [23]:
# Remover linhas com valores nulos ou vazios
df = df.dropna()

### Removendo a primeira linha

In [24]:
# Remover a primeira linha
df = df.iloc[1:]


### Removendo alphanumericos, pontuação, stopwords e deixando tudo em minúsculo

In [25]:
nltk.download('stopwords')
# Remover alfanuméricos
df['data'] = df['data'].str.replace('\d+', '')

# Deixar tudo em minúsculo
df['data'] = df['data'].str.lower()

# Remover pontuação
df['data'] = df['data'].str.replace('[^\w\s]', '')

# Remover stopwords
stopwords = nltk.corpus.stopwords.words('english')
df['data'] = df['data'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))

[nltk_data] Downloading package stopwords to /home/apo-pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df['data'] = df['data'].str.replace('\d+', '')
  df['data'] = df['data'].str.replace('[^\w\s]', '')


### Lemmatização

In [26]:
# Lemmatização
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
df['data'] = df['data'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))

[nltk_data] Downloading package wordnet to /home/apo-pc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Dividindo os dados em treino e teste

In [27]:
# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(df['data'], df['label'], test_size=0.2, random_state=42)

In [28]:
# COntar qtd de classes
y_train.value_counts()


neutral     2508
positive    1480
negative     685
Name: label, dtype: int64

### Codificando a variável target

In [29]:
# Instantiate the label encoder
label_encoder = LabelEncoder()

# Fit and transform the training labels
y_train = label_encoder.fit_transform(y_train)

# Transform the test labels (use the same label encoder instance)
y_test = label_encoder.transform(y_test)

## Vetorização
### TF-IDF

In [30]:
# Pré-processar os dados de texto
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


## Treinamento
### Importando bibliotecas
(Pytorch=LSTM, RNN, CNN)

In [31]:
# Importar bibliotecas necessárias
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

#

### LSTM

In [32]:
# Importing necessary PyTorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

Definindo dispositivo CUDA

In [33]:
print(f"CUDA é suportado pelo sistema? {torch.cuda.is_available()}")
print(f"Versao do CUDA: {torch.version.cuda}")
 
# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID do CUDA device: {torch.cuda.current_device()}")
       
print(f"Nome do dispositivo CUDA:{torch.cuda.get_device_name(cuda_id)}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.set_default_device('cuda')


CUDA é suportado pelo sistema? True
Versao do CUDA: 12.1
ID do CUDA device: 0
Nome do dispositivo CUDA:NVIDIA GeForce RTX 4060


In [34]:
# Convert text data to PyTorch tensors
X_train_tfidf = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32)
X_test_tfidf = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(X_train_tfidf, y_train)
test_dataset = TensorDataset(X_test_tfidf, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [35]:
# Define a simple LSTM model
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Add an extra dimension to the input tensor
        x = x.unsqueeze(1)

        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Take the output from the last time step
        return out

# Set hyperparameters
input_size = X_train_tfidf.shape[1]
hidden_size = 64
num_layers = 1
output_size = len(df['label'].unique())  # Assuming your labels are numerical

# Instantiate the model, loss function, and optimizer
model = SimpleLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the LSTM model
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tfidf)
    _, predicted = torch.max(test_outputs, 1)

# Convert the PyTorch tensors back to NumPy arrays for sklearn metrics
predicted = predicted.numpy()
y_test = y_test.numpy()

# Print classification report and accuracy
print("Classification Report:\n", classification_report(y_test, predicted))
print("Accuracy:", accuracy_score(y_test, predicted))


RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'

#### Avaliando o modelo

In [16]:
class RobustLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, bidirectional=True, dropout=0.5):
        super(RobustLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)

    def forward(self, x):
        x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out

# Set hyperparameters
input_size = X_train_tfidf.shape[1]
hidden_size = 64
num_layers = 2  # Increased the number of layers
output_size = len(df['label'].unique())
bidirectional = True
dropout = 0.5

# Instantiate the model with bidirectional LSTM and dropout
model = RobustLSTM(input_size, hidden_size, num_layers, output_size, bidirectional=bidirectional, dropout=dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Implement learning rate scheduling
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
# Train the LSTM model with progress updates
num_epochs = 10
print_interval = 100  # Print loss every 100 batches

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print training progress
        if batch_idx % print_interval == 0 and batch_idx > 0:
            avg_loss = total_loss / print_interval
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {avg_loss:.4f}")
            total_loss = 0.0

    # Learning rate scheduling step
    scheduler.step()

Epoch [1/10], Batch [100/147], Loss: 1.0056
Epoch [2/10], Batch [100/147], Loss: 0.8205
Epoch [3/10], Batch [100/147], Loss: 0.5394
Epoch [4/10], Batch [100/147], Loss: 0.3222
Epoch [5/10], Batch [100/147], Loss: 0.2199
Epoch [6/10], Batch [100/147], Loss: 0.1741
Epoch [7/10], Batch [100/147], Loss: 0.1730
Epoch [8/10], Batch [100/147], Loss: 0.1685
Epoch [9/10], Batch [100/147], Loss: 0.1579
Epoch [10/10], Batch [100/147], Loss: 0.1646


In [17]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tfidf)
    _, predicted = torch.max(test_outputs, 1)

# Convert the PyTorch tensors back to NumPy arrays for sklearn metrics
predicted = predicted.numpy()
# y_test = y_test.numpy()

# Print classification report and accuracy
print("Classification Report:\n", classification_report(y_test, predicted))
print("Accuracy:", accuracy_score(y_test, predicted))

Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.30      0.33       175
           1       0.72      0.78      0.75       622
           2       0.75      0.70      0.73       372

    accuracy                           0.69      1169
   macro avg       0.61      0.60      0.60      1169
weighted avg       0.68      0.69      0.68      1169

Accuracy: 0.6860564585115483


In [18]:
from sklearn.model_selection import ParameterGrid

# Define a grid of hyperparameters to search
param_grid = {
    'hidden_size': [32, 64, 128],
    'num_layers': [1, 2, 3],
    'bidirectional': [True, False],
    'dropout': [0.3, 0.5, 0.7],
    'learning_rate': [0.001, 0.01, 0.1],
}

best_accuracy = 0.0
best_params = None

# Perform grid search
for params in ParameterGrid(param_grid):
    model = RobustLSTM(input_size, params['hidden_size'], params['num_layers'],
                       output_size, bidirectional=params['bidirectional'], dropout=params['dropout'])
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        scheduler.step()

    # Evaluate the model on the test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tfidf)
        _, predicted = torch.max(test_outputs, 1)

    accuracy = accuracy_score(y_test, predicted)
    print(f"Parameters: {params}, Accuracy: {accuracy:.4f}")

    # Update best parameters if the current model is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Parameters: {'bidirectional': True, 'dropout': 0.3, 'hidden_size': 32, 'learning_rate': 0.001, 'num_layers': 1}, Accuracy: 0.6929


KeyboardInterrupt: 

## TPOT

In [None]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_train_tfidf = svd.fit_transform(X_train_tfidf)

X_test_tfidf = svd.transform(X_test_tfidf)

In [None]:
X_train_tfidf

In [None]:
# Importing necessary libraries
from tpot import TPOTClassifier

# Instantiate TPOTRegressor
tpot = TPOTClassifier(verbosity=3, n_jobs=-1,scoring='accuracy', config_dict='TPOT light')

# Fit the regressor to the training data
tpot.fit(X_train_tfidf, y_train)

# Score on the test set
print(tpot.score(X_test_tfidf, y_test))

## Mulher do Kaggle

In [None]:
# IMportando CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

bow_model = CountVectorizer()
bow_df = pd.DataFrame(bow_model.fit_transform(df['data']).todense())
bow_df.columns = sorted(bow_model.vocabulary_)
bow_df

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow_df, df['label'], test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predicted_labels)*100
print("Accuracy:", accuracy)