In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Module, LSTM, Linear
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd

In [66]:
class Net(Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Net, self).__init__()
        self.lstm = LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = Linear(hidden_size*2, num_classes)
        
        
    def forward(self, x):
        x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [67]:
class SentimentAnalysisDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        label = self.dataframe.iloc[idx, 0] 
        message = self.dataframe.iloc[idx, 1]  

        encoding = self.tokenizer(
            message,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze().float()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [68]:
sentiment_analysis_csv = pd.read_csv('FirstReportData/sentiment_analysis.csv', header=None, encoding='ISO-8859-1')

sentiment_analysis_csv = sentiment_analysis_csv[:200000]

train_data, test_data = train_test_split(sentiment_analysis_csv, test_size=0.4, random_state=42)

print("Training data size:", len(train_data))
print("Testing data size:", len(test_data))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SentimentAnalysisDataset(train_data, tokenizer, max_length=128)
test_dataset = SentimentAnalysisDataset(test_data, tokenizer, max_length=128)

Training data size: 120000
Testing data size: 80000


In [69]:
device = torch.device("mps")

input_size = 128
hidden_size = 64
num_layers = 2
num_classes = 2  

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / len(dataloader.dataset)
    return avg_loss, accuracy


# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss, test_accuracy = evaluate(model, test_dataloader, criterion)

    print(
        f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')
    

Epoch [1/1], Train Loss: 0.0034, Test Loss: 0.0000, Test Accuracy: 1.0000


In [79]:
# torch.save(model.state_dict(), 'FirstReportData/sentiment_analysis_model.pth')

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)
model.load_state_dict(torch.load('FirstReportData/sentiment_analysis_model.pth'))
model.eval()
message = "i loved it here"
encoding = tokenizer(
    message,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=False,
    return_tensors='pt'
)

output = model(encoding['input_ids'].float().to(device))
_, predicted = torch.max(output, 1)
print("Predicted:", predicted.item())


Predicted: 0


  model.load_state_dict(torch.load('FirstReportData/sentiment_analysis_model.pth'))
