In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_pandas(data, columns):
    """Clean the ``Sentence`` column and return the result as a new DataFrame.

    Every sentence is lower-cased, stripped of e-mail addresses, IPv4-like
    number groups, punctuation and digits, tokenized with NLTK's
    ``word_tokenize`` and filtered against NLTK's English stop-word list.

    Args:
        data: DataFrame providing the ``index``, ``Class`` and ``Sentence``
            columns. It is read only; the caller's frame is not modified.
        columns: Column names (and their order) for the returned DataFrame.

    Returns:
        A new DataFrame with one row per input row, where ``Sentence`` holds
        the remaining tokens joined by single spaces.
    """
    # Work on a local Series so the caller's DataFrame is left untouched.
    # Patterns are raw strings: '\.', '\w' and '\d' are not valid string
    # escapes and trigger a SyntaxWarning on recent Python versions.
    sentences = data['Sentence'].str.lower()
    sentences = sentences.replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                     # remove emails
    sentences = sentences.replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)   # remove IPs
    sentences = sentences.str.replace(r'[^\w\s]', '', regex=True)                                         # remove special characters
    sentences = sentences.replace(r'\d', '', regex=True)                                                  # remove numbers

    # Fetch the stop words once instead of once per token; a set makes each
    # membership test O(1).
    stop_words = set(stopwords.words('english'))

    processed_rows = []
    for row_index, label, sentence in zip(data['index'], data['Class'], sentences):
        kept_tokens = [w for w in word_tokenize(sentence) if w not in stop_words]
        processed_rows.append({
            "index": row_index,
            "Class": label,
            "Sentence": " ".join(kept_tokens)
        })

    # Convert the list of dicts to a DataFrame at the end
    return pd.DataFrame(processed_rows, columns=columns)


# Run only when this file is executed directly (i.e. not when it is imported by another file)
if __name__ == "__main__":
    # NOTE: vocab_size and the four *_tensor names bound in this block are
    # consumed by the module-level training code further down in this file.

    # Load the tab-separated "<sentence>\t<label>" file and clean the text
    data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # keep the original row number as a column
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process

    # Split into 90 % training and 10 % validation data (no separate test split)
    training_data, validation_data, training_labels, validation_labels = train_test_split(
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )

    # Vectorize with TF-IDF over unigrams and bigrams. The vectorizer is fitted
    # on the training split only and then reused for the validation split.
    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
    # toarray() yields a plain ndarray directly, avoiding the np.matrix that
    # todense() returns and the extra copy needed to convert it.
    training_data = word_vectorizer.fit_transform(training_data).toarray()
    vocab_size = len(word_vectorizer.vocabulary_)
    validation_data = word_vectorizer.transform(validation_data).toarray()

    # Features become float32 tensors, labels int64 tensors
    train_x_tensor = torch.from_numpy(training_data).float()
    train_y_tensor = torch.from_numpy(training_labels).long()
    validation_x_tensor = torch.from_numpy(validation_data).float()
    validation_y_tensor = torch.from_numpy(validation_labels).long()


class SentimentANN(nn.Module):
    """Feed-forward classifier with a single ReLU hidden layer.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output scores per sample. Defaults to 2
            (positive / negative).
    """

    def __init__(self, input_dim, hidden_dim=100, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x``; no softmax is applied."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)


# Model, loss function and optimizer
input_dim = vocab_size
model = SentimentANN(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Mini-batch loaders; only the training loader shuffles its samples
batch_size = 32
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
val_dataset = TensorDataset(validation_x_tensor, validation_y_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


# Training: one optimizer step per mini-batch. The value reported per epoch
# is the sum of the batch losses, not their mean.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for features, targets in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")


# Evaluation: collect the highest-scoring class for every validation sample
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for features, targets in val_loader:
        predicted = torch.max(model(features), dim=1).indices
        all_preds += predicted.tolist()
        all_labels += targets.tolist()


print("\nValidation Results:")
for title, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(title, metric(all_labels, all_preds))
print("\nClassification Report:\n", classification_report(all_labels, all_preds))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joelw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joelw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [1/10], Loss: 19.9907
Epoch [2/10], Loss: 18.1466
Epoch [3/10], Loss: 13.6885
Epoch [4/10], Loss: 8.0774
Epoch [5/10], Loss: 4.3439
Epoch [6/10], Loss: 2.4600
Epoch [7/10], Loss: 1.5705
Epoch [8/10], Loss: 1.0881
Epoch [9/10], Loss: 0.7931
Epoch [10/10], Loss: 0.6397

Validation Results:
Accuracy: 0.87
Precision: 0.9
Recall: 0.8490566037735849

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87        47
           1       0.90      0.85      0.87        53

    accuracy                           0.87       100
   macro avg       0.87      0.87      0.87       100
weighted avg       0.87      0.87      0.87       100

