In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# 1. Load Data
def load_data(file_path):
    texts, labels = [], []
    with open(file_path, 'r') as f:
        for line in f:
            text, label = line.strip().split('\t')
            texts.append(text)
            labels.append(int(label))
    return texts, labels

file_small = r'C:\Users\Samous\Downloads\amazon_cells_labelled (1).txt'
data, labels = load_data(file_small)

# 2. Text to Vectors
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(data).toarray()

# 3. Split
X_train, X_temp, y_train, y_temp = train_test_split(X, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 4. Tensor Conversion
to_tensor = lambda x: torch.tensor(x, dtype=torch.float32)
train_data, val_data, test_data = map(to_tensor, [X_train, X_val, X_test])
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)

# 5. Dataset & Dataloader
class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data, self.labels = data, labels
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx], self.labels[idx]

batch_size = 64
train_loader = DataLoader(TextDataset(train_data, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TextDataset(val_data, val_labels), batch_size=batch_size)
test_loader = DataLoader(TextDataset(test_data, test_labels), batch_size=batch_size)

# 6. Define ANN
class ANNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x): return self.model(x)

model = ANNModel(input_dim=train_data.shape[1])
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 7. Training
def train(model, loader):
    model.train()
    for inputs, labels in loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

# 8. Evaluation
def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100 * correct / total

# 9. Run
for epoch in range(10):
    train(model, train_loader)
    acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}, Val Accuracy: {acc:.2f}%")

print("Test Accuracy:", evaluate(model, test_loader))


Epoch 1, Val Accuracy: 80.00%
Epoch 2, Val Accuracy: 84.00%
Epoch 3, Val Accuracy: 84.00%
Epoch 4, Val Accuracy: 83.00%
Epoch 5, Val Accuracy: 83.00%
Epoch 6, Val Accuracy: 82.00%
Epoch 7, Val Accuracy: 82.00%
Epoch 8, Val Accuracy: 82.00%
Epoch 9, Val Accuracy: 83.00%
Epoch 10, Val Accuracy: 82.00%
Test Accuracy: 86.0


In [16]:

# 1. Load Larger Dataset


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# 1. Load Data
def load_data(file_path):
    texts, labels = [], []
    with open(file_path, 'r') as f:
        for line in f:
            text, label = line.strip().split('\t')
            texts.append(text)
            labels.append(int(label))
    return texts, labels

file_large =  r'C:\Users\Samous\Desktop\amazon_cells_labelled_LARGE_25K.txt'  # ← your 25K file path here

data, labels = load_data(file_large)

# 2. Text to Vectors
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(data).toarray()

# 3. Split
X_train, X_temp, y_train, y_temp = train_test_split(X, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 4. Tensor Conversion
to_tensor = lambda x: torch.tensor(x, dtype=torch.float32)
train_data, val_data, test_data = map(to_tensor, [X_train, X_val, X_test])
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)

# 5. Dataset & Dataloader
class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data, self.labels = data, labels
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx], self.labels[idx]

batch_size = 64
train_loader = DataLoader(TextDataset(train_data, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TextDataset(val_data, val_labels), batch_size=batch_size)
test_loader = DataLoader(TextDataset(test_data, test_labels), batch_size=batch_size)

# 6. Define ANN
class ANNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x): return self.model(x)

model = ANNModel(input_dim=train_data.shape[1])
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 7. Training
def train(model, loader):
    model.train()
    for inputs, labels in loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

# 8. Evaluation
def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100 * correct / total

# 9. Run
for epoch in range(10):
    train(model, train_loader)
    acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}, Val Accuracy: {acc:.2f}%")

print("Test Accuracy:", evaluate(model, test_loader))



Epoch 1, Val Accuracy: 88.72%
Epoch 2, Val Accuracy: 88.92%
Epoch 3, Val Accuracy: 87.72%
Epoch 4, Val Accuracy: 86.60%
Epoch 5, Val Accuracy: 87.04%
Epoch 6, Val Accuracy: 86.88%
Epoch 7, Val Accuracy: 86.56%
Epoch 8, Val Accuracy: 86.64%
Epoch 9, Val Accuracy: 86.80%
Epoch 10, Val Accuracy: 86.84%
Test Accuracy: 87.28


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# 1. Load Data from both small and large datasets
def load_data(file_path):
    texts, labels = [], []
    with open(file_path, 'r') as f:
        for line in f:
            text, label = line.strip().split('\t')
            texts.append(text)
            labels.append(int(label))
    return texts, labels

# Load small and large dataset
file_small = r'C:\Users\Samous\Downloads\amazon_cells_labelled (1).txt'
file_large = r'C:\Users\Samous\Desktop\amazon_cells_labelled_LARGE_25K.txt'  # Path to large file

texts_small, labels_small = load_data(file_small)
texts_large, labels_large = load_data(file_large)

# 2. Combine small and large dataset
texts = texts_small + texts_large
labels = labels_small + labels_large

# 3. Convert text to vectors using CountVectorizer (limiting to 5000 features)
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(texts).toarray()

# 4. Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 5. Convert data and labels to PyTorch tensors
to_tensor = lambda x: torch.tensor(x, dtype=torch.float32)
train_data, val_data, test_data = map(to_tensor, [X_train, X_val, X_test])
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)

# 6. Create a custom dataset class for loading batches
class TextDataset(Dataset):
    def __init__(self, data, labels):
        self.data, self.labels = data, labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# 7. Set batch size and create DataLoader objects for training, validation, and test data
batch_size = 64
train_loader = DataLoader(TextDataset(train_data, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TextDataset(val_data, val_labels), batch_size=batch_size)
test_loader = DataLoader(TextDataset(test_data, test_labels), batch_size=batch_size)

# 8. Define the Artificial Neural Network (ANN) model
class ANNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    
    def forward(self, x):
        return self.model(x)

# Initialize the model, loss function, and optimizer
model = ANNModel(input_dim=train_data.shape[1])  # The input dimension is equal to the number of features
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 9. Define the training function
def train(model, loader):
    model.train()
    for inputs, labels in loader:
        optimizer.zero_grad()  # Reset gradients
        outputs = model(inputs)  # Forward pass
        loss = loss_fn(outputs, labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update model weights

# 10. Define the evaluation function to compute accuracy
def evaluate(model, loader):
    model.eval()  # Set model to evaluation mode
    correct, total = 0, 0
    with torch.no_grad():  # No need to compute gradients during evaluation
        for inputs, labels in loader:
            outputs = model(inputs)  # Forward pass
            _, preds = torch.max(outputs, 1)  # Get the predicted class
            correct += (preds == labels).sum().item()  # Count correct predictions
            total += labels.size(0)  # Total number of samples
    return 100 * correct / total  # Return accuracy as a percentage

# 11. Run training for 10 epochs and evaluate on the validation set
for epoch in range(10):
    train(model, train_loader)  # Train the model for one epoch
    acc = evaluate(model, val_loader)  # Evaluate on the validation set
    print(f"Epoch {epoch+1}, Val Accuracy: {acc:.2f}%")

# 12. Final evaluation on the test set
test_acc = evaluate(model, test_loader)
print("Test Accuracy:", test_acc)


Epoch 1, Val Accuracy: 87.73%
Epoch 2, Val Accuracy: 86.73%
Epoch 3, Val Accuracy: 86.27%
Epoch 4, Val Accuracy: 86.15%
Epoch 5, Val Accuracy: 86.62%
Epoch 6, Val Accuracy: 86.00%
Epoch 7, Val Accuracy: 85.85%
Epoch 8, Val Accuracy: 85.65%
Epoch 9, Val Accuracy: 85.85%
Epoch 10, Val Accuracy: 85.69%
Test Accuracy: 86.0
