In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
import torch
from torch.utils.data import Dataset, DataLoader

# Load the data
df = pd.read_csv('BBC News Train.csv')  # Replace with your dataset path

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace numbers with <NUM> token
    text = re.sub(r'\d+', '<NUM>', text)
    # Replace prices with <MONEY> token
    text = re.sub(r'\$\d+(\.\d{2})?', '<MONEY>', text)
    # Replace other special characters with space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text

# Preprocess the text data
df['processed_text'] = df['Text'].apply(preprocess_text)  # Replace 'text' with your text column name

# Create CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
X = vectorizer.fit_transform(df['processed_text']).toarray()

# Encode labels
label_encoder = {label: i for i, label in enumerate(df['Category'].unique())}  # Replace 'label' with your label column name
y = df['Category'].map(label_encoder).values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

# Create Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train_tensor, y_train_tensor)
test_dataset = TextDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
print(f"Number of classes: {len(label_encoder)}")
print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Vocabulary size: 5000
Number of classes: 5
Train set size: 1192
Test set size: 298


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
import torch
from torch.utils.data import Dataset, DataLoader

# Load the data
df_train = pd.read_csv('train.csv')  # Replace with your dataset path
df_test = pd.read_csv('test.csv')  # Replace with your dataset path

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace numbers with <NUM> token
    text = re.sub(r'\d+', '<NUM>', text)
    # Replace prices with <MONEY> token
    text = re.sub(r'\$\d+(\.\d{2})?', '<MONEY>', text)
    # Replace other special characters with space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text

# Preprocess the text data
df['processed_text'] = df['Text'].apply(preprocess_text)  # Replace 'text' with your text column name

# Create CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
X = vectorizer.fit_transform(df['processed_text']).toarray()

# Encode labels
label_encoder = {label: i for i, label in enumerate(df['Category'].unique())}  # Replace 'label' with your label column name
y = df['Category'].map(label_encoder).values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

# Create Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train_tensor, y_train_tensor)
test_dataset = TextDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
print(f"Number of classes: {len(label_encoder)}")
print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report

# Define the neural network
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set hyperparameters
input_size = X_train.shape[1]  # Number of features
hidden_size = 64
num_classes = len(label_encoder)
learning_rate = 0.001
num_epochs = 10
batch_size = 32

# Initialize the model
model = SimpleClassifier(input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_predictions)
    return total_loss / len(test_loader), accuracy, all_predictions, all_labels

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    test_loss, test_accuracy, _, _ = evaluate(model, test_loader, criterion, device)
    
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Final evaluation
_, final_accuracy, all_predictions, all_labels = evaluate(model, test_loader, criterion, device)
print("\nFinal Test Accuracy: {:.4f}".format(final_accuracy))
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions, target_names=list(label_encoder.keys())))

Epoch [1/10]
  Train Loss: 0.7434
  Test Loss: 0.2145, Test Accuracy: 0.9765
Epoch [2/10]
  Train Loss: 0.0952
  Test Loss: 0.1386, Test Accuracy: 0.9799
Epoch [3/10]
  Train Loss: 0.0299
  Test Loss: 0.1197, Test Accuracy: 0.9799
Epoch [4/10]
  Train Loss: 0.0143
  Test Loss: 0.1158, Test Accuracy: 0.9832
Epoch [5/10]
  Train Loss: 0.0089
  Test Loss: 0.1080, Test Accuracy: 0.9832
Epoch [6/10]
  Train Loss: 0.0061
  Test Loss: 0.1095, Test Accuracy: 0.9799
Epoch [7/10]
  Train Loss: 0.0045
  Test Loss: 0.1060, Test Accuracy: 0.9799
Epoch [8/10]
  Train Loss: 0.0033
  Test Loss: 0.1093, Test Accuracy: 0.9799
Epoch [9/10]
  Train Loss: 0.0024
  Test Loss: 0.1135, Test Accuracy: 0.9799
Epoch [10/10]
  Train Loss: 0.0018
  Test Loss: 0.1150, Test Accuracy: 0.9799

Final Test Accuracy: 0.9799

Classification Report:
               precision    recall  f1-score   support

     business       0.99      0.99      0.99        75
         tech       1.00      0.97      0.98        58
     polit