In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
import torch
from torch.utils.data import Dataset, DataLoader

# Read the training and test splits from CSV files in the working directory
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

def preprocess_text(text):
    """Normalize one raw document for vectorization by lower-casing it.

    Args:
        text: Raw document text. ``None`` and float NaN are treated as an
            empty document; concatenating a string column with a missing
            cell in pandas produces NaN, which has no ``.lower()``. Any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The lower-cased text (``""`` for missing input).
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Disabled normalization experiments, kept for reference:
    # text = re.sub(r'\d+', '<NUM>', text)
    # text = re.sub(r'\$\d+(\.\d{2})?', '<MONEY>', text)
    # text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text

# Preprocess the text data: concatenate Title and Description, then normalize
df_train['processed_text'] = (df_train["Title"] + " " + df_train["Description"]).apply(preprocess_text)
df_test['processed_text'] = (df_test["Title"] + " " + df_test["Description"]).apply(preprocess_text)


# Create CountVectorizer: the vocabulary is fitted on the training text only
# and then reused unchanged to transform the test text
vectorizer = CountVectorizer(max_features=5000)
X_train_full = vectorizer.fit_transform(df_train['processed_text']).toarray()
X_test = vectorizer.transform(df_test['processed_text']).toarray()

# Cap counts to 1 so each feature records presence/absence of a token
X_train_full[X_train_full > 1] = 1
X_test[X_test > 1] = 1

# Encode labels.
# Series.unique() returns values in order of first appearance, so enumerating
# it directly would tie the label indices to the row order of train.csv.
# Sorting the raw class values first makes the mapping deterministic
# (smallest class value -> 0, next -> 1, ...), which is what the positional
# target_names lists used for reporting later in the notebook rely on.
label_encoder = {label: i for i, label in enumerate(sorted(df_train['Class Index'].unique()))}
y_train_full = df_train['Class Index'].map(label_encoder).values
# NOTE(review): a test label that never occurs in df_train would map to NaN here.
y_test = df_test['Class Index'].map(label_encoder).values

# Split train data into train and validation sets (80% / 20%, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Convert to PyTorch tensors: float features, integer (long) class labels
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.LongTensor(y_val)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

# Dataset wrapper pairing a feature container with a label container
class TextDataset(Dataset):
    """Map-style dataset that yields ``(features, label)`` pairs by position."""

    def __init__(self, X, y):
        """Keep references to the features ``X`` and the labels ``y``."""
        self.X, self.y = X, y

    def __getitem__(self, idx):
        """Return the pair ``(X[idx], y[idx])``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

    def __len__(self):
        """Return the number of samples, measured on the label container."""
        return len(self.y)

# Wrap each split's tensors in a dataset
train_dataset = TextDataset(X=X_train_tensor, y=y_train_tensor)
val_dataset = TextDataset(X=X_val_tensor, y=y_val_tensor)
test_dataset = TextDataset(X=X_test_tensor, y=y_test_tensor)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

# Summary of the prepared data, one line per item
print(
    f"Vocabulary size: {len(vectorizer.get_feature_names_out())}",
    f"Number of classes: {len(label_encoder)}",
    f"Train set size: {len(X_train)}",
    f"Validation set size: {len(X_val)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

Vocabulary size: 5000
Number of classes: 4
Train set size: 96000
Validation set size: 24000
Test set size: 7600


In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
import numpy as np
from collections import defaultdict

# Single-layer linear classifier
class SimpleClassifier(nn.Module):
    """One fully connected layer mapping input features to per-class scores."""

    def __init__(self, input_size, num_classes):
        """Create the ``fc1`` linear layer (``input_size`` -> ``num_classes``)."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Return the raw output of ``fc1`` for ``x`` (no activation applied)."""
        return self.fc1(x)

# Hyperparameters read as module-level globals by the training code below
num_epochs = 10
batch_size = 32
learning_rate = 0.001
num_classes = len(label_encoder)  # one output per encoded label
input_size = X_train.shape[1]  # Number of features

# One optimization pass over the training loader
def train(model, train_loader, criterion, optimizer, device):
    """Run a single training epoch.

    Args:
        model: Network to optimize; switched to training mode here.
        train_loader: Sized iterable of ``(features, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the sum of
        the per-batch loss values divided by the number of batches and
        ``accuracy`` is the fraction of samples whose predicted class
        matched the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        # Reset gradients, forward pass, backward pass, parameter update
        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Position of the largest score along dim 1 is the predicted class
        predicted = torch.max(logits.data, 1)[1]
        n_seen += targets.size(0)
        n_correct += (predicted == targets).sum().item()

    accuracy = n_correct / n_seen
    mean_loss = running_loss / len(train_loader)
    return mean_loss, accuracy

# Gradient-free pass over a loader, collecting loss, predictions and probabilities
def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Network to evaluate; switched to eval mode here.
        data_loader: Sized iterable of ``(features, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        tuple: ``(mean_loss, accuracy, predictions, labels, probabilities)``.
        ``mean_loss`` is the summed per-batch loss divided by the number of
        batches, ``accuracy`` comes from ``accuracy_score``, and the three
        lists hold one entry per sample: predicted class, true label, and
        the softmax (over dim 1) of the model outputs.
    """
    model.eval()
    loss_sum = 0
    predictions, labels, probabilities = [], [], []

    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)

            logits = model(features)
            loss_sum += criterion(logits, targets).item()

            class_probs = torch.softmax(logits, dim=1)
            predicted = torch.max(logits, 1)[1]
            # Move results to the CPU and extend the lists one sample at a time
            predictions.extend(predicted.cpu().numpy())
            labels.extend(targets.cpu().numpy())
            probabilities.extend(class_probs.cpu().numpy())

    accuracy = accuracy_score(labels, predictions)
    return loss_sum / len(data_loader), accuracy, predictions, labels, probabilities

# Epoch loop: train, validate, record and print the metrics
def train_and_validate(model, train_loader, val_loader, criterion, optimizer, device, num_epochs):
    """Train for ``num_epochs`` epochs, validating after each one.

    Each epoch calls ``train`` on ``train_loader`` and ``evaluate`` on
    ``val_loader``, prints both sets of metrics, and appends them to the
    history.

    Returns:
        defaultdict(list): Per-epoch values under the keys ``'train_loss'``,
        ``'train_accuracy'``, ``'val_loss'`` and ``'val_accuracy'``.
    """
    history = defaultdict(list)
    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(model, train_loader, criterion, optimizer, device)
        # Only the loss and accuracy are needed here; the per-sample outputs are dropped
        val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)[:2]

        epoch_metrics = (
            ('train_loss', train_loss),
            ('train_accuracy', train_accuracy),
            ('val_loss', val_loss),
            ('val_accuracy', val_accuracy),
        )
        for metric_name, metric_value in epoch_metrics:
            history[metric_name].append(metric_value)

        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    return history

# K-fold cross-validation with a freshly initialized model per fold
def k_fold_cross_validation(X, y, k=5):
    """Train and validate a new ``SimpleClassifier`` on each of ``k`` folds.

    The folds come from a shuffled ``KFold`` with ``random_state=42``. The
    function reads the module-level names ``batch_size``, ``input_size``,
    ``num_classes``, ``learning_rate``, ``num_epochs`` and ``device``.

    Args:
        X: Feature array, indexed with the integer index arrays from ``KFold``.
        y: Label array, indexed the same way as ``X``.
        k: Number of folds.

    Returns:
        list[dict]: One dict per fold with the keys ``'history'``,
        ``'val_accuracy'``, ``'val_predictions'``, ``'val_labels'`` and
        ``'val_probs'``.
    """
    def build_loader(indices, shuffle):
        # Slice the rows of one side of the split and wrap them in a DataLoader
        subset = TextDataset(torch.FloatTensor(X[indices]), torch.LongTensor(y[indices]))
        return DataLoader(subset, batch_size=batch_size, shuffle=shuffle)

    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X), start=1):
        print(f"\nFold {fold}")

        fold_train_loader = build_loader(train_idx, shuffle=True)
        fold_val_loader = build_loader(val_idx, shuffle=False)

        # Every fold starts from a new model and a new optimizer
        fold_model = SimpleClassifier(input_size, num_classes).to(device)
        loss_fn = nn.CrossEntropyLoss()
        fold_optimizer = optim.Adam(fold_model.parameters(), lr=learning_rate)

        history = train_and_validate(
            fold_model, fold_train_loader, fold_val_loader, loss_fn, fold_optimizer, device, num_epochs
        )

        # One more validation pass to collect the per-sample outputs
        _, val_accuracy, val_predictions, val_labels, val_probs = evaluate(
            fold_model, fold_val_loader, loss_fn, device
        )
        fold_results.append({
            'history': history,
            'val_accuracy': val_accuracy,
            'val_predictions': val_predictions,
            'val_labels': val_labels,
            'val_probs': val_probs,
        })

    print("\nK-fold Cross-validation Results:")
    accuracies = [entry['val_accuracy'] for entry in fold_results]
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Standard Deviation: {std_accuracy:.4f}")

    return fold_results

# Use CUDA when PyTorch reports it as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Cross-validate on the training split with two folds
fold_results = k_fold_cross_validation(X_train, y_train, k=2)

# Final model: fit on the whole training split (X_train), validate on X_val,
# and report on the test set
train_dataset = TextDataset(X=X_train_tensor, y=y_train_tensor)
val_dataset = TextDataset(X=X_val_tensor, y=y_val_tensor)
test_dataset = TextDataset(X=X_test_tensor, y=y_test_tensor)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

model = SimpleClassifier(input_size=input_size, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print("\nTraining on full training set:")
final_history = train_and_validate(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
)

# Final evaluation on test set
print("\nFinal Evaluation on Test Set:")
test_loss, final_accuracy, all_predictions, all_labels, all_probs = evaluate(
    model, test_loader, criterion, device
)
print(f"Test Accuracy: {final_accuracy:.4f}", "\nClassification Report:", sep="\n")
# Display names are positional: entry i labels encoded class i
target_names = ["world", "sport", "business", "sci/tech"]
print(classification_report(all_labels, all_predictions, target_names=target_names))

# Bundle everything the reporting cell needs
final_results = {
    'fold_results': fold_results,
    'final_history': final_history,
    'test_accuracy': final_accuracy,
    'test_predictions': all_predictions,
    'test_labels': all_labels,
    'test_probs': all_probs,
}


Fold 1
Epoch [1/10]
  Train Loss: 0.5937, Train Accuracy: 0.8653
  Val Loss: 0.3953, Val Accuracy: 0.8896
Epoch [2/10]
  Train Loss: 0.3375, Train Accuracy: 0.9040
  Val Loss: 0.3349, Val Accuracy: 0.8979
Epoch [3/10]
  Train Loss: 0.2842, Train Accuracy: 0.9143
  Val Loss: 0.3138, Val Accuracy: 0.9004
Epoch [4/10]
  Train Loss: 0.2552, Train Accuracy: 0.9206
  Val Loss: 0.3052, Val Accuracy: 0.9007
Epoch [5/10]
  Train Loss: 0.2358, Train Accuracy: 0.9257
  Val Loss: 0.3018, Val Accuracy: 0.9004
Epoch [6/10]
  Train Loss: 0.2217, Train Accuracy: 0.9297
  Val Loss: 0.3018, Val Accuracy: 0.8985
Epoch [7/10]
  Train Loss: 0.2107, Train Accuracy: 0.9326
  Val Loss: 0.3031, Val Accuracy: 0.8978
Epoch [8/10]
  Train Loss: 0.2018, Train Accuracy: 0.9353
  Val Loss: 0.3049, Val Accuracy: 0.8965
Epoch [9/10]
  Train Loss: 0.1945, Train Accuracy: 0.9372
  Val Loss: 0.3075, Val Accuracy: 0.8958
Epoch [10/10]
  Train Loss: 0.1881, Train Accuracy: 0.9386
  Val Loss: 0.3107, Val Accuracy: 0.8952



In [37]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
import os

def generate_report(model_name, final_results, target_names):
    """Write a Markdown evaluation report, plus its figures, for a trained model.

    Four PNG figures (k-fold accuracies, training history, confusion matrix,
    ROC curves) are saved under ``images/`` and the report is written to
    ``{model_name}_report.md`` in the working directory.

    Besides its arguments, the function reads these module-level names:
    ``model`` (its repr is embedded as the architecture) and ``num_epochs``,
    ``batch_size``, ``learning_rate``, ``input_size``, ``num_classes`` (listed
    in the summary). ``model_name`` only affects the title and file names.

    Args:
        model_name: Label used in the report title and as the file-name prefix.
        final_results: Dict with the keys ``'fold_results'``,
            ``'final_history'``, ``'test_accuracy'``, ``'test_predictions'``,
            ``'test_labels'`` and ``'test_probs'``.
        target_names: Class display names; position ``i`` labels class ``i``.
    """
    os.makedirs("images", exist_ok=True)
    sections = [f"# Classification Report for {model_name}\n\n"]

    # Model Architecture
    sections.append("## Model Architecture\n")
    sections.append(f"```\n{model}\n```\n\n")

    # K-fold Cross-validation Results
    sections.append("## K-fold Cross-validation Results\n")
    fold_accuracies = [entry['val_accuracy'] for entry in final_results['fold_results']]
    sections.append(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}\n")
    sections.append(f"Standard Deviation: {np.std(fold_accuracies):.4f}\n\n")

    # Bar chart with one bar per fold
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(range(1, len(fold_accuracies) + 1), fold_accuracies)
    ax.set_title('K-fold Cross-validation Accuracies')
    ax.set_xlabel('Fold')
    ax.set_ylabel('Accuracy')
    fig.savefig(f"images/{model_name}_kfold_accuracies.png")
    plt.close(fig)

    sections.append(f"![K-fold Cross-validation Accuracies](images/{model_name}_kfold_accuracies.png)\n\n")

    # Training History
    sections.append("## Training History\n")
    history = final_results['final_history']

    # Loss on the left panel, accuracy on the right panel
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))
    loss_ax.plot(history['train_loss'], label='Train Loss')
    loss_ax.plot(history['val_loss'], label='Validation Loss')
    loss_ax.set_title('Model Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    acc_ax.plot(history['train_accuracy'], label='Train Accuracy')
    acc_ax.plot(history['val_accuracy'], label='Validation Accuracy')
    acc_ax.set_title('Model Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    fig.savefig(f"images/{model_name}_training_history.png")
    plt.close(fig)

    sections.append(f"![Training History](images/{model_name}_training_history.png)\n\n")

    # Test Set Results
    sections.append("## Test Set Results\n")
    sections.append(f"Test Accuracy: {final_results['test_accuracy']:.4f}\n\n")

    # Classification Report
    sections.append("## Classification Report\n")
    sections.append("```\n")
    sections.append(
        classification_report(
            final_results['test_labels'],
            final_results['test_predictions'],
            target_names=target_names,
        )
    )
    sections.append("```\n\n")

    # Confusion Matrix
    sections.append("## Confusion Matrix\n")
    cm = confusion_matrix(final_results['test_labels'], final_results['test_predictions'])
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=target_names,
        yticklabels=target_names,
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f"images/{model_name}_confusion_matrix.png")
    plt.close(fig)

    sections.append(f"![Confusion Matrix](images/{model_name}_confusion_matrix.png)\n\n")

    # ROC Curve
    sections.append("## ROC Curve\n")
    fig, ax = plt.subplots(figsize=(10, 8))

    # Stack the per-sample probability rows into one 2-D array
    test_probs = np.array(final_results['test_probs'])

    # One-vs-rest curve per class: column i scored against pos_label=i
    for i, class_name in enumerate(target_names):
        fpr, tpr, _ = roc_curve(final_results['test_labels'], test_probs[:, i], pos_label=i)
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.2f})')

    # Dashed diagonal as the reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    fig.savefig(f"images/{model_name}_roc_curve.png")
    plt.close(fig)

    sections.append(f"![ROC Curve](images/{model_name}_roc_curve.png)\n\n")

    # Model Summary
    sections.append("## Model Summary\n")
    sections.append(f"- Number of epochs: {num_epochs}\n")
    sections.append(f"- Batch size: {batch_size}\n")
    sections.append(f"- Learning rate: {learning_rate}\n")
    sections.append(f"- Input size: {input_size}\n")
    sections.append(f"- Number of classes: {num_classes}\n")

    # Save report
    report = "".join(sections)
    with open(f"{model_name}_report.md", "w") as f:
        f.write(report)

    print(f"Report saved as {model_name}_report.md")

# Generate the report once for the trained model.
# The original cell called generate_report twice with identical arguments,
# which rewrote the same figures and report file a second time.
# To report on another model, change model_name (e.g. "EnhancedClassifier")
# and re-run this cell; note that generate_report embeds the repr of the
# global `model`, so that global must hold the model being reported.
model_name = "SimpleClassifier"
target_names = ["world", "sport", "business", "sci/tech"]
generate_report(model_name, final_results, target_names)

Report saved as SimpleClassifier_report.md
Report saved as SimpleClassifier_report.md


: 