In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from imblearn.over_sampling import SMOTE

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on recent
# NLTK releases; 'punkt' is kept for older ones. NOTE(review): on releases that
# do not know 'punkt_tab' the download call is expected to just return False.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Load the data (paths are relative to the notebook's working directory)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

def preprocess_text(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps: lowercase, replace dollar amounts with a MONEY placeholder and the
    remaining digit runs with a NUM placeholder, blank out every character
    that is not a letter, digit or whitespace, tokenize with ``word_tokenize``
    and drop English stop words.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            the NaN produced when a missing column is concatenated) is
            treated as an empty document.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Money has to be matched before bare numbers: once the digits have been
    # rewritten, the '$<digits>' pattern can no longer match anything.
    text = re.sub(r'\$\d+(\.\d{2})?', '<MONEY>', text)
    text = re.sub(r'\d+', '<NUM>', text)
    # This also strips the angle brackets, leaving the bare upper-case
    # placeholder words MONEY / NUM surrounded by spaces.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    tokens = word_tokenize(text)

    # Build the stop-word set once and cache it on the function instead of
    # re-reading the corpus for every document.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    return ' '.join(word for word in tokens if word not in stop_words)

# Preprocess the text data, concat Title and Description.
# Missing fields are replaced by '' first so that a single NaN column does not
# turn the whole concatenated document into NaN.
df_train['processed_text'] = (
    df_train["Title"].fillna("") + " " + df_train["Description"].fillna("")
).apply(preprocess_text)
df_test['processed_text'] = (
    df_test["Title"].fillna("") + " " + df_test["Description"].fillna("")
).apply(preprocess_text)

# Create TfidfVectorizer.
# float32 output: the matrices are densified and later copied into float32
# tensors, so building them in float64 would only double the memory needed.
vectorizer = TfidfVectorizer(max_features=10000, dtype=np.float32)
X_train_full = vectorizer.fit_transform(df_train['processed_text']).toarray()
X_test = vectorizer.transform(df_test['processed_text']).toarray()

# Encode labels.
# Sorting makes the encoding deterministic and ascending in the original
# 'Class Index' value, instead of depending on the row order of train.csv.
label_encoder = {label: i for i, label in enumerate(sorted(df_train['Class Index'].unique()))}
y_train_full = df_train['Class Index'].map(label_encoder).values
y_test_encoded = df_test['Class Index'].map(label_encoder)
if y_test_encoded.isna().any():
    # A label that never occurs in the training data maps to NaN and would
    # otherwise end up as a garbage integer in the LongTensor below.
    raise ValueError("test.csv contains 'Class Index' values that are absent from train.csv")
y_test = y_test_encoded.values

# Split train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)


# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.LongTensor(y_val)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)

# Create Dataset and DataLoader
class TextDataset(Dataset):
    """Map-style dataset pairing a feature container with a label container.

    Both containers are stored exactly as given and are indexed with the same
    key, so item ``i`` is ``(X[i], y[i])``.
    """

    def __init__(self, X, y):
        """Keep references to the features ``X`` and the labels ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of labels, which is used as the dataset size."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap each split's feature/label tensors in a TextDataset.
train_dataset, val_dataset, test_dataset = (
    TextDataset(split_features, split_labels)
    for split_features, split_labels in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

# Quick sanity summary of the prepared data.
print(f"Vocabulary size: {X_train.shape[1]}")
print(f"Number of classes: {len(label_encoder)}")
print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
import numpy as np
from collections import defaultdict
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Define the improved neural network
class ImprovedClassifier(nn.Module):
    """Feed-forward classifier built from repeated hidden stages.

    Every entry of ``hidden_sizes`` contributes one stage of
    Linear -> ReLU -> BatchNorm1d -> Dropout; a final Linear layer maps the
    last hidden width to ``num_classes`` raw scores (no softmax is applied).
    """

    def __init__(self, input_size, hidden_sizes, num_classes, dropout_rate):
        """Create the layer stack.

        Args:
            input_size: Width of the input feature vector.
            hidden_sizes: Widths of the hidden stages, in order.
            num_classes: Number of output scores.
            dropout_rate: Probability passed to every ``nn.Dropout``.
        """
        super().__init__()
        self.layers = nn.ModuleList()

        # Consecutive (in, out) widths: input -> hidden[0] -> hidden[1] -> ...
        widths = [input_size, *hidden_sizes]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.BatchNorm1d(fan_out),
                nn.Dropout(dropout_rate),
            ])

        # Output layer
        self.layers.append(nn.Linear(hidden_sizes[-1], num_classes))

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

# Hyperparameters, grouped by what they control.

# Model shape
input_size = X_train.shape[1]      # one input unit per TF-IDF feature column
num_classes = len(label_encoder)   # one output score per encoded label
hidden_sizes = [64, 32, 16]
dropout_rate = 0.5

# Optimisation
num_epochs = 10
batch_size = 64
learning_rate = 0.001
weight_decay = 1e-5

# One training epoch, with an optional explicit L2 penalty
def train(model, train_loader, criterion, optimizer, device, l2_lambda=0.0):
    """Run one training epoch and report the mean batch loss and accuracy.

    The optimizers created in this notebook are Adam with ``weight_decay``
    set, which already applies an L2 penalty inside the update step. An
    additional hand-written penalty is therefore off by default, so the
    weights are not regularised twice. When ``l2_lambda`` is non-zero the
    penalty is a true L2 term (sum of squared parameters).

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device every batch is moved to.
        l2_lambda: Coefficient of the optional explicit L2 penalty added to
            the optimised objective. ``0.0`` disables it.

    Returns:
        tuple: ``(mean_loss, accuracy)``. ``mean_loss`` is the criterion value
        averaged over batches, without the penalty term, so it is directly
        comparable with the loss returned by evaluation. ``accuracy`` is the
        fraction of correctly classified samples.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        data_loss = criterion(outputs, batch_y)

        objective = data_loss
        if l2_lambda:
            l2_penalty = sum(param.pow(2).sum() for param in model.parameters())
            objective = data_loss + l2_lambda * l2_penalty

        objective.backward()
        optimizer.step()

        # Log the data term only; the penalty is not part of the metric.
        total_loss += data_loss.item()
        predicted = outputs.detach().argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
        num_batches += 1

    if total == 0:
        raise ValueError("train_loader yielded no samples")

    accuracy = correct / total
    return total_loss / num_batches, accuracy

# Evaluation helper: a single pass over a loader without gradient tracking
def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on every batch of ``data_loader``.

    The model is put in eval mode and gradients are disabled for the pass.

    Args:
        model: Network to evaluate.
        data_loader: Sized iterable yielding ``(features, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device every batch is moved to.

    Returns:
        tuple: ``(mean_loss, accuracy, predictions, labels, probabilities)``
        where ``mean_loss`` is the summed batch loss divided by
        ``len(data_loader)``, ``accuracy`` comes from ``accuracy_score`` and
        the three lists hold one entry per sample (predicted class, true
        class, softmax probability vector).
    """
    model.eval()
    running_loss = 0
    predictions, targets, probabilities = [], [], []

    with torch.no_grad():
        for features, labels in data_loader:
            features, labels = features.to(device), labels.to(device)

            logits = model(features)
            running_loss += criterion(logits, labels).item()

            batch_probs = torch.softmax(logits, dim=1)
            batch_preds = torch.max(logits, 1).indices
            predictions.extend(batch_preds.cpu().numpy())
            targets.extend(labels.cpu().numpy())
            probabilities.extend(batch_probs.cpu().numpy())

    accuracy = accuracy_score(targets, predictions)
    return running_loss / len(data_loader), accuracy, predictions, targets, probabilities

def train_and_validate(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and leave ``model`` at its best epoch.

    After every epoch the model is evaluated on ``val_loader``, the scheduler
    is stepped with the validation loss and the metrics are logged. Whenever
    validation accuracy improves, the weights are snapshotted in memory and
    also written to ``best_model.pth``. When training ends, the snapshot is
    loaded back into ``model``, so callers evaluate the best epoch rather than
    whatever the last epoch produced.

    Args:
        model: Network to train; modified in place.
        train_loader: Batches passed to ``train``.
        val_loader: Batches passed to ``evaluate``.
        criterion: Loss callable shared by training and evaluation.
        optimizer: Optimizer passed to ``train``.
        scheduler: Object whose ``step(val_loss)`` is called once per epoch.
        device: Device the batches are moved to.
        num_epochs: Number of epochs to run.

    Returns:
        defaultdict: Lists keyed by ``'train_loss'``, ``'train_accuracy'``,
        ``'val_loss'`` and ``'val_accuracy'``, one entry per epoch.
    """
    import copy  # local import: only needed for the in-memory snapshot

    history = defaultdict(list)
    best_val_accuracy = 0
    best_state = None

    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_accuracy, _, _, _ = evaluate(model, val_loader, criterion, device)

        scheduler.step(val_loss)

        history['train_loss'].append(train_loss)
        history['train_accuracy'].append(train_accuracy)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)

        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Deep copy: state_dict() returns references to the live tensors,
            # which later epochs keep updating.
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), 'best_model.pth')

    # Restore the best epoch; nothing to restore if accuracy never rose above 0.
    if best_state is not None:
        model.load_state_dict(best_state)

    return history

def k_fold_cross_validation(X, y, k=5):
    """Train and validate a fresh model on each of ``k`` shuffled folds.

    Besides its arguments, the function reads these notebook-level names:
    ``batch_size``, ``input_size``, ``hidden_sizes``, ``num_classes``,
    ``dropout_rate``, ``learning_rate``, ``weight_decay``, ``num_epochs`` and
    ``device``. They must be defined before it is called.

    Args:
        X: Feature matrix, indexed with the integer index arrays produced by
            ``KFold.split`` (assumes a NumPy array - TODO confirm for other
            containers).
        y: Labels aligned with ``X`` and indexed the same way.
        k: Number of folds.

    Returns:
        list: One dict per fold with keys ``'history'``, ``'val_accuracy'``,
        ``'val_predictions'``, ``'val_labels'`` and ``'val_probs'``.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        print(f"\nFold {fold}")

        # Split data
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Create datasets and dataloaders
        fold_train_dataset = TextDataset(torch.FloatTensor(X_train_fold), torch.LongTensor(y_train_fold))
        fold_val_dataset = TextDataset(torch.FloatTensor(X_val_fold), torch.LongTensor(y_val_fold))
        train_loader = DataLoader(fold_train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(fold_val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model, criterion, and optimizer (fresh for every fold)
        model = ImprovedClassifier(input_size, hidden_sizes, num_classes, dropout_rate).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        # `verbose` is deliberately not passed: it is deprecated on
        # ReduceLROnPlateau and not accepted by newer PyTorch releases.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

        # Train and validate
        history = train_and_validate(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs)

        # Evaluate on validation set
        val_loss, val_accuracy, val_predictions, val_labels, val_probs = evaluate(model, val_loader, criterion, device)
        fold_results.append({
            'history': history,
            'val_accuracy': val_accuracy,
            'val_predictions': val_predictions,
            'val_labels': val_labels,
            'val_probs': val_probs
        })

    print("\nK-fold Cross-validation Results:")
    fold_accuracies = [result['val_accuracy'] for result in fold_results]
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Standard Deviation: {std_accuracy:.4f}")

    return fold_results

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch so weight initialisation, dropout masks and batch shuffling are
# as repeatable as the backend allows (the data splits already use a fixed seed).
torch.manual_seed(42)

# Perform k-fold cross-validation
fold_results = k_fold_cross_validation(X_train, y_train, k=2)

# Train on the training split (the validation split stays held out) and
# evaluate on the test set.
loader_options = dict(
    batch_size=batch_size,
    num_workers=4,
    persistent_workers=True,
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only run just triggers a warning.
    pin_memory=(device.type == "cuda"),
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

model = ImprovedClassifier(input_size, hidden_sizes, num_classes, dropout_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# `verbose` is deliberately not passed: it is deprecated on ReduceLROnPlateau
# and not accepted by newer PyTorch releases.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

print("\nTraining on full training set:")
final_history = train_and_validate(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs)

# Final evaluation on test set
print("\nFinal Evaluation on Test Set:")
test_loss, final_accuracy, all_predictions, all_labels, all_probs = evaluate(model, test_loader, criterion, device)
print(f"Test Accuracy: {final_accuracy:.4f}")
print("\nClassification Report:")
# The names are listed in ascending order of the original 'Class Index' value.
# classification_report pairs target_names with the sorted *encoded* labels, so
# order the names by encoded index instead of assuming both orders coincide.
class_name_by_label = dict(zip(sorted(label_encoder), ["world", "sport", "business", "sci/tech"]))
target_names = [class_name_by_label[label] for label in sorted(label_encoder, key=label_encoder.get)]
print(classification_report(all_labels, all_predictions, target_names=target_names))

# Store final results
final_results = {
    'fold_results': fold_results,
    'final_history': final_history,
    'test_accuracy': final_accuracy,
    'test_predictions': all_predictions,
    'test_labels': all_labels,
    'test_probs': all_probs
}

: 

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
import os

def _save_kfold_plot(fold_accuracies, path):
    """Save a bar chart with one bar per fold accuracy to ``path``."""
    plt.figure(figsize=(10, 5))
    plt.bar(range(1, len(fold_accuracies) + 1), fold_accuracies)
    plt.title('K-fold Cross-validation Accuracies')
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.savefig(path)
    plt.close()


def _save_history_plot(history, path):
    """Save side-by-side train/validation loss and accuracy curves to ``path``."""
    plt.figure(figsize=(12, 5))
    panels = (
        (1, 'train_loss', 'Train Loss', 'val_loss', 'Validation Loss', 'Model Loss', 'Loss'),
        (2, 'train_accuracy', 'Train Accuracy', 'val_accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
    )
    for position, train_key, train_label, val_key, val_label, title, ylabel in panels:
        plt.subplot(1, 2, position)
        plt.plot(history[train_key], label=train_label)
        plt.plot(history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def _save_confusion_plot(test_labels, test_predictions, target_names, path):
    """Save a heatmap of the test-set confusion matrix to ``path``."""
    cm = confusion_matrix(test_labels, test_predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(path)
    plt.close()


def _save_roc_plot(test_labels, test_probs, target_names, path):
    """Save one-vs-rest ROC curves, one per entry of ``target_names``, to ``path``."""
    plt.figure(figsize=(10, 8))

    # Column i of the probability matrix is scored against class index i.
    probs = np.array(test_probs)
    for i, class_name in enumerate(target_names):
        fpr, tpr, _ = roc_curve(test_labels, probs[:, i], pos_label=i)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.savefig(path)
    plt.close()


def generate_report(model_name, final_results, target_names, model=None):
    """Write a Markdown report plus its figures for a trained classifier.

    Figures are saved as PNG files under ``images/`` and the report is written
    to ``{model_name}_report.md`` in the working directory.

    The "Model Summary" section reads the notebook-level names ``num_epochs``,
    ``batch_size``, ``learning_rate``, ``input_size`` and ``num_classes``.

    Args:
        model_name: Label used in the report title and in every file name.
        final_results: Dict with keys ``'fold_results'`` (list of dicts that
            each have ``'val_accuracy'``), ``'final_history'``,
            ``'test_accuracy'``, ``'test_predictions'``, ``'test_labels'`` and
            ``'test_probs'``.
        target_names: Class names, ordered by class index.
        model: Object whose string form is embedded under "Model
            Architecture". When omitted, the notebook-level ``model`` is used,
            which keeps existing three-argument calls working.

    Raises:
        NameError: If ``model`` is omitted and no notebook-level ``model``
            exists.
    """
    if model is None:
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    os.makedirs("images", exist_ok=True)

    kfold_png = f"images/{model_name}_kfold_accuracies.png"
    history_png = f"images/{model_name}_training_history.png"
    confusion_png = f"images/{model_name}_confusion_matrix.png"
    roc_png = f"images/{model_name}_roc_curve.png"

    parts = [f"# Classification Report for {model_name}\n\n"]

    # Model Architecture
    parts.append("## Model Architecture\n")
    parts.append(f"```\n{model}\n```\n\n")

    # K-fold Cross-validation Results
    fold_accuracies = [result['val_accuracy'] for result in final_results['fold_results']]
    parts.append("## K-fold Cross-validation Results\n")
    parts.append(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}\n")
    parts.append(f"Standard Deviation: {np.std(fold_accuracies):.4f}\n\n")
    _save_kfold_plot(fold_accuracies, kfold_png)
    parts.append(f"![K-fold Cross-validation Accuracies]({kfold_png})\n\n")

    # Training History
    parts.append("## Training History\n")
    _save_history_plot(final_results['final_history'], history_png)
    parts.append(f"![Training History]({history_png})\n\n")

    # Test Set Results
    parts.append("## Test Set Results\n")
    parts.append(f"Test Accuracy: {final_results['test_accuracy']:.4f}\n\n")

    # Classification Report
    parts.append("## Classification Report\n")
    parts.append("```\n")
    parts.append(classification_report(final_results['test_labels'], final_results['test_predictions'], target_names=target_names))
    parts.append("```\n\n")

    # Confusion Matrix
    parts.append("## Confusion Matrix\n")
    _save_confusion_plot(final_results['test_labels'], final_results['test_predictions'], target_names, confusion_png)
    parts.append(f"![Confusion Matrix]({confusion_png})\n\n")

    # ROC Curve
    parts.append("## ROC Curve\n")
    _save_roc_plot(final_results['test_labels'], final_results['test_probs'], target_names, roc_png)
    parts.append(f"![ROC Curve]({roc_png})\n\n")

    # Model Summary
    parts.append("## Model Summary\n")
    parts.append(f"- Number of epochs: {num_epochs}\n")
    parts.append(f"- Batch size: {batch_size}\n")
    parts.append(f"- Learning rate: {learning_rate}\n")
    parts.append(f"- Input size: {input_size}\n")
    parts.append(f"- Number of classes: {num_classes}\n")

    # Save report. The encoding is explicit so the file does not depend on the
    # platform's default locale.
    with open(f"{model_name}_report.md", "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"Report saved as {model_name}_report.md")
    
# Build the Markdown report and figures for the trained model
model_name = "ImprovedClassifier"
generate_report(model_name, final_results, target_names)

# Closing note: suggested alternative to TF-IDF input encoding
print(
    "\nSuggestion for alternative input encoding method:",
    "Consider using word embeddings, such as Word2Vec or GloVe, instead of TF-IDF.",
    "Word embeddings capture semantic relationships between words and may yield better results.",
    "To implement this, you would need to:",
    "1. Use a pre-trained word embedding model or train your own on a large corpus.",
    "2. Convert each document to a sequence of word vectors.",
    "3. Use padding to ensure all sequences have the same length.",
    "4. Modify the model architecture to include an embedding layer and possibly use RNNs or CNNs.",
    "This approach could potentially capture more nuanced relationships in the text data.",
    sep="\n",
)