# Software Defect Prediction using PyTorch

This notebook implements multi-label defect prediction models using PyTorch.

In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, hamming_loss, multilabel_confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Seed NumPy and PyTorch with the same value so random draws repeat across runs
np.random.seed(42)
torch.manual_seed(42)

# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## Generate Synthetic Data

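Since we don't have a real dataset for defect prediction, we'll generate synthetic data. The labels are drawn at random, independently of the features, so the metrics reported below only exercise the pipeline end to end; they do not measure real predictive power.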

In [None]:
# Defect categories; every sample carries one independent 0/1 flag per category
defect_labels = ['Security', 'Performance', 'Maintainability', 'Reliability', 'Functional']
num_classes = len(defect_labels)

# Size of the synthetic dataset
num_samples = 1000
num_features = 20

# Column names for the feature matrix (one per feature)
feature_names = [
    "LOC", "Cyclomatic Complexity", "Nesting Depth", "Comment Density",
    "Code Churn", "Coupling", "Cohesion", "Unique Operands",
    "Unique Operators", "Branch Count", "Loop Count", "Parameter Count",
    "Fan-in", "Fan-out", "Halstead Difficulty", "Halstead Volume",
    "Halstead Effort", "Dependency Count", "Age", "Dev Experience"
]

# Features: uniform draws stretched from [0, 1) to [0, 10)
X = 10 * np.random.rand(num_samples, num_features)

# Labels: filled row by row; the order of random draws below is kept deliberately
# so the seeded data stays identical from run to run
y = np.zeros((num_samples, num_classes))
for row in y:
    # Switch each defect type on independently with probability 0.3
    for col in range(num_classes):
        if np.random.rand() < 0.3:
            row[col] = 1

    # Always draw the back-fill coin; when it lands (60% chance) and the row has
    # no defect at all, assign one defect type chosen uniformly at random
    backfill = np.random.rand() < 0.6
    if backfill and not row.any():
        row[np.random.randint(0, num_classes)] = 1

# Wrap the arrays in DataFrames so the notebook renders them as tables
X_df = pd.DataFrame(X, columns=feature_names)
y_df = pd.DataFrame(y, columns=defect_labels)

# Preview the first rows of features and labels
print("Feature data sample:")
display(X_df.head())

print("\nLabel data sample:")
display(y_df.head())

print(f"\nData shapes: X: {X.shape}, y: {y.shape}")

# Number of positive samples per defect type
label_counts = y_df.sum()
print("\nLabel distribution:")
display(label_counts)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features with statistics learned from the training split only,
# then apply that same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Copy everything into float32 tensors, the dtype the network and BCELoss work with
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Pair features with labels so a loader yields (inputs, labels) batches
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are reshuffled every epoch; test batches keep their order
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Define Models

In [None]:
# Feed-forward network used for multi-label defect classification
class MultiLabelDNN(nn.Module):
    """Two-hidden-layer MLP that emits one sigmoid probability per class.

    The network is ``Linear -> ReLU -> Dropout`` twice (the second hidden layer
    is half as wide as the first), followed by a linear output layer and a
    sigmoid. Because the sigmoid is part of the model, the outputs are
    probabilities and pair with ``nn.BCELoss``.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output units (one per label).
        hidden_size: Width of the first hidden layer; the second uses
            ``hidden_size // 2``.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, num_classes, hidden_size=128, dropout_rate=0.3):
        super().__init__()

        # Remember the constructor arguments so the architecture can be inspected later
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.num_classes = num_classes

        narrow_size = hidden_size // 2

        # First hidden block: input_size -> hidden_size
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        # Second hidden block: hidden_size -> hidden_size // 2
        self.layer2 = nn.Linear(hidden_size, narrow_size)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)

        # Output head: one unit per class, squashed into (0, 1)
        self.output_layer = nn.Linear(narrow_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-class probabilities for the batch ``x``."""
        hidden = self.layer1(x)
        hidden = self.relu1(hidden)
        hidden = self.dropout1(hidden)

        hidden = self.layer2(hidden)
        hidden = self.relu2(hidden)
        hidden = self.dropout2(hidden)

        scores = self.output_layer(hidden)
        return self.sigmoid(scores)

## Train and Evaluate Models

In [None]:
# Function to train a PyTorch model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and print the mean per-sample loss of every epoch.

    Args:
        model: ``nn.Module`` to optimize. Batches are moved to the device of its
            first parameter; a model without parameters receives them unmoved.
        train_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor that supports ``backward()``.
        optimizer: Optimizer built over ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    # Follow the model's own device instead of a notebook-level global, so the
    # function also works when the model was placed somewhere else.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        samples_seen = 0
        for inputs, labels in train_loader:
            if target_device is not None:
                inputs, labels = inputs.to(target_device), labels.to(target_device)

            # Zero the parameter gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size so short batches count correctly
            batch_count = inputs.size(0)
            running_loss += loss.item() * batch_count
            samples_seen += batch_count

        # Average over the samples actually processed: with drop_last=True or a
        # subsampling sampler this differs from len(train_loader.dataset).
        epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    return model

# Helper: draw the per-label confusion matrices for evaluate_model
def _plot_confusion_matrices(conf_matrices, defect_labels, max_cols=3):
    """Show one 2x2 confusion-matrix heatmap per label on a grid sized to fit them all."""
    n_plots = min(len(defect_labels), len(conf_matrices))
    if n_plots == 0:
        return

    # Up to max_cols panels per row, with as many rows as needed (ceiling division)
    n_cols = min(max_cols, n_plots)
    n_rows = (n_plots + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, label, cm in zip(axes, defect_labels, conf_matrices):
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=['No Defect', 'Defect'],
                    yticklabels=['No Defect', 'Defect'])
        ax.set_title(f'{label} Defects')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')

    # Remove every unused panel, not just the last one
    for ax in axes[n_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()


# Function to evaluate a PyTorch model for multi-label classification
def evaluate_model(model, test_loader, defect_labels, threshold=0.5):
    """Evaluate a multi-label model, print its metrics and plot confusion matrices.

    Args:
        model: ``nn.Module`` whose forward pass returns per-class probabilities.
            Inputs are moved to the device of its first parameter.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor batches with
            0/1 indicator labels.
        defect_labels: Class names, in the same order as the label columns.
        threshold: Probability at or above which a class is predicted positive.

    Returns:
        Dict with ``accuracy`` (exact-match over all labels), ``hamming_loss``,
        micro- and macro-averaged precision/recall/F1, ``per_class`` metrics
        keyed by label name, and the stacked ``y_true``, ``y_pred``, ``y_prob``
        arrays.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    # Follow the model's own device instead of a notebook-level global
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    true_batches = []
    prob_batches = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            if target_device is not None:
                inputs = inputs.to(target_device)
            outputs = model(inputs)

            # Labels are only compared on the CPU, so they never need the model's device
            prob_batches.append(outputs.cpu().numpy())
            true_batches.append(labels.cpu().numpy())

    if not prob_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Stack the batches into (num_samples, num_classes) arrays
    y_true = np.concatenate(true_batches, axis=0)
    y_prob = np.concatenate(prob_batches, axis=0)

    # Convert probabilities to 0/1 predictions
    y_pred = (y_prob >= threshold).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    hl = hamming_loss(y_true, y_pred)

    # Calculate per-class metrics. zero_division=0 keeps the value sklearn
    # already reports for an undefined metric but silences the warning that
    # fires whenever a class has no positive predictions (or no positive truths).
    results = {}
    for i, label in enumerate(defect_labels):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred[:, i], average='binary', zero_division=0
        )
        results[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    # Micro average pools TP/FP/FN over every (sample, label) cell, which is the
    # same quantity as flattening the arrays and scoring them as one binary task
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        y_true, y_pred, average='micro', zero_division=0
    )

    # Macro average is the unweighted mean of the per-label scores
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Hamming Loss: {hl:.4f}")
    print(f"Micro-average Precision: {precision_micro:.4f}")
    print(f"Micro-average Recall: {recall_micro:.4f}")
    print(f"Micro-average F1: {f1_micro:.4f}")
    print(f"Macro-average Precision: {precision_macro:.4f}")
    print(f"Macro-average Recall: {recall_macro:.4f}")
    print(f"Macro-average F1: {f1_macro:.4f}")

    print("\nPer-class metrics:")
    for label, metrics in results.items():
        print(f"{label}: Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, F1={metrics['f1']:.4f}")

    # Plot confusion matrices (one 2x2 matrix per label)
    conf_matrices = multilabel_confusion_matrix(y_true, y_pred)
    _plot_confusion_matrices(conf_matrices, defect_labels)

    return {
        'accuracy': accuracy,
        'hamming_loss': hl,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'per_class': results,
        'y_true': y_true,
        'y_pred': y_pred,
        'y_prob': y_prob
    }

In [None]:
# Build the network (sized from the scaled training matrix), its loss and its optimizer
input_size = X_train_scaled.shape[1]
dnn_model = MultiLabelDNN(input_size=input_size, num_classes=num_classes)
dnn_model = dnn_model.to(device)
# BCELoss expects probabilities, which the model's final sigmoid already provides
criterion = nn.BCELoss()
optimizer = optim.Adam(dnn_model.parameters(), lr=1e-3)

# Fit the model for ten passes over the training loader
print("Training DNN model...")
dnn_model = train_model(
    model=dnn_model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)

# Score the trained model on the held-out split
print("\nEvaluating DNN model...")
dnn_results = evaluate_model(
    model=dnn_model,
    test_loader=test_loader,
    defect_labels=defect_labels,
)

## Feature Importance Analysis

In [None]:
# Rank input features by the magnitude of the first layer's weights
def analyze_feature_importance(model, feature_names):
    """Plot and return a first-layer-weight importance score for every feature.

    A feature's score is the mean absolute value of the weights connecting it to
    the units of ``model.layer1``.

    Args:
        model: Model exposing a ``layer1`` linear layer.
        feature_names: One name per input feature, in input-column order.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns, sorted from the
        most to the least important feature.
    """
    # layer1.weight has one row per hidden unit and one column per input feature
    first_layer_weights = model.layer1.weight.detach().cpu().numpy()

    # Average the absolute weights over the hidden units (rows)
    scores = np.mean(np.abs(first_layer_weights), axis=0)

    # Tabulate, then order from highest to lowest score
    table = pd.DataFrame({'Feature': feature_names, 'Importance': scores})
    ranked = table.sort_values(by='Importance', ascending=False)

    # Horizontal bar chart of the ranking
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=ranked)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

    return ranked

# Rank the features for the trained DNN and show the resulting table
importance_df = analyze_feature_importance(model=dnn_model, feature_names=feature_names)
display(importance_df)

## Save Models

In [None]:
# Single place that decides where every artifact goes; create it if missing
model_dir = '../models/defect'
os.makedirs(model_dir, exist_ok=True)

# Save the DNN weights (state_dict only, so the MultiLabelDNN class is needed to reload them)
torch.save(dnn_model.state_dict(), os.path.join(model_dir, 'dnn_model.pth'))

# Save the fitted scaler so new inputs can be standardized the same way
joblib.dump(scaler, os.path.join(model_dir, 'scaler.joblib'))

# Save the defect labels, one per line, in output-column order.
# The encoding is explicit so the file does not depend on the platform default.
with open(os.path.join(model_dir, 'defect_labels.txt'), 'w', encoding='utf-8') as f:
    f.writelines(f"{label}\n" for label in defect_labels)

print("Models, scaler, and labels saved successfully.")