In [None]:
# Cell 1: Function to train and predict with SVM, KNN, RF
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import os

def _split_features_target(df):
    """Split a defect dataset into its feature matrix and binary target.

    'name' (identifier) and 'bug' (raw bug count) are removed from the
    features; the target is 1 where the bug count is > 0, else 0.
    """
    features = df.drop(['name', 'bug'], axis=1)
    target = (df['bug'] > 0).astype(int)
    return features, target


def train_and_predict(train_path, test_path, output_filename, metrics_filename, output_folder='output/predictions/same-project'):
    """
    Train SVM, KNN, and RF models on training data and predict on test data.

    Both CSV files must contain a 'name' column (identifier) and a 'bug'
    column (bug count); every other column is used as a feature. The target
    is binarised as ``bug > 0``. Features are standardised with a scaler
    fitted on the training data only.

    Two CSV files are written into ``output_folder``:
      * predictions: columns Name, SVM, KNN, RF (predicted probability of the
        buggy class, in percent, rounded to 2 decimals) and Bug (actual label)
      * metrics: one row per model with Accuracy, Precision, Recall,
        F1-Score and AUC (rounded to 2 decimals)

    Args:
        train_path: Path to training CSV file
        test_path: Path to test CSV file
        output_filename: Name for output predictions CSV file
        metrics_filename: Name for output metrics CSV file
        output_folder: Folder path for output files (default: 'output/predictions/same-project').
            An empty string writes into the current working directory.

    Returns:
        Tuple of (predictions DataFrame, metrics DataFrame)

    Raises:
        ValueError: If the training data contains only one class, or if the
            train and test files do not have the same feature columns.
        KeyError: If 'name' or 'bug' is missing from either file.
        Exception: Any other error is printed as "Error: ..." and re-raised.
    """
    try:
        # Load datasets
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        X_train, y_train = _split_features_target(train_df)
        X_test, test_bugs = _split_features_target(test_df)
        # Kept separately so the identifiers can be written next to the predictions
        test_names = test_df['name'].copy()

        # Fail early with a clear message: a classifier cannot be trained
        # (and column 1 of predict_proba does not exist) with a single class.
        if y_train.nunique() < 2:
            raise ValueError(
                "Training data must contain both buggy and non-buggy samples; "
                f"found only class(es) {sorted(y_train.unique().tolist())}"
            )

        # The scaler and models are positional under the hood, so the test
        # features must be the same columns in the same order as in training.
        missing = [col for col in X_train.columns if col not in X_test.columns]
        unexpected = [col for col in X_test.columns if col not in X_train.columns]
        if missing or unexpected:
            raise ValueError(
                "Train/test feature columns do not match "
                f"(missing in test: {missing}, unexpected in test: {unexpected})"
            )
        X_test = X_test[X_train.columns]

        # Scale features; statistics come from the training set only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Insertion order defines the column/row order of both output files
        models = {
            'SVM': SVC(probability=True, random_state=42),
            'KNN': KNeighborsClassifier(n_neighbors=5),
            'RF': RandomForestClassifier(random_state=42),
        }

        # AUC is undefined when the test labels contain a single class
        auc_defined = test_bugs.nunique() > 1

        output_columns = {'Name': test_names}
        metrics_data = []
        for model_name, model in models.items():
            model.fit(X_train_scaled, y_train)
            # classes_ is [0, 1] here (checked above), so column 1 is P(buggy)
            y_proba = model.predict_proba(X_test_scaled)[:, 1]
            y_pred = model.predict(X_test_scaled)

            # Reported as a percentage with 2 decimal places
            output_columns[model_name] = (y_proba * 100).round(2)

            metrics_data.append({
                'Method': model_name,
                'Accuracy': accuracy_score(test_bugs, y_pred),
                'Precision': precision_score(test_bugs, y_pred, zero_division=0),
                'Recall': recall_score(test_bugs, y_pred, zero_division=0),
                'F1-Score': f1_score(test_bugs, y_pred, zero_division=0),
                'AUC': roc_auc_score(test_bugs, y_proba) if auc_defined else np.nan
            })

        output_columns['Bug'] = test_bugs
        output_df = pd.DataFrame(output_columns)

        metrics_df = pd.DataFrame(metrics_data)
        metric_cols = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
        metrics_df[metric_cols] = metrics_df[metric_cols].round(2)

        # Create output directory if it doesn't exist
        # (os.makedirs('') raises, so an empty folder means "current directory")
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        # Save predictions to CSV
        output_path = os.path.join(output_folder, output_filename)
        output_df.to_csv(output_path, index=False)

        # Save metrics to CSV
        metrics_path = os.path.join(output_folder, metrics_filename)
        metrics_df.to_csv(metrics_path, index=False)

        print(f"✓ Predictions saved to {output_path}")
        print(f"✓ Metrics saved to {metrics_path}")
        print(f"✓ Predictions shape: {output_df.shape}")
        print("\nFirst 5 predictions:")
        print(output_df.head())
        print("\nMetrics:")
        print(metrics_df.to_string(index=False))
        print(f"\nActual bug distribution: {test_bugs.value_counts().to_dict()}")

        return output_df, metrics_df

    except Exception as e:
        # Surface a short message in the notebook output, then propagate
        print(f"Error: {str(e)}")
        raise

# Run 1 (same project): fit on ant-1.6, evaluate on ant-1.7.
# Returns the per-class predictions table and the per-model metrics table.
result1_pred, result1_metrics = train_and_predict(
    train_path='datasets/same-project/train/ant-1.6.csv',
    test_path='datasets/same-project/test/ant-1.7.csv',
    output_filename='ant16_to_ant17.csv',
    metrics_filename='ant16_to_ant17_result.csv',
    output_folder='output/predictions/same-project',
)

✓ Predictions saved to output/predictions/same-project/ant16_to_ant17.csv
✓ Metrics saved to output/predictions/same-project/ant16_to_ant17_result.csv
✓ Predictions shape: (745, 5)

First 5 predictions:
                                                Name    SVM   KNN    RF  Bug
0  org.apache.tools.ant.taskdefs.rmic.RmicAdapter...  16.22  40.0  47.0    0
1  org.apache.tools.ant.taskdefs.optional.perforc...   8.65   0.0   5.0    0
2  org.apache.tools.ant.taskdefs.optional.junit.O...  14.13  20.0   0.0    0
3  org.apache.tools.ant.taskdefs.optional.perforc...  13.79  40.0  13.0    0
4              org.apache.tools.ant.taskdefs.WaitFor  12.69   0.0   4.0    1

Metrics:
Method  Accuracy  Precision  Recall  F1-Score  AUC
   SVM      0.80       0.58    0.45      0.51 0.80
   KNN      0.79       0.52    0.49      0.50 0.76
    RF      0.79       0.53    0.47      0.50 0.80

Actual bug distribution: {0: 579, 1: 166}


In [None]:
# Cell 2: fit on ant-1.7, evaluate on ivy-1.1.
# Results are written under the cross-project predictions folder.
result2_pred, result2_metrics = train_and_predict(
    train_path='datasets/same-project/train/ant-1.7.csv',
    test_path='datasets/same-project/test/ivy-1.1.csv',
    output_filename='ant17_to_ivy11.csv',
    metrics_filename='ant17_to_ivy11_result.csv',
    output_folder='output/predictions/cross-project',
)

✓ Predictions saved to output/predictions/cross-project/ant17_to_ivy11.csv
✓ Metrics saved to output/predictions/cross-project/ant17_to_ivy11_result.csv
✓ Predictions shape: (241, 5)

First 5 predictions:
                                                Name    SVM   KNN    RF  Bug
0  fr.jayasoft.ivy.repository.vfs.IvyWebdavFileSy...  14.87  40.0  16.0    0
1            fr.jayasoft.ivy.util.EncrytedProperties  12.70   0.0   6.0    0
2     fr.jayasoft.ivy.xml.XmlModuleDescriptorUpdater  18.29  20.0  34.0    1
3            fr.jayasoft.ivy.resolver.IvyRepResolver  33.08  40.0  52.0    0
4  fr.jayasoft.ivy.event.resolve.StartResolveDepe...  13.40   0.0   7.0    0

Metrics:
Method  Accuracy  Precision  Recall  F1-Score  AUC
   SVM      0.90       0.23    0.19      0.21 0.69
   KNN      0.85       0.12    0.19      0.14 0.63
    RF      0.86       0.19    0.31      0.23 0.71

Actual bug distribution: {0: 225, 1: 16}


In [None]:
# Cell 3: fit on camel-1.6, evaluate on synapse-1.2.
# Results are written under the cross-project predictions folder.
result3_pred, result3_metrics = train_and_predict(
    train_path='datasets/integration-messaging/train/camel-1.6.csv',
    test_path='datasets/integration-messaging/test/synapse-1.2.csv',
    output_filename='camel16_to_synapse12.csv',
    metrics_filename='camel16_to_synapse12_result.csv',
    output_folder='output/predictions/cross-project',
)

✓ Predictions saved to output/predictions/cross-project/camel16_to_synapse12.csv
✓ Metrics saved to output/predictions/cross-project/camel16_to_synapse12_result.csv
✓ Predictions shape: (256, 5)

First 5 predictions:
                                                Name    SVM   KNN     RF  Bug
0   org.apache.synapse.endpoints.LoadbalanceEndpoint  37.92  60.0  53.00    1
1  org.apache.synapse.util.concurrent.SynapseThre...  17.28   0.0  27.00    0
2      org.apache.synapse.mediators.AbstractMediator  82.74  80.0  72.00    1
3  org.apache.synapse.config.xml.AnonymousListMed...  19.09  60.0  16.67    0
4  org.apache.synapse.endpoints.algorithms.Loadba...  17.24   0.0  13.00    1

Metrics:
Method  Accuracy  Precision  Recall  F1-Score  AUC
   SVM      0.68       0.62    0.09      0.16 0.70
   KNN      0.66       0.49    0.34      0.40 0.61
    RF      0.70       0.63    0.28      0.39 0.66

Actual bug distribution: {0: 170, 1: 86}
