# Import Required Libraries
Import necessary libraries such as pandas, numpy, matplotlib, seaborn, and any others needed for data analysis and visualization.

In [None]:
import ast
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation notices raised by the plotting and data libraries.
warnings.filterwarnings('ignore')

# Set style for plots.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# earlier releases only know the plain 'seaborn' name. Pick whichever this
# installation provides (falling back to 'default') so the cell never raises
# on an unknown style name.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette("husl")

# Load the Dataset
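Read every CSV file in the `incremental_tasks_csv` directory into its own pandas DataFrame, then concatenate them into a single combined DataFrame (`all_data`) used by the rest of the notebook.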

In [None]:
# Load all CSV files from incremental_tasks_csv directory.
# `dfs` maps each file stem (e.g. 'task1_train') to its DataFrame.
data_dir = Path('incremental_tasks_csv')
dfs = {}

print("Loading CSV files...")
# Path.glob yields files in an arbitrary, filesystem-dependent order; sorting
# keeps the row order of the combined frame identical on every run.
for file_path in sorted(data_dir.glob('*.csv')):
    name = file_path.stem  # e.g., 'task1_train'
    print(f"Loading {name}.csv...")
    dfs[name] = pd.read_csv(file_path)
    print(f"Loaded {name}.csv with shape: {dfs[name].shape}")

# pd.concat fails with an opaque "No objects to concatenate" on empty input;
# report the actual problem (wrong working directory / missing data) instead.
if not dfs:
    raise FileNotFoundError(f"No CSV files found in '{data_dir.resolve()}'")

# Combine all dataframes for overall analysis
all_data = pd.concat(list(dfs.values()), ignore_index=True)
print(f"\nCombined dataset shape: {all_data.shape}")

# Understand Data Structure and Sample Data
Examine the shape, columns, data types, and display sample rows to understand the dataset structure.

In [None]:
# Display basic information about the combined dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would append a stray "None" line to the output.
all_data.info()

print("\n" + "="*50)
print("Column Names:")
print(all_data.columns.tolist())

print("\n" + "="*50)
print("Data Types:")
print(all_data.dtypes)

print("\n" + "="*50)
print("Sample Data (first 5 rows):")
display(all_data.head())

print("\n" + "="*50)
print("Sample Data (random 5 rows):")
# A fixed seed keeps the preview identical across Restart & Run All; capping
# n avoids the ValueError sample() raises when the frame has fewer than 5 rows.
display(all_data.sample(n=min(5, len(all_data)), random_state=42))

print("\n" + "="*50)
print("Unique values in key columns:")
key_columns = ['is_vul', 'task', 'Base Severity', 'severity']
# Only report the key columns that this dataset actually contains.
for col in key_columns:
    if col in all_data.columns:
        print(f"{col}: {all_data[col].unique()}")

# Basic Data Statistics
Compute summary statistics, check for missing values, and analyze distributions of key features related to code vulnerabilities.

In [None]:
# Missing-value overview: absolute count and share of rows, per column.
print("Missing Values Summary:")
missing_data = all_data.isnull().sum()
missing_percent = missing_data.div(len(all_data)).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
# Show only the columns that have at least one missing entry.
display(missing_df.loc[missing_df['Missing Count'].gt(0)])

print("\n" + "="*50)
print("Summary Statistics for Numerical Columns:")
numerical_cols = all_data.select_dtypes(include=[np.number]).columns
display(all_data.loc[:, numerical_cols].describe())

print("\n" + "="*50)
print("Value Counts for Categorical Columns:")
categorical_cols = ['is_vul', 'task', 'Base Severity', 'severity', 'source']
# Skip any expected categorical column that this dataset does not contain.
for col in (name for name in categorical_cols if name in all_data.columns):
    print(f"\n{col} distribution:")
    display(all_data[col].value_counts())
    share = (all_data[col].value_counts(normalize=True) * 100).round(2)
    print(f"Percentage: \n{share}")

print("\n" + "="*50)
print("Vulnerability Analysis:")
vul_counts = all_data['is_vul'].value_counts()
n_rows = len(all_data)
# .get(..., 0) covers the case where one of the two classes is absent.
n_vulnerable = vul_counts.get(1, 0)
n_clean = vul_counts.get(0, 0)
print(f"Total samples: {n_rows}")
print(f"Vulnerable samples: {n_vulnerable} ({n_vulnerable/n_rows*100:.2f}%)")
print(f"Non-vulnerable samples: {n_clean} ({n_clean/n_rows*100:.2f}%)")


def _rate_percent(flags):
    """Return the share of positive labels in `flags` as a percentage."""
    return flags.sum() / flags.count() * 100


# Task-wise vulnerability distribution
print("\nVulnerability by Task:")
task_vul = all_data.groupby('task')['is_vul'].agg(**{
    'Total Samples': 'count',
    'Vulnerable Samples': 'sum',
    'Vulnerability Rate (%)': _rate_percent,
})
display(task_vul)

# Data Cleaning and Preprocessing
Drop rows that are missing critical labels, cast key columns to numeric types, and remove duplicate rows so the data is ready for EDA. Outliers are left untouched at this stage.

In [None]:
# Data Cleaning
print("Data Cleaning Steps:")

# Drop rows that lack a critical label BEFORE casting: astype(int) raises on
# NaN, so casting first would crash the cell and never reach this check.
critical_cols = ['is_vul', 'task']
for col in critical_cols:
    if all_data[col].isnull().any():
        print(f"Dropping rows with missing {col}")
        all_data = all_data.dropna(subset=[col])

# Convert data types. astype() returns a new frame, so the assignments below
# never write into a view of the frame produced by dropna().
all_data = all_data.astype({'is_vul': int, 'task': int})

if 'cvss_is_v3' in all_data.columns:
    # An int column cannot hold NaN; keep missing flags as NaN in a float
    # column instead of failing the cast.
    cvss_flag_dtype = float if all_data['cvss_is_v3'].isnull().any() else int
    all_data = all_data.astype({'cvss_is_v3': cvss_flag_dtype})

# Convert Base Score to float; unparseable values become NaN.
if 'Base Score' in all_data.columns:
    all_data['Base Score'] = pd.to_numeric(all_data['Base Score'], errors='coerce')

# Remove duplicates if any
initial_shape = all_data.shape
all_data = all_data.drop_duplicates()
print(f"Removed {initial_shape[0] - all_data.shape[0]} duplicate rows")

# Report the shape only once every cleaning step has been applied.
print(f"Dataset shape after cleaning: {all_data.shape}")

# For text columns, we might want to clean them, but for EDA, we'll keep as is
print("Data cleaning completed.")

# Exploratory Visualizations
Create plots such as histograms, box plots, and bar charts to visualize data distributions and patterns.

In [None]:
# Overview figure: 2x2 grid of class balance, per-task rate, CVSS score and
# CVSS severity distributions.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Exploratory Data Analysis - Code Vulnerability Dataset', fontsize=16)

# 1. Vulnerability Distribution
# value_counts() orders by frequency, not by label, so the slice labels are
# derived from the index instead of assuming "0 first, 1 second" (which also
# broke when only one class was present).
status_labels = {0: 'Non-Vulnerable', 1: 'Vulnerable'}
vul_counts = all_data['is_vul'].value_counts()
pie_labels = [status_labels.get(value, str(value)) for value in vul_counts.index]
axes[0,0].pie(vul_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Overall Vulnerability Distribution')
axes[0,0].axis('equal')

# 2. Task-wise Vulnerability Rate
task_vul_rate = all_data.groupby('task')['is_vul'].mean() * 100
task_vul_rate.plot(kind='bar', ax=axes[0,1], color='skyblue')
axes[0,1].set_title('Vulnerability Rate by Task')
axes[0,1].set_xlabel('Task')
axes[0,1].set_ylabel('Vulnerability Rate (%)')
axes[0,1].tick_params(axis='x', rotation=0)

# 3. Base Score Distribution
if 'Base Score' in all_data.columns:
    all_data['Base Score'].hist(bins=20, ax=axes[1,0], alpha=0.7, color='green')
    axes[1,0].set_title('CVSS Base Score Distribution')
    axes[1,0].set_xlabel('Base Score')
    axes[1,0].set_ylabel('Frequency')
else:
    # Hide the panel rather than leaving an empty set of axes in the figure.
    axes[1,0].set_visible(False)

# 4. Severity Distribution
if 'Base Severity' in all_data.columns:
    severity_counts = all_data['Base Severity'].value_counts()
    severity_counts.plot(kind='bar', ax=axes[1,1], color='orange')
    axes[1,1].set_title('CVSS Severity Distribution')
    axes[1,1].set_xlabel('Severity')
    axes[1,1].set_ylabel('Count')
    axes[1,1].tick_params(axis='x', rotation=45)
else:
    axes[1,1].set_visible(False)

plt.tight_layout()
plt.show()

# Additional plots: most common CWE IDs and most represented repositories.
fig_top, (ax_cwe, ax_repo) = plt.subplots(1, 2, figsize=(12, 6))

# CWE Analysis - Top 10 most common CWEs
top_cwe = None
if 'cwe_ids' in all_data.columns:
    # Extract CWE IDs (they are in list format as strings)
    cwe_series = (
        all_data['cwe_ids']
        .dropna()
        .str.strip("[]'")
        .str.split("', '")
        .explode()
        .str.strip()
    )
    # An empty list ("[]") strips down to '', which would otherwise be
    # counted as if it were a CWE ID.
    cwe_series = cwe_series[cwe_series != '']
    top_cwe = cwe_series.value_counts().head(10)

if top_cwe is not None and not top_cwe.empty:
    top_cwe.plot(kind='barh', color='purple', ax=ax_cwe)
    ax_cwe.set_title('Top 10 Most Common CWE IDs')
    ax_cwe.set_xlabel('Count')
    ax_cwe.set_ylabel('CWE ID')
else:
    ax_cwe.set_visible(False)

# Repository analysis - Top repositories
if 'repo_name' in all_data.columns:
    top_repos = all_data['repo_name'].value_counts().head(10)
    top_repos.plot(kind='barh', color='red', ax=ax_repo)
    ax_repo.set_title('Top 10 Repositories by Sample Count')
    ax_repo.set_xlabel('Count')
    ax_repo.set_ylabel('Repository Name')
else:
    ax_repo.set_visible(False)

plt.tight_layout()
plt.show()

# Time series analysis if commit_time exists
if 'commit_time' in all_data.columns:
    # Unparseable timestamps become NaT and drop out of the yearly grouping.
    all_data['commit_time'] = pd.to_datetime(all_data['commit_time'], errors='coerce')
    vul_over_time = all_data.groupby(all_data['commit_time'].dt.year)['is_vul'].mean() * 100
    # .dt.year is float when any NaT is present; groupby has already dropped
    # the NaN key, so the index can be cast back for clean integer year ticks.
    vul_over_time.index = vul_over_time.index.astype(int)

    fig_time, ax_time = plt.subplots(figsize=(10, 6))
    vul_over_time.plot(kind='line', marker='o', color='blue', ax=ax_time)
    ax_time.set_title('Vulnerability Rate Over Time (by Year)')
    ax_time.set_xlabel('Year')
    ax_time.set_ylabel('Vulnerability Rate (%)')
    ax_time.grid(True, alpha=0.3)
    plt.show()

# Feature Analysis for Vulnerability Detection
Analyze specific features that indicate code vulnerabilities, including counts and proportions.

In [None]:
# Feature Analysis for Vulnerability Detection

print("Feature Analysis for Vulnerability Detection")
print("="*60)

# Single source of truth for class labels used in every legend/tick below.
status_labels = {0: 'Non-Vulnerable', 1: 'Vulnerable'}


def _describe_text_length(source_col, length_col, heading, title, ylabel):
    """Add a character-length column for a text column and compare it by class.

    Writes `length_col` into `all_data` (length of `source_col`, with missing
    text counted as 0), prints per-class summary statistics and draws a box
    plot of the length split by `is_vul`.
    """
    all_data[length_col] = all_data[source_col].fillna('').str.len()
    print(heading)
    print(all_data.groupby('is_vul')[length_col].describe())

    fig_len, ax_len = plt.subplots(figsize=(10, 6))
    # order=[0, 1] pins the box positions so the tick labels below stay
    # correct even if one class is absent.
    sns.boxplot(x='is_vul', y=length_col, data=all_data, order=[0, 1],
                showfliers=False, ax=ax_len)
    ax_len.set_title(title)
    ax_len.set_xlabel('Is Vulnerable')
    ax_len.set_ylabel(ylabel)
    ax_len.set_xticks([0, 1])
    ax_len.set_xticklabels([status_labels[0], status_labels[1]])
    plt.show()


# Analyze code length features
if 'func' in all_data.columns:
    _describe_text_length(
        'func', 'func_length',
        "Function Code Length Analysis:",
        'Function Code Length by Vulnerability Status',
        'Function Length (characters)',
    )

# Analyze diff features
if 'diff_func' in all_data.columns:
    _describe_text_length(
        'diff_func', 'diff_length',
        "\nDiff Length Analysis:",
        'Diff Length by Vulnerability Status',
        'Diff Length (characters)',
    )

# CVSS Score analysis
if 'Base Score' in all_data.columns:
    print("\nCVSS Base Score Analysis:")
    print(all_data.groupby('is_vul')['Base Score'].describe())

    # Let seaborn build the legend from a labelled hue column. Calling
    # plt.legend([...]) afterwards replaces seaborn's legend and pairs the
    # given labels with the drawn artists in draw order, which can mislabel
    # the two classes.
    score_frame = pd.DataFrame({
        'Base Score': all_data['Base Score'],
        'Status': all_data['is_vul'].map(status_labels),
    })
    fig_score, ax_score = plt.subplots(figsize=(10, 6))
    sns.histplot(data=score_frame, x='Base Score', hue='Status',
                 hue_order=list(status_labels.values()),
                 multiple='stack', bins=20, alpha=0.7, ax=ax_score)
    ax_score.set_title('CVSS Base Score Distribution by Vulnerability')
    ax_score.set_xlabel('Base Score')
    ax_score.set_ylabel('Count')
    plt.show()

# Severity vs Vulnerability
if 'Base Severity' in all_data.columns:
    severity_vul = pd.crosstab(all_data['Base Severity'], all_data['is_vul'], normalize='index') * 100
    print("\nSeverity vs Vulnerability Rate (%):")
    display(severity_vul)

    # Rename the class columns so the legend follows the data instead of a
    # hard-coded two-item list.
    ax_sev = severity_vul.rename(columns=status_labels).plot(
        kind='bar', stacked=True, figsize=(10, 6))
    ax_sev.set_title('Vulnerability Rate by CVSS Severity')
    ax_sev.set_xlabel('Severity')
    ax_sev.set_ylabel('Percentage')
    ax_sev.legend(title='Status')
    ax_sev.tick_params(axis='x', rotation=45)
    plt.show()

# Task progression analysis
print("\nTask Progression Analysis:")
progression_spec = {'is_vul': ['count', 'mean']}
progression_columns = ['Sample Count', 'Vul Rate']
# 'Base Score' is optional everywhere else in this notebook; treat it the
# same way here instead of raising a KeyError.
if 'Base Score' in all_data.columns:
    progression_spec['Base Score'] = 'mean'
    progression_columns.append('Avg CVSS Score')
task_progression = all_data.groupby('task').agg(progression_spec).round(4)
task_progression.columns = progression_columns
display(task_progression)

# Correlation analysis
# The length columns only exist when 'func' / 'diff_func' were present above,
# so restrict the matrix to the features that are actually available.
numerical_features = [
    col for col in ['is_vul', 'task', 'Base Score', 'cvss_is_v3', 'func_length', 'diff_length']
    if col in all_data.columns
]
corr_matrix = all_data[numerical_features].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Correlation and Insights
Compute correlations between features and generate insights for vulnerability detection models.

In [None]:
# Correlation and Insights
#
# This cell is self-contained: it derives the CVE/CWE figures it reports from
# `all_data` directly instead of reading variables that are only created by
# cells further down the notebook (which raised NameError on Run All).

print("Key Insights for Vulnerability Detection Models")
print("="*60)

# Summary statistics
total_samples = len(all_data)
vul_samples = all_data['is_vul'].sum()
vul_rate = (vul_samples / total_samples) * 100
unique_cves = all_data['cve_id'].nunique() if 'cve_id' in all_data.columns else None

print(f"Dataset Overview:")
print(f"- Total samples: {total_samples:,}")
print(f"- Vulnerable samples: {vul_samples:,} ({vul_rate:.2f}%)")
print(f"- Non-vulnerable samples: {total_samples - vul_samples:,} ({100 - vul_rate:.2f}%)")
print(f"- Number of tasks: {all_data['task'].nunique()}")
if unique_cves is not None:
    print(f"- Number of unique CVEs: {unique_cves}")

# Key findings
# NOTE(review): the wording of findings 1 and 3 is fixed text; check it
# against the numbers printed alongside before quoting it.
print(f"\nKey Findings:")
print("1. Class Imbalance: The dataset shows a significant imbalance with only {:.2f}% vulnerable samples.".format(vul_rate))

if 'task' in all_data.columns:
    task_rates = all_data.groupby('task')['is_vul'].mean() * 100
    print("2. Task Difficulty: Vulnerability rates vary across tasks:")
    for task, rate in task_rates.items():
        print(f"   - Task {task}: {rate:.2f}%")

if 'Base Score' in all_data.columns:
    print("3. Severity Correlation: Higher CVSS scores tend to correlate with vulnerability presence.")
    vul_scores = all_data[all_data['is_vul'] == 1]['Base Score'].mean()
    non_vul_scores = all_data[all_data['is_vul'] == 0]['Base Score'].mean()
    print(f"   - Average CVSS score for vulnerable code: {vul_scores:.2f}")
    print(f"   - Average CVSS score for non-vulnerable code: {non_vul_scores:.2f}")

print("4. Code Features: Function length and diff size may be important indicators.")
print("   - Vulnerable functions tend to have different characteristics in code changes.")

print("\nRecommendations for Model Development:")
print("- Address class imbalance using techniques like SMOTE or weighted loss functions")
print("- Consider incremental learning approaches given the task structure")
print("- Use both code content (func, diff_func) and metadata (CVSS, CWE) features")
print("- Validate models on each task separately to assess continual learning performance")
print("- Focus on high-severity vulnerabilities (CVSS >= 7.0) for critical applications")

# Insights on CWE IDs and CVE IDs
# Build one row per (sample, CWE ID). The IDs are stored as stringified lists,
# parsed here with the same strip/split approach used for the top-CWE chart.
insight_cwe = None
if 'cwe_ids' in all_data.columns:
    cwe_tokens = (
        all_data['cwe_ids']
        .dropna()
        .astype(str)
        .str.strip("[]'")
        .str.split("', '")
        .explode()
        .str.strip()
    )
    # "[]" strips down to an empty string; that is "no CWE", not a CWE ID.
    cwe_tokens = cwe_tokens[cwe_tokens != ''].rename('cwe')
    # explode() keeps the original row index, so an index join re-attaches
    # each token to the task / label of the row it came from.
    insight_cwe = all_data[['task', 'is_vul']].join(cwe_tokens, how='inner')

has_cwe = insight_cwe is not None and not insight_cwe.empty

print(f"\nInsights on CWE IDs and CVE IDs:")
if has_cwe:
    cwe_frequency = insight_cwe['cwe'].value_counts()
    print(f"- Top CWE: {cwe_frequency.index[0]} appears in {cwe_frequency.iloc[0]} samples")

    # Rank by vulnerability rate (ties broken by sample count) so the line
    # below reports the highest-rate CWE rather than the most frequent one.
    cwe_rates = insight_cwe.groupby('cwe')['is_vul'].agg(['count', 'mean'])
    cwe_rates = cwe_rates.sort_values(['mean', 'count'], ascending=False)
    print(f"- Highest vulnerability CWE: {cwe_rates.index[0]} with {cwe_rates.iloc[0]['mean'] * 100:.1f}% vul rate")
if unique_cves is not None:
    print(f"- CVE diversity: {unique_cves} unique CVEs across {all_data['task'].nunique()} tasks")
if has_cwe:
    cwe_by_task = insight_cwe.groupby('task')['cwe'].nunique()
    print(f"- CWE-Task relationship: Task {cwe_by_task.idxmax()} has most unique CWEs ({cwe_by_task.max()})")

# Final correlation insights
# Use whichever of the candidate features exist; the length columns are only
# present if the feature-analysis cell found 'func' / 'diff_func'.
corr_candidates = [
    col for col in ['func_length', 'diff_length', 'Base Score']
    if col in all_data.columns
]
if corr_candidates:
    corr_with_vul = all_data[['is_vul'] + corr_candidates].corr()['is_vul']
    print(f"\nFeature Correlations with Vulnerability:")
    for feature, corr in corr_with_vul.items():
        if feature != 'is_vul':
            print(f"- {feature}: {corr:.3f}")

print("\nEDA Complete! This analysis provides a comprehensive understanding of the dataset for building effective vulnerability detection models.")

# Detailed Analysis of CWE IDs and CVE IDs
Analyze the critical fields `cwe_ids` (Common Weakness Enumeration IDs) and `cve_id` (Common Vulnerabilities and Exposures IDs) to understand vulnerability patterns and distributions.

In [None]:
# CVE IDs Analysis
print("CVE IDs Analysis")
print("="*50)

# Basic statistics for CVE IDs
total_cves = all_data['cve_id'].nunique()
# Count only rows that actually carry a CVE ID: nunique() and value_counts()
# both ignore missing IDs, so using len(all_data) here would overstate the
# entry count and the per-CVE average whenever some rows have no cve_id.
cve_entries = int(all_data['cve_id'].notna().sum())
print(f"Total unique CVE IDs: {total_cves}")
print(f"Total CVE entries: {cve_entries}")
if total_cves:
    print(f"Average entries per CVE: {cve_entries/total_cves:.2f}")
else:
    # Avoid a ZeroDivisionError when no row has a CVE ID.
    print("Average entries per CVE: n/a (no CVE IDs present)")

# Check for duplicates (same CVE in multiple tasks/samples)
cve_counts = all_data['cve_id'].value_counts()
duplicates = cve_counts[cve_counts > 1]
print(f"\nCVEs appearing in multiple samples: {len(duplicates)}")
print(f"Max occurrences of a single CVE: {cve_counts.max()}")

# CVE distribution by task
cve_per_task = all_data.groupby('task')['cve_id'].nunique()
print(f"\nUnique CVEs per task:")
for task, count in cve_per_task.items():
    print(f"  Task {task}: {count}")

# CVE vs Vulnerability: per-CVE sample count, vulnerable count and rate.
cve_vul = all_data.groupby('cve_id')['is_vul'].agg(['count', 'sum', 'mean'])
cve_vul.columns = ['Total Samples', 'Vulnerable Samples', 'Vul Rate']
print(f"\nCVE Vulnerability Summary:")
print(cve_vul.describe())

# Top CVEs by frequency
print(f"\nTop 10 CVEs by sample count:")
display(cve_counts.head(10))

# Visualize CVE distribution
cve_fig, (ax_top_cve, ax_task_cve) = plt.subplots(1, 2, figsize=(12, 6))

cve_counts.head(20).plot(kind='bar', color='blue', ax=ax_top_cve)
ax_top_cve.set_title('Top 20 CVEs by Sample Count')
ax_top_cve.set_xlabel('CVE ID')
ax_top_cve.set_ylabel('Sample Count')
# Right-align the rotated labels so each one ends under its own bar.
plt.setp(ax_top_cve.get_xticklabels(), rotation=45, ha='right')

cve_per_task.plot(kind='bar', color='green', ax=ax_task_cve)
ax_task_cve.set_title('Unique CVEs per Task')
ax_task_cve.set_xlabel('Task')
ax_task_cve.set_ylabel('Unique CVE Count')
ax_task_cve.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# CWE IDs Analysis
print("\nCWE IDs Analysis")
print("="*50)


def _parse_cwe_ids(raw):
    """Turn one `cwe_ids` cell into a list of CWE IDs.

    The column stores Python list literals as text. ast.literal_eval only
    accepts literals, so unlike eval() it cannot execute code embedded in the
    CSV. Missing values give an empty list; text that is not a valid literal
    is kept as a single ID rather than being dropped.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        text = raw.strip()
        return [text] if text else []
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [parsed]


# Process CWE IDs (they are stored as string lists)
all_data['cwe_list'] = all_data['cwe_ids'].apply(_parse_cwe_ids)

# Explode CWE IDs for analysis: one row per (sample, CWE ID). Samples with an
# empty list explode to NaN and are removed.
cwe_exploded = all_data.explode('cwe_list')
cwe_exploded = cwe_exploded[cwe_exploded['cwe_list'].notna()]

# Basic statistics
total_unique_cwe = cwe_exploded['cwe_list'].nunique()
print(f"Total unique CWE IDs: {total_unique_cwe}")
print(f"Total CWE entries: {len(cwe_exploded)}")
print(f"Average CWEs per sample: {len(cwe_exploded)/len(all_data):.2f}")

# Top CWE IDs
cwe_counts = cwe_exploded['cwe_list'].value_counts()
print(f"\nTop 10 CWE IDs by frequency:")
display(cwe_counts.head(10))

# CWE distribution by task
cwe_per_task = cwe_exploded.groupby('task')['cwe_list'].nunique()
print(f"\nUnique CWEs per task:")
for task, count in cwe_per_task.items():
    print(f"  Task {task}: {count}")

# CWE vs Vulnerability
cwe_vul = cwe_exploded.groupby('cwe_list')['is_vul'].agg(['count', 'sum', 'mean'])
cwe_vul.columns = ['Total Samples', 'Vulnerable Samples', 'Vul Rate (%)']
cwe_vul['Vul Rate (%)'] *= 100
# The table and chart below are titled "by Vulnerability Rate", so rank by
# rate (ties broken by sample count) rather than by sample count alone.
cwe_vul = cwe_vul.sort_values(['Vul Rate (%)', 'Total Samples'], ascending=False)
print(f"\nTop 10 CWEs by Vulnerability Rate:")
display(cwe_vul.head(10))

# Visualize CWE distributions
cwe_fig, cwe_axes = plt.subplots(2, 2, figsize=(15, 10))

cwe_counts.head(15).plot(kind='barh', color='purple', ax=cwe_axes[0, 0])
cwe_axes[0, 0].set_title('Top 15 CWE IDs by Frequency')
cwe_axes[0, 0].set_xlabel('Count')
cwe_axes[0, 0].set_ylabel('CWE ID')

cwe_per_task.plot(kind='bar', color='orange', ax=cwe_axes[0, 1])
cwe_axes[0, 1].set_title('Unique CWE IDs per Task')
cwe_axes[0, 1].set_xlabel('Task')
cwe_axes[0, 1].set_ylabel('Unique CWE Count')
cwe_axes[0, 1].tick_params(axis='x', rotation=0)

cwe_vul.head(10)['Vul Rate (%)'].plot(kind='bar', color='red', ax=cwe_axes[1, 0])
cwe_axes[1, 0].set_title('Top 10 CWEs by Vulnerability Rate')
cwe_axes[1, 0].set_xlabel('CWE ID')
cwe_axes[1, 0].set_ylabel('Vulnerability Rate (%)')
plt.setp(cwe_axes[1, 0].get_xticklabels(), rotation=45, ha='right')

# CWE co-occurrence (top 10 CWEs): count each CWE per CVE, then correlate the
# per-CVE counts between CWEs.
top_cwes = cwe_counts.head(10).index
top_cwe_rows = cwe_exploded[cwe_exploded['cwe_list'].isin(top_cwes)]
cwe_matrix = pd.crosstab(top_cwe_rows['cve_id'], top_cwe_rows['cwe_list'])
cwe_corr = cwe_matrix.corr()
sns.heatmap(cwe_corr, annot=False, cmap='Blues', ax=cwe_axes[1, 1])
cwe_axes[1, 1].set_title('CWE Co-occurrence Correlation (Top 10)')

plt.tight_layout()
plt.show()

# CWE by severity
if 'Base Severity' in all_data.columns:
    cwe_severity = cwe_exploded.groupby(['cwe_list', 'Base Severity']).size().unstack().fillna(0)
    cwe_severity = cwe_severity.div(cwe_severity.sum(axis=1), axis=0) * 100
    # groupby drops rows whose severity is missing, so a top CWE may have no
    # row here; reindex yields a NaN row for it where .loc would raise KeyError.
    top_cwe_severity = cwe_severity.reindex(cwe_counts.head(10).index)
    print(f"\nSeverity Distribution for Top 10 CWEs (%):")
    display(top_cwe_severity)