<a href="https://colab.research.google.com/github/MelinHead225/Scientific-Software-SATD-Analyzer/blob/main/SSW_SATD_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import git
import torch
import torch.nn.functional as F
import pandas as pd
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import shutil
import sys
from tqdm import tqdm

# Maps the integer class index chosen by the classifier (the argmax taken in
# classify_comment) to the label string stored in the "Classification" column.
# "non_debt" is the only label that the project-level analysis does not count
# as SATD.
LABEL_MAPPING = {
    0: "requirement_debt",
    1: "code/design_debt",
    2: "documentation_debt",
    3: "test_debt",
    4: "scientific_debt",
    5: "non_debt"
}

# Clone GitHub repository
def clone_github_repo(repo_url, local_dir):
    """Clone ``repo_url`` into ``local_dir``, creating the directory if absent.

    Args:
        repo_url: URL handed to ``git.Repo.clone_from``.
        local_dir: Destination path for the working tree.
    """
    target_missing = not os.path.exists(local_dir)
    if target_missing:
        os.makedirs(local_dir)
    git.Repo.clone_from(repo_url, local_dir)

# Preprocess comments: Convert multi-line comments to single-line & clean text
def preprocess_comments(comments):
    """Normalise extracted comments and drop the ones that end up empty.

    Args:
        comments: Iterable of ``(filename, line_number, comment)`` tuples.

    Returns:
        A list of ``(filename, line_number, cleaned_comment)`` tuples, in input
        order. Cleaned text is lower-case, stripped of comment markers and of
        every character other than ASCII letters, ``!``, ``?`` and whitespace,
        with whitespace runs collapsed to single spaces.
    """
    def flatten(text):
        # Join a multi-line comment into one line.
        return ' '.join(text.splitlines())

    def normalise(text):
        without_markers = re.sub(r'\s*(/\*\*|\*/|/\*|//)', '', text)
        letters_only = re.sub(r'[^a-zA-Z!?\s]', '', without_markers)
        return re.sub(r'\s+', ' ', letters_only.lower()).strip()

    cleaned = []
    for path, lineno, raw in tqdm(comments, desc="Preprocessing comments"):
        if raw.startswith(('/**', '/*')):
            raw = flatten(raw)
        text = normalise(raw)
        if not text:
            # Nothing left after cleaning (e.g. the comment was only symbols).
            continue
        cleaned.append((path, lineno, text))
    return cleaned

# Extract comments from a single file
def _comment_syntax_for(file_extension):
    """Return the comment syntax for a file extension, or None if unsupported.

    The result is ``(single_line_re, block_start_re, closers)``:
    ``single_line_re`` captures the text of a whole-line comment in group 1;
    ``block_start_re`` (or None) captures the opening delimiter in group 1 and
    the rest of the line in group 2; ``closers`` maps each opening delimiter to
    the delimiter that ends that block.
    """
    if file_extension in ('.py', '.pyx'):
        # Triple-quoted strings are treated as block comments; a block must be
        # closed by the same quote style that opened it.
        return (
            re.compile(r'^\s*#(.*)'),
            re.compile(r'^\s*("""|\'\'\')(.*)'),
            {'"""': '"""', "'''": "'''"},
        )
    if file_extension in ('.go', '.c', '.cpp', '.h', '.hpp', '.php', '.phtml',
                          '.js', '.jsx', '.ts', '.tsx'):
        return (
            re.compile(r'^\s*//(.*)'),
            re.compile(r'^\s*(/\*)(.*)'),
            {'/*': '*/'},
        )
    if file_extension in ('.pl', '.pm'):
        return (re.compile(r'^\s*#(.*)'), None, {})
    if file_extension in ('.f', '.for'):
        # Fixed-form Fortran: C, c or * in column 1 marks a comment line, and
        # "!" starts a comment as well. Anchoring the letter to column 1 keeps
        # indented statements such as CALL from being read as comments.
        return (re.compile(r'^(?:[Cc*]|\s*!)(.*)'), None, {})
    if file_extension == '.f90':
        # Free-form Fortran only uses "!".
        return (re.compile(r'^\s*!(.*)'), None, {})
    return None


def extract_comments_from_file(file_path):
    """Extract whole-line and block comments from one source file.

    Consecutive comment lines (and block comments adjacent to them) are merged
    into a single space-separated string. A merged comment is emitted when the
    first non-comment line after it is reached, or at end of file.

    Args:
        file_path: Path of the file to scan. The extension selects the comment
            syntax; unsupported extensions yield an empty list.

    Returns:
        A list of ``(line_number, comment)`` tuples. ``line_number`` is the
        1-based number of the line on which the comment was emitted, i.e. the
        first non-comment line following it (or the last line of the file).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If none of the candidate encodings can decode it.
    """
    syntax = _comment_syntax_for(os.path.splitext(file_path)[1])
    if syntax is None:
        return []
    single_line_re, block_start_re, closers = syntax

    # latin-1 can decode any byte sequence, so in practice utf-16 is only a
    # last-resort entry; the order is kept as it was.
    encodings = ('utf-8', 'latin-1', 'utf-16')
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                lines = file.readlines()
        except UnicodeDecodeError:
            # Only decoding problems justify retrying with another encoding;
            # I/O errors propagate unchanged.
            continue
        break
    else:
        # UnicodeDecodeError takes (encoding, object, start, end, reason).
        raise UnicodeDecodeError(
            encodings[-1], b"", 0, 0,
            f"Unable to decode the file {file_path} with available encodings.",
        )

    comments = []
    pending = ""          # comment text collected since the last emit
    block_text = ""       # text of the block comment currently being read
    block_closer = None   # closing delimiter of the open block, or None
    line_number = 0

    for line_number, line in enumerate(lines, start=1):
        # Single-line markers are checked first, even while a block comment is
        # open, so such lines go straight into the pending comment.
        single = single_line_re.match(line)
        if single:
            text = single.group(1).strip()
            if text:
                pending = f"{pending} {text}" if pending else text
            continue

        if block_closer is None:
            start = block_start_re.match(line) if block_start_re else None
            if start is None:
                # Ordinary code line: emit whatever comment preceded it.
                if pending:
                    comments.append((line_number, pending))
                    pending = ""
                continue
            block_closer = closers[start.group(1)]
            block_text = start.group(2).strip()
        else:
            block_text += " " + line.strip()

        # The block ends when the accumulated text ends with its closer; this
        # also covers blocks opened and closed on the same line.
        if block_text.endswith(block_closer):
            block_text = block_text[:-len(block_closer)].strip()
            if block_text:
                pending = f"{pending} {block_text}" if pending else block_text
            block_closer = None

    if pending:
        comments.append((line_number, pending))
    return comments

# Traverse directory and extract comments (includes full file path)
def traverse_directory_and_extract_comments(root_dir):
    """Walk ``root_dir`` and collect comments from every supported source file.

    Git's own ``.git`` metadata directory is skipped: it is not project source
    and would otherwise be walked file by file.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        A list of ``(relative_path, line_number, comment)`` tuples, where
        ``relative_path`` is the file path relative to ``root_dir``.
    """
    source_extensions = (
        '.py', '.pyx', '.go', '.f', '.for', '.f90', '.c', '.cpp', '.h', '.hpp',
        '.pl', '.pm', '.php', '.phtml', '.js', '.jsx', '.ts', '.tsx',
    )
    all_comments = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Pruning dirnames in place stops os.walk from descending into .git.
        dirnames[:] = [name for name in dirnames if name != '.git']
        for filename in tqdm(filenames, desc=f"Processing files in {dirpath}"):
            if not filename.endswith(source_extensions):
                continue
            file_path = os.path.join(dirpath, filename)
            # Computed once per file rather than once per comment.
            relative_path = os.path.relpath(file_path, root_dir)
            for line_number, comment in extract_comments_from_file(file_path):
                all_comments.append((relative_path, line_number, comment))
    return all_comments

# Classify comments using Hugging Face model
def classify_comment(comment, model, tokenizer):
    """Return the LABEL_MAPPING label the model predicts for one comment.

    Args:
        comment: Comment text to classify.
        model: Sequence-classification model; its ``device`` decides where the
            tokenised inputs are moved.
        tokenizer: Tokenizer called on ``comment`` (truncated to 512 tokens).

    Returns:
        The label string for the highest-probability class.
    """
    encoded = tokenizer(
        comment,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(model.device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**encoded)
        probabilities = F.softmax(output.logits, dim=-1)
        predicted_index = torch.argmax(probabilities, dim=-1).item()
    return LABEL_MAPPING[predicted_index]

# Model identifier passed to from_pretrained() for both the tokenizer and the
# sequence-classification model in save_results_to_csv.
HF_REPO_NAME = "MelinHead225/bert-large-SSW-SATD-classification"

# Project-level SATD analysis (text-only)
def perform_project_level_analysis(df, repo_name):
    """Print a text summary of SATD for one classified repository.

    The report covers overall SATD share, the per-class distribution, the five
    files and five directories with the most SATD comments, and the SATD count
    per file extension. Every comment whose classification is not "non_debt"
    counts as SATD.

    Args:
        df: DataFrame with at least the "Classification" and "Filename"
            columns. It is not modified.
        repo_name: Repository name shown in the report header.
    """
    class_counts = Counter(df["Classification"])
    total_comments = len(df)
    satd_comments = total_comments - class_counts.get("non_debt", 0)
    satd_percentage = (satd_comments / total_comments * 100) if total_comments > 0 else 0

    print("\n=== Project-Level SATD Analysis ===")
    print(f"Repository: {repo_name}")
    print(f"Total Comments Analyzed: {total_comments}")
    print(f"SATD Comments: {satd_comments} ({satd_percentage:.2f}%)")
    print("\nSATD Distribution:")
    # class_counts is empty when there are no comments, so the division below
    # never runs with total_comments == 0.
    for class_label, count in class_counts.items():
        print(f"{class_label}: {count} ({count/total_comments*100:.2f}%)")

    # Explicit copy: a column is added below, and assigning to a filtered slice
    # of df would trigger pandas' SettingWithCopyWarning.
    satd_df = df[df["Classification"] != "non_debt"].copy()
    file_satd_counts = satd_df["Filename"].value_counts().head(5)
    print("\nTop 5 Files with Most SATD:")
    for file, count in file_satd_counts.items():
        print(f"{file}: {count} SATD comments")

    # Files directly under the repository root have an empty dirname.
    satd_df["Directory"] = satd_df["Filename"].apply(lambda x: os.path.dirname(x) or "root")
    dir_satd_counts = satd_df["Directory"].value_counts().head(5)
    print("\nTop 5 Directories with Most SATD:")
    for directory, count in dir_satd_counts.items():
        print(f"{directory}: {count} SATD comments")

    filetype_counts = Counter(
        os.path.splitext(filename)[1] for filename in satd_df["Filename"]
    )
    print("\nSATD Distribution by File Type:")
    for ext, count in filetype_counts.most_common():
        print(f"{ext}: {count} SATD comments")

# Save comments and perform analysis
def save_results_to_csv(comments, output_file, repo_name):
    """Classify the comments, write them to CSV and print the project report.

    Args:
        comments: Iterable of ``(filename, line_number, comment)`` tuples.
        output_file: Path of the CSV file to write (without the index column).
        repo_name: Repository name forwarded to the project-level report.
    """
    frame = pd.DataFrame(comments, columns=['Filename', 'Line Number', 'Comment'])

    # Load tokenizer and model from HF_REPO_NAME; use the GPU when available.
    tokenizer = AutoTokenizer.from_pretrained(HF_REPO_NAME)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(HF_REPO_NAME)
    model = model.to(target_device)

    def label_for(text):
        return classify_comment(text, model, tokenizer)

    # Registers progress_apply on pandas objects so classification shows a bar.
    tqdm.pandas(desc="Classifying comments")
    frame["Classification"] = frame["Comment"].progress_apply(label_for)

    frame.to_csv(output_file, index=False)
    print(f"\nClassified comments saved to {output_file}")

    perform_project_level_analysis(frame, repo_name)

def main():
    """Clone, analyse and clean up each configured repository in turn.

    For every URL the repository is cloned under ``base_dir``, its comments are
    extracted, preprocessed, classified and written to
    ``<repo_name>_comments_classification.csv``, and the clone is removed.

    Raises:
        ValueError: If no repository name can be derived from a URL.
    """
    repo_urls = [
        # "analyse_this_repository.git", # This is an example. Insert actual repository here    #
        "https://github.com/healpy/healpy.git"
    ]
    base_dir = "/"

    for repo_url in repo_urls:
        print(f"\nProcessing repository: {repo_url}")

        # Take the last path component and drop only a trailing ".git";
        # tolerate a trailing slash in the URL.
        repo_name = repo_url.rstrip('/').split('/')[-1]
        if repo_name.endswith('.git'):
            repo_name = repo_name[:-len('.git')]
        if not repo_name:
            # An empty name would make local_dir equal base_dir itself, which
            # the cleanup below would then delete.
            raise ValueError(f"Cannot derive a repository name from URL: {repo_url!r}")

        output_file = f"{repo_name}_comments_classification.csv"
        local_dir = os.path.join(base_dir, repo_name)

        existed_before = os.path.exists(local_dir)
        succeeded = False
        try:
            clone_github_repo(repo_url, local_dir)
            comments = traverse_directory_and_extract_comments(local_dir)
            processed_comments = preprocess_comments(comments)
            save_results_to_csv(processed_comments, output_file, repo_name)
            succeeded = True
        finally:
            # Always remove the clone after a successful run. After a failure,
            # remove it only if this run created the directory, so a
            # pre-existing directory that made the clone fail is never deleted.
            if (succeeded or not existed_before) and os.path.isdir(local_dir):
                shutil.rmtree(local_dir)
                print(f"\nDeleted cloned repository: {local_dir}")

# Run the analysis only when this file is executed as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == '__main__':
    main()


Processing repository: https://github.com/healpy/healpy.git


Processing files in /healpy: 100%|██████████| 15/15 [00:00<00:00, 7872.19it/s]
Processing files in /healpy/bin: 100%|██████████| 1/1 [00:00<00:00, 14716.86it/s]
Processing files in /healpy/lib: 0it [00:00, ?it/s]
Processing files in /healpy/lib/healpy: 100%|██████████| 12/12 [00:00<00:00, 406.72it/s]
Processing files in /healpy/lib/healpy/utils: 100%|██████████| 2/2 [00:00<00:00, 3298.71it/s]
Processing files in /healpy/lib/healpy/data: 100%|██████████| 30/30 [00:00<00:00, 466033.78it/s]
Processing files in /healpy/.git: 100%|██████████| 5/5 [00:00<00:00, 97997.76it/s]
Processing files in /healpy/.git/branches: 0it [00:00, ?it/s]
Processing files in /healpy/.git/refs: 0it [00:00, ?it/s]
Processing files in /healpy/.git/refs/tags: 0it [00:00, ?it/s]
Processing files in /healpy/.git/refs/heads: 100%|██████████| 1/1 [00:00<00:00, 26546.23it/s]
Processing files in /healpy/.git/refs/remotes: 0it [00:00, ?it/s]
Processing files in /healpy/.git/refs/remotes/origin: 100%|██████████| 1/1 [00:00

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Classifying comments: 100%|██████████| 544/544 [00:12<00:00, 45.03it/s]


Classified comments saved to healpy_comments_classification.csv

=== Project-Level SATD Analysis ===
Repository: healpy
Total Comments Analyzed: 544
SATD Comments: 177 (32.54%)

SATD Distribution:
non_debt: 367 (67.46%)
test_debt: 1 (0.18%)
scientific_debt: 171 (31.43%)
code/design_debt: 4 (0.74%)
requirement_debt: 1 (0.18%)

Top 5 Files with Most SATD:
lib/healpy/pixelfunc.py: 40 SATD comments
lib/healpy/sphtfunc.py: 23 SATD comments
lib/healpy/rotator.py: 22 SATD comments
lib/healpy/projaxes.py: 13 SATD comments
src/_sphtools.pyx: 12 SATD comments

Top 5 Directories with Most SATD:
lib/healpy: 132 SATD comments
src: 23 SATD comments
test: 18 SATD comments
lib/healpy/utils: 3 SATD comments
root: 1 SATD comments

SATD Distribution by File Type:
.py: 154 SATD comments
.pyx: 23 SATD comments

Deleted cloned repository: /healpy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  satd_df["Directory"] = satd_df["Filename"].apply(lambda x: os.path.dirname(x) if os.path.dirname(x) else "root")
