#### Load Python Files + Extract Comments

In [1]:
import os
from pathlib import Path


def extract_comments_from_py(path):
    comments = []
    for file in Path(path).rglob("*.py"):
        with open(file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines(), 1):
                line = line.strip()
                if line.startswith("#") and len(line) > 1:
                    comments.append({
                        "file": str(file.relative_to(path)),
                        "line": i,
                        "comment": line
                    })
    return comments


comments = extract_comments_from_py("../data/3-comments-nlp/bad_comments")
print(f"Extracted {len(comments)} comments")
comments[:3]

Extracted 4 comments


[{'file': 'email_validator.py', 'line': 6, 'comment': '# good email regex'},
 {'file': 'email_validator.py', 'line': 12, 'comment': '# Check email'},
 {'file': 'email_validator.py', 'line': 16, 'comment': '# validate'}]

#### Classify Comments Using a Pre-Trained NLP Model

In [2]:
from transformers import pipeline

# Load sentiment or zero-shot classifier (choose best for comment quality)
classifier = pipeline("text-classification",
                      model="microsoft/xtremedistil-l6-h256-uncased")


def classify_comment(text):
    result = classifier(text)[0]
    return result["label"], result["score"]


for comment in comments:
    label, score = classify_comment(comment["comment"])
    comment["quality"] = "bad" if label.lower() in ["negative", "vague",
                                                    "irrelevant"] else "good"
    comment["score"] = score

# Filter only bad comments
bad_comments = [c for c in comments if c["quality"] == "bad"]
bad_comments[:3]

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


model.safetensors:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

[]

#### Use OpenAI API to Get Minimalist Fix Suggestions

In [None]:
import openai
import os

openai.api_key = os.getenv("OPENAI_API_KEY")  # store your key in your env vars


def suggest_fix(comment):
    prompt = f"The following code comment is vague or bad:\n\n\"{comment}\"\n\nIn one sentence, explain how to improve it (do not rewrite the comment):"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=60,
        temperature=0.3
    )
    return response["choices"][0]["message"]["content"]


# Add suggestions to each bad comment
for c in bad_comments:
    c["why"] = "Automatically classified as vague or unclear."
    c["suggestion"] = suggest_fix(c["comment"])

bad_comments[:3]

#### Export as JSON for Later Use

In [None]:
import json

with open("nlp-results.json", "w") as f:
    json.dump(bad_comments, f, indent=2)