In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from google.colab import files

# File upload
# The keys of the mapping returned by files.upload() are used below as the
# path of the workbook to read.
print("Please upload the file: 366_ARPs_for_extracting_Issue_Solution_Pairs.xlsx")
uploaded = files.upload()

# With nothing uploaded the lookup below would fail with an opaque IndexError,
# so stop here with a message that tells the user what to do.
if not uploaded:
    raise ValueError(
        "No file was uploaded; re-run this cell and select the input .xlsx file."
    )

# The first uploaded name is the one that gets processed.
input_path = next(iter(uploaded))
output_path = "DECA_PD_SP_results.xlsx"

# Keywords for heuristic rules.
# The classifiers lower-case both the keyword and the sentence before doing a
# substring test, so entries that differ only by case are the same keyword and
# each one is listed once.
problem_keywords = [
    "What", "When", "Who", "Which", "How", "?", "I am trying to build", "I want to design",
    "How to architecture", "I am evaluating", "I am building", "The user should", "I need help",
    "I am developing", "Advise on", "crash", "error", "bug", "problem",
    "issue", "wrong", "not working", "cannot", "unable"
]

solution_keywords = [
    "the best practice", "you should", "I am using", "you don't have to do", "In order to",
    "it is critical", "It is recommended", "A good approach is",
    "I suggest", "I propose", "fix", "I recommend", "refactor", "to use", "to limit", "could use"
]

# Classification functions
def is_problem_discovery(sentence: str) -> bool:
    """Tell whether ``sentence`` contains at least one problem keyword.

    Each entry of ``problem_keywords`` is compared as a case-insensitive
    substring of the sentence. Values that are not ``str`` are never a match.
    """
    if isinstance(sentence, str):
        haystack = sentence.lower()
        for keyword in problem_keywords:
            if keyword.lower() in haystack:
                return True
    return False

def is_solution_proposal(sentence: str) -> bool:
    """Tell whether ``sentence`` contains at least one solution keyword.

    Each entry of ``solution_keywords`` is compared as a case-insensitive
    substring of the sentence. Values that are not ``str`` are never a match.
    """
    if isinstance(sentence, str):
        haystack = sentence.lower()
        for keyword in solution_keywords:
            if keyword.lower() in haystack:
                return True
    return False

def classify_sentence(sentence: str) -> str:
    """Label ``sentence`` as "Problem Discovery", "Solution Proposal" or "Other".

    The problem check runs first, so text matching both keyword sets is
    labelled "Problem Discovery". Text matching neither set is "Other".
    """
    if is_problem_discovery(sentence):
        return "Problem Discovery"
    return "Solution Proposal" if is_solution_proposal(sentence) else "Other"

# Load Excel
data = pd.read_excel(input_path, sheet_name="Sheet1")

# Apply classification; classify_sentence labels non-string cells as "Other".
data["Question_pred"] = data["Question_body_cleaned"].apply(classify_sentence)
data["Answer_pred"] = data["Answer_body_cleaned"].apply(classify_sentence)


def _macro_scores(y_true, y_pred, prefix: str) -> pd.Series:
    """Return macro-averaged precision, recall and F1 of ``y_pred`` vs ``y_true``.

    The result is a Series indexed ``<prefix>_precision``, ``<prefix>_recall``
    and ``<prefix>_f1``. ``zero_division=0`` is passed through to scikit-learn.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return pd.Series({
        f"{prefix}_precision": precision,
        f"{prefix}_recall": recall,
        f"{prefix}_f1": f1,
    })


# Evaluation against labeled issues and solutions (only when both label columns exist)
evaluation = None
if "Reference_Question" in data.columns and "Reference_Solution" in data.columns:
    # Evaluation for Problem/Issue (Questions)
    issue_scores = _macro_scores(data["Reference_Question"], data["Question_pred"], "Issue")
    print("\nPrecision, Recall, F1 Scores for Problems/Issues:")
    print(issue_scores)

    # Evaluation for Solution (Answers)
    solution_scores = _macro_scores(data["Reference_Solution"], data["Answer_pred"], "Solution")
    print("\nPrecision, Recall, F1 Scores for Answers/Solutions:")
    print(solution_scores)

    evaluation = pd.concat([issue_scores, solution_scores])
else:
    print(" Labels not found (expected columns: 'Reference_Question', 'Reference_Solution'). Skipping evaluation step.")

# Save the predictions in every case: missing labels only rule out the
# evaluation, not the classification results. The "Evaluation" sheet is
# written only when scores were computed.
with pd.ExcelWriter(output_path, engine="openpyxl", mode="w") as writer:
    data.to_excel(writer, sheet_name="Predictions", index=False)
    if evaluation is not None:
        evaluation.to_excel(writer, sheet_name="Evaluation", header=["Value"])

if evaluation is not None:
    print(f"\nSaved predictions and evaluation to {output_path}")
else:
    print(f"\nSaved predictions to {output_path}")
files.download(output_path)


In [None]:
Precision, Recall, F1 Scores for Problems/Issues:
Issue_precision    0.682
Issue_recall       0.556
Issue_f1           0.540
dtype: float64

Precision, Recall, F1 Scores for Answers/Solutions:
Solution_precision    0.650
Solution_recall       0.611
Solution_f1           0.571
dtype: float64