In [1]:
import polars as pl
from dotenv import load_dotenv
import os
from pathlib import Path
from google.cloud import language_v2
from google import genai
from google.genai import types
import utils as u


In [2]:
# --- Configuration: project identifier, parquet snapshot paths, label file ---
# NOTE(review): PROJECT_PATH is not referenced by any active code in the
# visible notebook (the commented-out fetch cell uses the literal instead).
PROJECT_PATH = "dstack/d-stack-home"
RAW_PATH = "data/raw_issues.parquet"
CLEANED_PATH = "data/cleaned_issues.parquet"
ENRICHED_PATH = "data/issues_enriched.parquet"
LABELED_PATH = "data/issues_labeled.parquet"
POSTPROCESSED_PATH = "data/issues_postprocessed.parquet"
KEYWORDS_PATH = "keywords_config.txt"

# Load variables from a local .env file before reading them from os.environ.
load_dotenv()

# Access the API key
api_key = os.environ.get("API_KEY_GCP")
# NOTE(review): the variable name is spelled "ACCES_TOKEN_VERTEX" (single S);
# confirm it matches the entry in .env. token_vertex, PROJECT_ID and LOCATION
# are not referenced by any other visible cell.
token_vertex = os.environ.get("ACCES_TOKEN_VERTEX")
PROJECT_ID = "project-8415b93b-4a16-4c2b-901"
LOCATION = "europe-west3"
# Report only whether a key was found; the key itself is never printed.
print("API key loaded:", bool(api_key))
# Natural Language client authenticated with the API key; get_sentiment_score
# reads this global `client`.
client = language_v2.LanguageServiceClient(client_options={"api_key": api_key})

keywords_path = Path(KEYWORDS_PATH)

# Allowed labels: one per line of the keywords file, whitespace-stripped;
# blank lines and lines starting with "#" are skipped.
LABELS = [
    line.strip()
    for line in keywords_path.read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.lstrip().startswith("#")
]

# Issue iids 1..8 are excluded since they predate the feedback process.
iids_to_exclude = list(range(1,9))
# Columns, and the description values to exclude, passed to the u.* helpers below.
columns_to_keep = ["iid", "title", "description", "state", "created_at", "updated_at", "closed_at", "author_id", "author_name", "author_state", "user_notes_count", "upvotes", "downvotes", "references"]
desc_to_exclude = ["", "test", "Test"]

API key loaded: True


In [None]:
# issues = fetch_all_gitlab_issues("dstack/d-stack-home")
# raw_df = pl.DataFrame(issues)
# raw_df.write_parquet(RAW_PATH)

In [None]:
# Load the raw issue snapshot from RAW_PATH instead of re-fetching from GitLab.
raw_df = pl.read_parquet(RAW_PATH)

In [None]:
# Clean the raw issues with the project helper, passing the columns to keep and
# the iids to exclude. Presumably selects those columns and drops those iids;
# confirm in utils.clean_issues_df.
df_clean = u.clean_issues_df(raw_df, columns_to_keep, iids_to_exclude)

In [None]:
# Prepare the cleaned issues for analysis, passing the description values to
# exclude. Presumably this is where the `desc_clean` column used by later cells
# is created; confirm in utils.prepare_issues_df.
df_prepared = u.prepare_issues_df(df_clean, desc_to_exclude)

## Some analysis of the feedback

- form-generated vs. manually created issues (anonymous vs. non-anonymous submitters)
- Issues per page 
- length of issues
- temporal analysis - when were issues submitted?
- sentiment analysis
- keyword / label counts: first get the distinct labels, clean them, manually add some more, keyword search them in titles and descriptions
- correlations between sentiment and labels / topics
- correlations between upvotes / downvotes and topics
- top comments
- Amount of non-content tickets ("Test" etc.)

## Sentiment analysis

In [None]:
def get_sentiment_score(text_content: str) -> float:
    """
    Return the document-level sentiment score for a single text.

    Uses the module-level `client`, which must be the Natural Language
    service client when this function is called.

    Args:
        text_content: Plain text to analyze. May be None.

    Returns:
        The document sentiment score (-1.0 to +1.0), or 0.0 when the input is
        None, empty or whitespace-only, or when the API call fails.
    """
    # Blank input carries no sentiment; answer directly instead of making an
    # API round trip that can only end in the error path below.
    if text_content is None or not text_content.strip():
        return 0.0

    try:
        document = language_v2.Document(
            content=text_content,
            type_=language_v2.Document.Type.PLAIN_TEXT,
        )
        response = client.analyze_sentiment(document=document)
        return response.document_sentiment.score
    except Exception as e:
        # Deliberate best effort: a single failing row must not abort a
        # column-wide run, so log it and fall back to a neutral score.
        print(f"Error processing text: {text_content[:20]}... Error: {e}")
        return 0.0



In [None]:
# df_with_sentiment = df_prepared.with_columns(
#     pl.col("desc_clean")
#       .map_elements(get_sentiment_score, return_dtype=pl.Float64)
#       .alias("sentiment")
# )

# df_with_sentiment.write_parquet(ENRICHED_PATH)

In [None]:
# Reload the sentiment-enriched issues from ENRICHED_PATH (the cell that
# computes and writes them is commented out above) and preview the first rows.
df_with_sentiment = pl.read_parquet(ENRICHED_PATH)
df_with_sentiment.head()

# Labeling

In [None]:
# Inspect the enriched frame that serves as input to the labeling step.
df_with_sentiment

In [None]:
# Instruction text (German) for the classification model, used by
# classify_issue_multilabel. It tells the model to classify an issue description
# using ONLY the provided labels, to return one to five labels as a
# comma-separated string, and to fall back to the label "Unklar" when none fits.
SYSTEM_INSTRUCTION = (
    "Du bekommst GitLab-Issues aus dem Deutschland-Stack-Konsultationsverfahren. "
    "Du bist ein Klassifizierungs-Experte. Deine Aufgabe ist es, die Beschreibung zu analysieren "
    "und sie anhand der Labels in der Liste zu klassifizieren. "
    "Nutze NUR die zur Verfügung gestellten Labels. Erfinde keine neuen Labels! "
    "Stelle das Ergebnis als Komma-separierten String zur Verfügung. "
    "Der String enthält NUR die von dir vergebenen Labels (eins oder bis zu 5). "
    "Versuche nur so viele Labels wie nötig zu vergeben. "
    "Wenn kein Label passt, nutze das Label Unklar"
)


In [None]:
# Gemini client (Vertex AI mode) used by classify_issue_multilabel.
# NOTE(review): this rebinds the global name `client`, which until this cell
# held the LanguageServiceClient that get_sentiment_score reads. Calling the
# sentiment function after this cell would hit the wrong client and fall into
# its error path (score 0.0). Consider giving the two clients distinct names.
# NOTE(review): PROJECT_ID / LOCATION from the config cell are not passed here;
# presumably project and location come from the environment. Confirm.
client = genai.Client(vertexai=True)

# Model identifier passed to generate_content.
MODEL = "gemini-2.0-flash"

In [None]:
def validate_labels(labels_str: str, allowed_labels: list[str]) -> list[str]:
    """
    Parse a comma-separated label string and keep only the allowed labels.

    Args:
        labels_str: Comma-separated labels (e.g. a raw model response).
            None or an empty string yields an empty list.
        allowed_labels: Labels that may appear in the result. Matching is
            exact after stripping surrounding whitespace.

    Returns:
        The allowed labels in order of first appearance, each at most once.
    """
    if not labels_str:
        return []

    allowed = set(allowed_labels)  # constant-time membership checks
    seen: set[str] = set()
    validated: list[str] = []
    for raw_label in labels_str.split(","):
        label = raw_label.strip()
        # Drop unknown labels and repeats so each label is kept at most once
        # per issue.
        if label in allowed and label not in seen:
            seen.add(label)
            validated.append(label)
    return validated

def classify_issue_multilabel(issue_text: str, labels: list[str]) -> str:
    """
    Ask the model to assign labels to a single issue.

    Args:
        issue_text: Issue description to classify.
        labels: Candidate labels offered to the model.

    Returns:
        The model's text response with surrounding whitespace removed. It is
        expected to be a comma-separated label string but is not validated
        here.

    Raises:
        ValueError: If the response carries no text.
    """
    user_prompt = f"""
Labels:
{", ".join(labels)}

Issue:
{issue_text}
"""

    response = client.models.generate_content(
        model=MODEL,
        contents=[
            types.Content(
                role="user",
                parts=[types.Part(text=user_prompt)],
            ),
        ],
        # Pass the rules as a system instruction instead of concatenating them
        # into the user turn, so they stay separate from the issue text, which
        # is free-form user input.
        config=types.GenerateContentConfig(system_instruction=SYSTEM_INSTRUCTION),
    )

    text = response.text
    if text is None:
        # Raise a descriptive error rather than the opaque AttributeError that
        # calling .strip() on None would produce.
        raise ValueError("Model response contained no text")
    return text.strip()


In [None]:
# Label issues in batches to respect rate limits, checkpointing the full frame
# to LABELED_PATH after every batch so an interrupted run can be resumed.
import time

batch_size = 10  # Adjust based on rate limits

# Resume from a checkpoint if one exists. The checkpoint always contains ALL
# rows (unprocessed rows carry an empty label list), so progress has to be
# derived from the label lists themselves, not from the row count.
if Path(LABELED_PATH).exists():
    df_labeled = pl.read_parquet(LABELED_PATH)
    if "labels" in df_labeled.columns:
        labels_list = [
            list(row_labels) if row_labels is not None else []
            for row_labels in df_labeled["labels"].to_list()
        ]
    else:
        labels_list = [[] for _ in range(len(df_labeled))]
else:
    df_labeled = df_with_sentiment.clone()
    labels_list = [[] for _ in range(len(df_labeled))]

# Rows that still need labeling: never processed, failed on a previous run, or
# the model returned no valid label.
pending_rows = [idx for idx, row_labels in enumerate(labels_list) if not row_labels]
print(f"{len(pending_rows)} of {len(labels_list)} rows need labeling")

for batch_no, offset in enumerate(range(0, len(pending_rows), batch_size), start=1):
    batch_rows = pending_rows[offset:offset + batch_size]
    print(f"Processing batch {batch_no}: rows {batch_rows[0]} to {batch_rows[-1]}")

    for row_idx in batch_rows:
        desc = df_labeled["desc_clean"][row_idx]
        try:
            labels_str = classify_issue_multilabel(desc, LABELS)
            labels_list[row_idx] = validate_labels(labels_str, LABELS)
            time.sleep(2)  # Increased sleep to avoid rate limits
        except Exception as e:
            # Best effort: keep going; the row stays empty and is retried on
            # the next run of this cell.
            print(f"Error processing row {row_idx}: {e}")
            labels_list[row_idx] = []  # Assign empty list on error
            time.sleep(5)  # Longer sleep on error

    # Explicit dtype keeps the column a list of strings even when every list
    # seen so far is empty.
    df_labeled = df_labeled.with_columns(
        pl.Series("labels", labels_list, dtype=pl.List(pl.Utf8))
    )

    # Save progress
    df_labeled.write_parquet(LABELED_PATH)
    print(f"Saved progress after batch {batch_no}")

print("Labeling completed!")

In [None]:
# Spot-check two randomly sampled rows (no seed is set, so the rows differ
# between runs): cleaned description, sentiment score, and assigned labels.
df_labeled.sample(2).select("desc_clean", "sentiment", "labels").rows()

In [None]:
# df_labeled.write_parquet(LABELED_PATH)


In [None]:
# Reload the labeled issues from the LABELED_PATH checkpoint so the cells below
# also work without re-running the labeling loop.
df_labeled = pl.read_parquet(LABELED_PATH)

In [None]:
# check if any labels are empty
empty_label_count = df_labeled.filter(pl.col("labels").list.len() == 0).height
print(f"Number of issues with empty labels: {empty_label_count}")

In [None]:
# List iid and cleaned description of every issue that ended up without labels,
# for manual review.
df_labeled.filter(pl.col("labels").list.len() == 0).select("iid", "desc_clean").rows()

## Postprocess

In [None]:
# Post-process the labeled issues and persist the result.
# The input is the labeled snapshot (LABELED_PATH) and the output goes to
# POSTPROCESSED_PATH. Reading from the output path instead would fail on a
# fresh run (the file does not exist yet) and would re-process already
# post-processed data on every re-run.
df_postprocessed = u.postprocess_issues(pl.read_parquet(LABELED_PATH))
df_postprocessed.write_parquet(POSTPROCESSED_PATH)