In [1]:
import os
import git
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import openai
from openai import OpenAI

# Set OpenAI API key
os.environ['OPENAI_API_KEY'] = 'myapikey'

### Cluster git commits from repo 'pytorch' using k-means and generate a description of each cluster using openAI api with model gpt-3.5-turbo.

In [2]:
# Set your OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

def describe_clusters(commit_messages, labels, num_clusters):
    cluster_descriptions = []

    for cluster_id in range(num_clusters):
        # Get all commit messages for this cluster
        cluster_messages = [commit_messages[i] for i in range(len(labels)) if labels[i] == cluster_id]
        
        # Use only a subset if there are too many messages
        if len(cluster_messages) > 10:
            cluster_messages = cluster_messages[:10]
        
        prompt = "Here are some commit messages and diffs from a project. First, give a short summaries and then provide details of the main themes and topics described in these messages and diffs:\n\n"
        prompt += "\n\n".join(cluster_messages)
        
        client = OpenAI(
        # This is the default and can be omitted
            api_key=openai.api_key
        )
        response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
        {
            "role": "user",
            "content": prompt,
        }
        ],
        max_tokens=100
        )
        description = response.choices[0].message.content
        cluster_descriptions.append("Cluster {}: {}".format(cluster_id, description) + "\n")
    
    return cluster_descriptions

repo_path = '~/Downloads/pytorch'  # Update this path to your cloned repo
repo = git.Repo(repo_path)

# Extract commit messages and diffs
commits = list(repo.iter_commits('main', max_count=7))
commit_contents = []

for commit in commits:
    commit_message = commit.message
    commit_diff = commit.diff(create_patch=True)

    diff_texts = []
    for diff in commit_diff:
        diff_texts.append(diff.diff.decode('utf-8'))

    commit_diff_text = '\n'.join(diff_texts)
    content = f"Commit Message:\n{commit_message}\n\nCommit Diff:\n{commit_diff_text}"
    commit_contents.append(content)

# Check if commit contents are not empty
if not commit_contents:
    raise ValueError("No commit contents found in the repository")

# Vectorize commit contents using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(commit_contents)

# Perform k-means clustering
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(X)
labels = kmeans.labels_

# Get cluster descriptions
cluster_descriptions = describe_clusters(commit_contents, labels, num_clusters)

# Print cluster descriptions
for description in cluster_descriptions:
    print(description)


Cluster 0: Summary: This commit message and diff involve reverting a previous commit related to distributed debug handlers in a project.

Main Themes:
1. Reverting Changes: The primary focus of this commit is to revert a specific commit that was previously made related to distributed debug handlers within the project.
2. Build Execution Configuration: There are adjustments made to the build configuration with changes to shared linker flags on Linux and the display of LD flags for different types (Shared, Static, Module).
3. Tensor Creation: Some

Cluster 1: **Themes and Topics:**

1. **New Features in CommDebugMode:**
   - The primary theme across both commits is the enhancement of CommDebugMode functionality with the addition of new tracing features - `c10d alltoall_` and `c10d alltoall_base_`.
   - This indicates a focus on improving debugging and monitoring capabilities within the distributed communication module.

2. **Integration of WorkerServer:**
   - Both commits introduce the 

### Here I list all the commit message for each cluster

In [3]:
import os
import git
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from collections import Counter

def describe_clusters(commit_contents, labels, num_clusters, commit_messages):
    cluster_descriptions = []

    for cluster_id in range(num_clusters):
        cluster_contents = [commit_messages[i] for i in range(len(labels)) if labels[i] == cluster_id]
        combined_text = "\n\n\n".join(cluster_contents)
        
        description = "Cluster {}: Commit messages: {}".format(cluster_id, ", ".join(commit_messages))
        cluster_descriptions.append(description)
    
    return cluster_descriptions

# The rest of your script remains the same
repo_path = '~/Downloads/pytorch'  # Update this path to your cloned repo
repo = git.Repo(repo_path)

# Extract commit messages and diffs
commits = list(repo.iter_commits('main', max_count=20))
commit_contents = []
commit_messages = []

for commit in commits:
    commit_message = commit.message
    commit_diff = commit.diff(create_patch=True)

    diff_texts = []
    for diff in commit_diff:
        diff_texts.append(diff.diff.decode('utf-8'))

    commit_diff_text = '\n'.join(diff_texts)
    content = f"Commit Message:\n{commit_message}\n\nCommit Diff:\n{commit_diff_text}"
    commit_contents.append(content)
    commit_messages.append("Commit----------------------------------\n"+commit_message)
    
# Check if commit contents are not empty
if not commit_contents:
    raise ValueError("No commit contents found in the repository")

# Vectorize commit contents using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(commit_contents)

# Perform k-means clustering
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(X)
labels = kmeans.labels_

# Get cluster descriptions
cluster_descriptions = describe_clusters(commit_contents, labels, num_clusters, commit_messages)

# Print cluster descriptions
for description in cluster_descriptions:
    print(description)


Cluster 0: Commit messages: Commit----------------------------------
Add linker script optimization flag to CMAKE rule for CUDA ARM wheel (#127514)

Original PR - https://github.com/pytorch/pytorch/pull/127220

Pull Request resolved: https://github.com/pytorch/pytorch/pull/127514
Approved by: https://github.com/Aidyn-A, https://github.com/atalman
, Commit----------------------------------
Revert "[dynamo] Bugfix for nn parameter construction (#127806)"

This reverts commit f27c4dd862bf79f37019ef277957cd577d57b66f.

Reverted https://github.com/pytorch/pytorch/pull/127806 on behalf of https://github.com/PaliC due to causing nn tests to fail ([comment](https://github.com/pytorch/pytorch/pull/127806#issuecomment-2148393903))
, Commit----------------------------------
Reapply "distributed debug handlers (#126601)" (#127805)

This reverts commit 7646825c3eb687030c4f873b01312be0eed80174.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/127805
Approved by: https://github.com/Pal