**Import Libraries**

In [1]:
# Import necessary libraries
from collections import Counter, defaultdict  # Counter for frequency counts, defaultdict for grouping values by key

import joblib  # Joblib for saving and loading models
import matplotlib.pyplot as plt  # Matplotlib for plotting
import pandas as pd  # Pandas for data manipulation and analysis
from sklearn.cluster import KMeans  # KMeans for clustering data into k groups
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfVectorizer for converting text data to TF-IDF features
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score # Metrics for evaluating model performance
from sklearn.model_selection import train_test_split  # Train_test_split for splitting the data into training and testing sets
from sklearn.preprocessing import LabelEncoder  # LabelEncoder for encoding categorical target labels
from sklearn.svm import SVC  # SVC (Support Vector Classifier) for SVM classification

**Load The Dataset**

In [2]:
# Colab-only step: attach Google Drive so the dataset stored there can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Location of the already-preprocessed bug-report CSV on the mounted Drive
file_path = '/content/drive/My Drive/kaggle_dataset_after_preprocessing.csv'

# Read the CSV into a DataFrame; all later cells work from `dataset`
dataset = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Show the loaded DataFrame as the cell's output.
# The notebook renders a truncated preview (leading and trailing rows, with the
# middle elided) together with the column names and the row index.
# NOTE(review): the preview exposes assignee e-mail addresses; consider clearing
# this output before sharing the notebook.
dataset

Unnamed: 0,Summary_Stemmed,processed_summary,Assignee
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",scroll scroll mice touchpad etc scroll,amit@chromium.org
1,"['add', 'check', 'item', 'download', 'panel', ...",add check item download panel browser test,achuith@chromium.org
2,"['useafterfre', 'navig', 'document', 'form', '...",useafterfre navig document form valid messag s...,tkent@chromium.org
3,"['add', 'address', 'properli', 'autofil', 'opt...",add address properli autofil option dialog box,sky@chromium.org
4,"['libxmlgyp', 'defin', 'libxmlstat', 'direct',...",libxmlgyp defin libxmlstat direct depend,wtc@chromium.org
...,...,...,...
33446,"['onaddstream', 'fire', 'second', 'stream', 's...",onaddstream fire second stream sdp contain ams...,deadbeef@chromium.org
33447,"['default', 'toolchain', 'clang', 'linux', 'ca...",default toolchain clang linux caus build failu...,vivek...@samsung.com
33448,"['scatter', 'chart', 'updat', 'modifi', 'valu'...",scatter chart updat modifi valu refer cell,amol.w...@synerzip.com
33449,"['consid', 'use', '1em', 'margin', 'list', 'it...",consid use 1em margin list item identifi speci...,glebl@chromium.org


**Split the dataset into training and testing sets**

In [5]:
def leave_out_bugs(df, leave_out_ratio=0.2, random_state=42):
    """
    Split the dataset into training and testing sets, leaving out a specified ratio of bugs for each assignee.

    The split is done independently inside every 'Assignee' group, so each
    assignee that can be split is represented in both outputs. An assignee with
    a single bug cannot be split (the training side would be empty and
    scikit-learn raises ValueError); such a bug is kept in the training set.

    Rows whose 'Assignee' is missing do not belong to any group (pandas groupby
    default) and therefore appear in neither output.

    Args:
        df (pd.DataFrame): The input dataset containing bug reports. Must have an 'Assignee' column.
        leave_out_ratio (float): The ratio of bugs to leave out for testing (default is 0.2).
        random_state (int): Seed passed to train_test_split for every group (default is 42).

    Returns:
        pd.DataFrame: The training set (original index values preserved).
        pd.DataFrame: The testing set (original index values preserved). Empty, with the
            same columns as df, when no group could be split.
    """
    # Per-assignee pieces that are concatenated once at the end
    train_parts = []
    test_parts = []

    for _, group in df.groupby('Assignee'):
        if len(group) < 2:
            # A single bug cannot be divided; keep it for training rather than crash
            train_parts.append(group)
            continue

        train_group, test_group = train_test_split(
            group, test_size=leave_out_ratio, random_state=random_state
        )
        train_parts.append(train_group)
        test_parts.append(test_group)

    # pd.concat raises on an empty list, so fall back to an empty frame with df's columns
    empty = df.iloc[0:0]
    train_df = pd.concat(train_parts) if train_parts else empty
    test_df = pd.concat(test_parts) if test_parts else empty
    return train_df, test_df

# Hold out 20% of each assignee's bugs as the test set; the rest is used for training
train_df, test_df = leave_out_bugs(df=dataset, leave_out_ratio=0.2)

**Extract features and labels for training and testing datasets**

In [6]:
# Features are the preprocessed summary text; labels are the assignee of each bug
X_train, y_train = train_df['processed_summary'], train_df['Assignee']
X_test, y_test = test_df['processed_summary'], test_df['Assignee']

**Apply TF-IDF Transformation**

In [43]:
# Initialize the TF-IDF vectorizer.
# ngram_range=(1, 2) extracts both unigrams and bigrams as features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the training text only and transform it.
# X_train_tfidf: TF-IDF matrix where rows are training documents and columns are TF-IDF features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the left-out (test) text with the vectorizer fitted on the training data,
# so both matrices share the same feature columns.
# X_test_tfidf: TF-IDF matrix for the test documents.
X_test_tfidf = tfidf_vectorizer.transform(X_test)



**Clustering Bugs Using KMeans**

In [None]:
# How many groups KMeans should partition the bugs into (tune as needed)
num_clusters = 200  # Adjust the number of clusters as needed

# Build the model with a fixed seed for reproducibility and fit it on the
# training TF-IDF matrix in one step (fit returns the fitted estimator)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X_train_tfidf)

# Cluster id of every training bug
cluster_labels_train = kmeans.predict(X_train_tfidf)

# Cluster id of every left-out (test) bug, using the centroids learned above
cluster_labels_test = kmeans.predict(X_test_tfidf)

**Mapping Developers to Clusters and Evaluating Left-out Bug Assignments**

In [41]:
# Create a mapping from developers to the cluster ids of their training bugs
developer_to_clusters = defaultdict(list)
for developer, cluster_label in zip(y_train, cluster_labels_train):
    developer_to_clusters[developer].append(cluster_label)

# Determine the majority (most frequent) cluster for each developer.
# Counts are computed once per developer; ties are broken deterministically in
# favour of the smallest cluster id instead of depending on set iteration order.
majority_cluster_for_developer = {}
for developer, clusters in developer_to_clusters.items():
    cluster_counts = Counter(clusters)
    majority_cluster_for_developer[developer] = min(
        cluster_counts, key=lambda cluster: (-cluster_counts[cluster], cluster)
    )

# Verify the left-out bugs: a bug counts as correct when it lands in the
# majority cluster of the developer it is actually assigned to
correct_assignments = 0
total_assignments = len(cluster_labels_test)

for cluster_label, developer in zip(cluster_labels_test, y_test):
    # Developers without any training bug have no majority cluster and never match
    if developer in majority_cluster_for_developer and cluster_label == majority_cluster_for_developer[developer]:
        correct_assignments += 1

# Fraction of left-out bugs that fell into their developer's majority cluster;
# reported as 0.0 when there are no left-out bugs, to avoid dividing by zero
accuracy = correct_assignments / total_assignments if total_assignments else 0.0

# Print the accuracy of assigning left-out bugs
print(f"Accuracy of assigning left-out bugs: {accuracy:.2f}")


Accuracy of assigning left-out bugs: 0.22


In [None]:
# Extract true labels (assignees) for the training data and for the left-out data.
# The left-out (test) split is reported as the "validation" set below.
true_labels_train = train_df['Assignee'].values
true_labels_val = test_df['Assignee'].values


def clustering_scores(true_labels, cluster_labels):
    """
    Compare cluster assignments against the true labels with five external metrics.

    Args:
        true_labels: Ground-truth class label of every sample.
        cluster_labels: Cluster id assigned to every sample (same order as true_labels).

    Returns:
        dict: Display name -> score, in the order the scores are reported.
    """
    return {
        # Pair-counting agreement between the two labelings, corrected for chance:
        # close to 0 for random labelings, 1 for identical ones, and bounded below
        # by -0.5 for especially discordant ones (per scikit-learn's documentation).
        "Adjusted Rand Index": adjusted_rand_score(true_labels, cluster_labels),
        # Mutual information between labels and clusters, normalized to [0, 1].
        # Unlike ARI it is NOT adjusted for chance.
        "Normalized Mutual Information": normalized_mutual_info_score(true_labels, cluster_labels),
        # 1.0 when every cluster contains only members of a single class. Range [0, 1].
        "Homogeneity": homogeneity_score(true_labels, cluster_labels),
        # 1.0 when all members of a given class are assigned to the same cluster. Range [0, 1].
        "Completeness": completeness_score(true_labels, cluster_labels),
        # Harmonic mean of homogeneity and completeness. Range [0, 1].
        "V-measure": v_measure_score(true_labels, cluster_labels),
    }


def print_scores(title, scores):
    """Print a title line followed by one 'name: value' line per score."""
    print(title)
    for name, value in scores.items():
        print(f"{name}: {value}")


# Evaluation metrics for the training set (individual names kept for later use)
scores_train = clustering_scores(true_labels_train, cluster_labels_train)
(ari_train, nmi_train, homogeneity_train,
 completeness_train, v_measure_train) = scores_train.values()
print_scores("Training set evaluation:", scores_train)

# Evaluation metrics for the validation set
scores_val = clustering_scores(true_labels_val, cluster_labels_test)
(ari_val, nmi_val, homogeneity_val,
 completeness_val, v_measure_val) = scores_val.values()
print_scores("\nValidation set evaluation:", scores_val)

Training set evaluation:
Adjusted Rand Index: 0.01085768696902282
Normalized Mutual Information: 0.38075345293706814
Homogeneity: 0.3352502440076427
Completeness: 0.4405487288918974
V-measure: 0.38075345293706825
Validation set evaluation:
Adjusted Rand Index: 0.01036117981469603
Normalized Mutual Information: 0.5181144162419392
Homogeneity: 0.45274150313976985
Completeness: 0.6055521940614514
V-measure: 0.5181144162419391


**Elbow Method for Optimal Number of Clusters**

In [None]:
# Calculate Within-Cluster Sum of Squares (WCSS) for different numbers of clusters
wcss = []
max_clusters = 100
cluster_counts = range(1, max_clusters + 1)

# Fit one throwaway model per candidate k. A separate name is used so the
# fitted `kmeans` model from the clustering cell is not overwritten.
# NOTE(review): the scan stops at max_clusters = 100, while the clustering cell
# uses num_clusters = 200, so the chosen value lies outside the plotted range.
for n_clusters in cluster_counts:
    elbow_model = KMeans(n_clusters=n_clusters, random_state=42)
    elbow_model.fit(X_train_tfidf)
    # inertia_ is the model's within-cluster sum of squares
    wcss.append(elbow_model.inertia_)

# Plot the elbow curve to find the optimal number of clusters
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

