In [1]:
pip install nimfa

Collecting nimfa
  Downloading nimfa-1.4.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nimfa
Successfully installed nimfa-1.4.0


In [2]:
!pip install nltk



In [3]:
import pandas as pd
import os
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
def convert_lower_case(data):
    return str(data).lower()

In [9]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [10]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [11]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [12]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [13]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [14]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [16]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('/content/preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [17]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('/content/preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [24]:
# remove the stop words from the preprocessed data using nltk
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(preprocessed_train_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_train_df['bug_description'][0])))

# preprocess the first report of the testing data
print(preprocess(preprocessed_test_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_test_df['bug_description'][0])))


event bookmarked project option sending notification non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability list sub project project
rest api ability list sub project project


In [26]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

# Show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

# Show the first 5 rows of the preprocessed testing data
print(preprocessed_test_df.head())


                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [29]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') |
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security') |
    (preprocessed_train_df['class_name'] == 'Documentation')
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') |
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security') |
    (preprocessed_test_df['class_name'] == 'Documentation')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [30]:
# print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())

['Backend' 'Frontend' 'Security' 'Documentation']
['Backend' 'Frontend' 'Documentation' 'Security']


## Feature Exraction

In [31]:
print(len(filtered_train_df))

13777


In [32]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2,
    'Documentation': 3
}

In [34]:
import torchtext.vocab as vocab

# Load pre-trained GloVe embeddings
glove = vocab.GloVe(name='6B', dim=300)


.vector_cache/glove.6B.zip: 862MB [02:39, 5.40MB/s]                           
100%|█████████▉| 399999/400000 [01:05<00:00, 6118.55it/s]


In [35]:
# Tokenize bug reports and map tokens to GloVe embeddings
def tokenize_and_map_to_glove(text):
    tokens = text.split()
    embeddings = [glove[token.lower()] for token in tokens if token.lower() in glove.stoi]
    return embeddings

In [36]:
# Example usage:
tokenized_bug_reports_train = [tokenize_and_map_to_glove(text) for text in filtered_train_df['bug_description']]
tokenized_bug_reports_test = [tokenize_and_map_to_glove(text) for text in filtered_test_df['bug_description']]

In [37]:
# Aggregate token embeddings (e.g., by averaging)
def aggregate_embeddings(embeddings):
    if embeddings:
        return torch.stack(embeddings).mean(dim=0)
    else:
        # Return a zero vector if no embeddings are found
        return torch.zeros(glove.vectors.shape[1])

In [39]:
import torch

# Example usage:
X_train = torch.stack([aggregate_embeddings(embeddings) for embeddings in tokenized_bug_reports_train])
X_test = torch.stack([aggregate_embeddings(embeddings) for embeddings in tokenized_bug_reports_test])

# Now you can use X_train and X_test as features for your classification model


In [40]:
import numpy as np

# Save X_train and X_test to files
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)


In [None]:
import numpy as np

# Load X_train and X_test from files
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')


In [53]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_clustering(data, class_name, num_clusters=4):
    """
    Apply hierarchical clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - num_clusters: Number of clusters to be generated (default is 4).

    Returns:
    - None
    """
    # Apply hierarchical clustering
    clustering = AgglomerativeClustering(n_clusters=num_clusters)
    cluster_labels = clustering.fit_predict(data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        0: 'Backend',
        1: 'Frontend',
        2: 'Security',
        3: 'Documentation'
        # Add more mappings if needed
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the number of clusters
num_clusters = 4  # Adjust as needed

# Apply clustering on training data
apply_clustering(X_train, filtered_train_df['class_name'], num_clusters=num_clusters)

# Apply clustering on test data
apply_clustering(X_test, filtered_test_df['class_name'], num_clusters=num_clusters)


Classification Report:
               precision    recall  f1-score   support

      Backend       0.65      0.41      0.50      7437
Documentation       0.01      0.03      0.01       174
     Frontend       0.77      0.38      0.51      5799
     Security       0.04      0.52      0.07       367

     accuracy                           0.39     13777
    macro avg       0.37      0.33      0.27     13777
 weighted avg       0.68      0.39      0.49     13777

Confusion Matrix:
[[3012  938  555 2932]
 [  84    5   30   55]
 [1458   20 2228 2093]
 [ 108    3   66  190]]
Accuracy: 0.3944980765043188
Precision: 0.6755257556522494
Recall: 0.3944980765043188
F1 Score: 0.48680963728651483
Classification Report:
               precision    recall  f1-score   support

      Backend       0.61      0.54      0.57      1345
Documentation       0.01      0.38      0.02        21
     Frontend       0.36      0.15      0.21       987
     Security       0.01      0.01      0.01        70

     ac

In [56]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_gmm_clustering(data, class_name, num_clusters=4, random_state=42):
    """
    Apply Gaussian Mixture Model clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - num_clusters: Number of clusters to be generated (default is 4).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Apply Gaussian Mixture Model clustering
    gmm = GaussianMixture(n_components=num_clusters, random_state=random_state)
    cluster_labels = gmm.fit_predict(data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        0: 'Backend',
        1: 'Frontend',
        2: 'Security',
        3: 'Documentation'
        # Add more mappings if needed
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the number of clusters
num_clusters = 4  # Adjust as needed

# Apply GMM clustering on training data
apply_gmm_clustering(X_train, filtered_train_df['class_name'], num_clusters=num_clusters)

# Apply GMM clustering on test data
apply_gmm_clustering(X_test, filtered_test_df['class_name'], num_clusters=num_clusters)


Classification Report:
               precision    recall  f1-score   support

      Backend       0.25      0.18      0.21      7437
Documentation       0.01      0.03      0.01       174
     Frontend       0.40      0.27      0.32      5799
     Security       0.01      0.09      0.02       367

     accuracy                           0.21     13777
    macro avg       0.17      0.14      0.14     13777
 weighted avg       0.30      0.21      0.25     13777

Confusion Matrix:
[[1330  977 2103 3027]
 [  19    6  102   47]
 [3804   26 1544  425]
 [ 178    5  152   32]]
Accuracy: 0.21136677070479784
Precision: 0.3015888718062606
Recall: 0.21136677070479784
F1 Score: 0.24702568549861834
Classification Report:
               precision    recall  f1-score   support

      Backend       0.58      0.44      0.50      1345
Documentation       0.01      0.05      0.01        21
     Frontend       0.63      0.42      0.50       987
     Security       0.04      0.27      0.06        70

     

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_spectral_clustering(data, class_name, num_clusters=4, affinity='rbf', random_state=42):
    """
    Apply Spectral Clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - num_clusters: Number of clusters to be generated (default is 4).
    - affinity: The affinity matrix to use (default is 'rbf').
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Apply Spectral Clustering
    spectral = SpectralClustering(n_clusters=num_clusters, affinity=affinity, random_state=random_state)
    cluster_labels = spectral.fit_predict(data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the number of clusters and affinity metric
num_clusters = 4  # Adjust as needed
affinity = 'rbf'  # Options: {'nearest_neighbors', 'rbf', 'precomputed'}

# Apply Spectral Clustering on training data
apply_spectral_clustering(X_train, filtered_train_df['class_name'], num_clusters=num_clusters, affinity=affinity)

# Apply Spectral Clustering on test data
apply_spectral_clustering(X_test, filtered_test_df['class_name'], num_clusters=num_clusters, affinity=affinity)


In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_affinity_propagation(data, class_name, damping=0.5, random_state=None):
    """
    Apply Affinity Propagation to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - damping: Damping factor (between 0.5 and 1) (default is 0.5).
    - random_state: Random seed for reproducibility (default is None).

    Returns:
    - None
    """
    # Apply Affinity Propagation
    affinity_propagation = AffinityPropagation(damping=damping, random_state=random_state)
    cluster_labels = affinity_propagation.fit_predict(data)

    # Number of clusters
    num_clusters = len(np.unique(cluster_labels))

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Number of Clusters:", num_clusters)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define damping factor and random state
damping = 0.5  # Adjust as needed
random_state = 42  # Adjust as needed

# Apply Affinity Propagation on training data
apply_affinity_propagation(X_train, filtered_train_df['class_name'], damping=damping, random_state=random_state)

# Apply Affinity Propagation on test data
apply_affinity_propagation(X_test, filtered_test_df['class_name'], damping=damping, random_state=random_state)


In [None]:
from sklearn.cluster import OPTICS
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_optics(data, class_name, min_samples=5, xi=0.05, min_cluster_size=0.1):
    """
    Apply OPTICS clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - min_samples: The number of samples in a neighborhood for a data point to be considered a core point (default is 5).
    - xi: Determines the minimum steepness of the cluster hierarchy that will be returned (default is 0.05).
    - min_cluster_size: The minimum number of samples in a cluster (default is 0.1).

    Returns:
    - None
    """
    # Apply OPTICS clustering
    optics = OPTICS(min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)
    cluster_labels = optics.fit_predict(data)

    # Number of clusters
    num_clusters = len(np.unique(cluster_labels))

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Number of Clusters:", num_clusters)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define parameters for OPTICS
min_samples = 5  # Adjust as needed
xi = 0.05  # Adjust as needed
min_cluster_size = 0.1  # Adjust as needed

# Apply OPTICS on training data
apply_optics(X_train, filtered_train_df['class_name'], min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)

# Apply OPTICS on test data
apply_optics(X_test, filtered_test_df['class_name'], min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)


In [None]:
from sklearn.cluster import MeanShift
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_mean_shift(data, class_name, bandwidth=None):
    """
    Apply Mean Shift clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - bandwidth: Bandwidth parameter for the Mean Shift algorithm (optional).

    Returns:
    - None
    """
    # Apply Mean Shift clustering
    mean_shift = MeanShift(bandwidth=bandwidth)
    cluster_labels = mean_shift.fit_predict(data)

    # Number of clusters
    num_clusters = len(np.unique(cluster_labels))

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Number of Clusters:", num_clusters)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define bandwidth parameter for Mean Shift
bandwidth = 0.1  # Adjust as needed

# Apply Mean Shift clustering on training data
apply_mean_shift(X_train, filtered_train_df['class_name'], bandwidth=bandwidth)

# Apply Mean Shift clustering on test data
apply_mean_shift(X_test, filtered_test_df['class_name'], bandwidth=bandwidth)


In [None]:
from sklearn.cluster import Birch
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_birch(data, class_name, threshold=0.5, branching_factor=50):
    """
    Apply Birch clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - threshold: The branching factor threshold (optional).
    - branching_factor: The number of subclusters in each node (optional).

    Returns:
    - None
    """
    # Apply Birch clustering
    birch = Birch(threshold=threshold, branching_factor=branching_factor)
    cluster_labels = birch.fit_predict(data)

    # Number of clusters
    num_clusters = len(np.unique(cluster_labels))

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Number of Clusters:", num_clusters)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define Birch parameters
threshold = 0.5  # Adjust as needed
branching_factor = 50  # Adjust as needed

# Apply Birch clustering on training data
apply_birch(X_train, filtered_train_df['class_name'], threshold=threshold, branching_factor=branching_factor)

# Apply Birch clustering on test data
apply_birch(X_test, filtered_test_df['class_name'], threshold=threshold, branching_factor=branching_factor)


In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_mini_batch_kmeans(data, class_name, num_clusters=4, random_state=42):
    """
    Apply MiniBatchKMeans clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - num_clusters: Number of clusters to be generated (default is 4).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Apply MiniBatchKMeans clustering
    mbkmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=random_state)
    cluster_labels = mbkmeans.fit_predict(data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("Number of Clusters:", num_clusters)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the number of clusters
num_clusters = 4  # Adjust as needed

# Apply MiniBatchKMeans clustering on training data
apply_mini_batch_kmeans(X_train, filtered_train_df['class_name'], num_clusters=num_clusters)

# Apply MiniBatchKMeans clustering on test data
apply_mini_batch_kmeans(X_test, filtered_test_df['class_name'], num_clusters=num_clusters)


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_dbscan(data, class_name, eps=0.5, min_samples=5):
    """
    Apply DBSCAN clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other (default is 0.5).
    - min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point (default is 5).

    Returns:
    - None
    """
    # Apply DBSCAN clustering
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    cluster_labels = dbscan.fit_predict(data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("DBSCAN Parameters:")
    print("Epsilon:", eps)
    print("Min Samples:", min_samples)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define DBSCAN parameters
eps = 0.5  # Maximum distance between two samples
min_samples = 5  # Minimum number of samples in a neighborhood

# Apply DBSCAN clustering on training data
apply_dbscan(X_train, filtered_train_df['class_name'], eps=eps, min_samples=min_samples)

# Apply DBSCAN clustering on test data
apply_dbscan(X_test, filtered_test_df['class_name'], eps=eps, min_samples=min_samples)


In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_pca_kmeans(data, class_name, num_clusters=4, pca_components=50, random_state=42):
    """
    Apply PCA followed by K-means clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - num_clusters: Number of clusters to be generated by K-means (default is 4).
    - pca_components: Number of principal components to retain (default is 50).
    - random_state: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=pca_components, random_state=random_state)
    reduced_data = pca.fit_transform(data)

    # Apply K-means clustering on the reduced data
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(reduced_data)

    # Map cluster labels to class names
    cluster_class_mapping = {
        label: f'Cluster_{label}' for label in np.unique(cluster_labels)
    }
    predicted_class_names = [cluster_class_mapping[label] for label in cluster_labels]

    # Evaluate clustering performance
    print("PCA Components:", pca_components)
    print("Classification Report:")
    print(classification_report(class_name, predicted_class_names))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, predicted_class_names))

    accuracy = np.mean(class_name == predicted_class_names)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, predicted_class_names, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the number of clusters and PCA components
num_clusters = 4  # Adjust as needed
pca_components = 50  # Adjust as needed

# Apply PCA followed by K-means clustering on training data
apply_pca_kmeans(X_train, filtered_train_df['class_name'], num_clusters=num_clusters, pca_components=pca_components)

# Apply PCA followed by K-means clustering on test data
apply_pca_kmeans(X_test, filtered_test_df['class_name'], num_clusters=num_clusters, pca_components=pca_components)


In [None]:
pip install MiniSom

In [None]:
from minisom import MiniSom
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_som(data, class_name, grid_size=(10, 10), num_iterations=100, random_seed=42):
    """
    Apply Self-Organizing Maps (SOM) clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - grid_size: Tuple specifying the size of the SOM grid (default is (10, 10)).
    - num_iterations: Number of iterations for training the SOM (default is 100).
    - random_seed: Random seed for reproducibility (default is 42).

    Returns:
    - None
    """
    # Initialize the SOM
    som = MiniSom(grid_size[0], grid_size[1], data.shape[1], sigma=1.0, learning_rate=0.5, random_seed=random_seed)

    # Train the SOM
    som.train_random(data, num_iterations)

    # Find the best-matching units (BMUs) for each data point
    bmu_indices = np.array([som.winner(x) for x in data])

    # Map BMUs to cluster labels
    cluster_labels = np.ravel_multi_index(bmu_indices.T, grid_size)

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, cluster_labels))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, cluster_labels))

    accuracy = np.mean(class_name == cluster_labels)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, cluster_labels, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the grid size and number of iterations
grid_size = (10, 10)  # Adjust as needed
num_iterations = 100   # Adjust as needed

# Apply SOM clustering on training data
apply_som(X_train, filtered_train_df['class_name'], grid_size=grid_size, num_iterations=num_iterations)

# Apply SOM clustering on test data
apply_som(X_test, filtered_test_df['class_name'], grid_size=grid_size, num_iterations=num_iterations)


In [None]:
pip install hdbscan


In [None]:
import hdbscan
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_hdbscan(data, class_name, min_cluster_size=5, min_samples=5):
    """
    Apply HDBSCAN clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - min_cluster_size: The minimum size of clusters (default is 5).
    - min_samples: The number of samples in a neighborhood for a point to be considered as a core point (default is 5).

    Returns:
    - None
    """
    # Apply HDBSCAN clustering
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    cluster_labels = clusterer.fit_predict(data)

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, cluster_labels))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, cluster_labels))

    accuracy = np.mean(class_name == cluster_labels)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, cluster_labels, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the minimum cluster size and minimum samples
min_cluster_size = 5  # Adjust as needed
min_samples = 5       # Adjust as needed

# Apply HDBSCAN clustering on training data
apply_hdbscan(X_train, filtered_train_df['class_name'], min_cluster_size=min_cluster_size, min_samples=min_samples)

# Apply HDBSCAN clustering on test data
apply_hdbscan(X_test, filtered_test_df['class_name'], min_cluster_size=min_cluster_size, min_samples=min_samples)


In [None]:
pip install scikit-learn-extra


In [None]:
from sklearn_extra.cluster import OPTICS
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

def apply_optics_hierarchical(data, class_name, min_samples=5):
    """
    Apply OPTICS clustering with hierarchical clustering to GloVe embeddings and evaluate the performance.

    Parameters:
    - data: DataFrame containing the GloVe embeddings.
    - class_name: Series containing the class names corresponding to the data.
    - min_samples: The number of samples in a neighborhood for a point to be considered as a core point (default is 5).

    Returns:
    - None
    """
    # Apply OPTICS clustering
    optics = OPTICS(min_samples=min_samples)
    optics.fit(data)

    # Apply hierarchical clustering to reachability plot
    hierarchical_clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
    hierarchical_clusterer.fit(optics.reachability_)

    # Evaluate clustering performance
    print("Classification Report:")
    print(classification_report(class_name, hierarchical_clusterer.labels_))

    print("Confusion Matrix:")
    print(confusion_matrix(class_name, hierarchical_clusterer.labels_))

    accuracy = np.mean(class_name == hierarchical_clusterer.labels_)
    print("Accuracy:", accuracy)

    precision, recall, f1_score, _ = precision_recall_fscore_support(class_name, hierarchical_clusterer.labels_, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)

# Assuming you have obtained GloVe embeddings and stored them in X_train and X_test
# Define the minimum number of samples
min_samples = 5  # Adjust as needed

# Apply OPTICS with hierarchical clustering on training data
apply_optics_hierarchical(X_train, filtered_train_df['class_name'], min_samples=min_samples)

# Apply OPTICS with hierarchical clustering on test data
apply_optics_hierarchical(X_test, filtered_test_df['class_name'], min_samples=min_samples)
