In [13]:
import pandas as pd
import os
import numpy as np
import string
import re
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def convert_lower_case(data):
    return str(data).lower()

In [15]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [16]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [17]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [18]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [19]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [20]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [23]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [24]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [25]:
def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(preprocessed_train_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_train_df['bug_description'][0])))

# preprocess the first report of the testing data
print(preprocess(preprocessed_test_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_test_df['bug_description'][0])))

preprocessed_test_df['bug_description'][0]

for any event on my bookmarked project option not sending notification for non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability to list sub project for a project
rest api ability list sub project project


'rest api ability to list sub project for a project'

In [26]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

print( preprocessed_test_df['bug_description'][0] )
print( preprocessed_train_df['bug_description'][0] )


rest api ability list sub project project
event bookmarked project option sending notification non member bookmarked project


In [27]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') |
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security')
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') |
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [28]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2
}

# Map class names in both training and testing data to the desired order
filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)

# order them based on the number of class_label
filtered_train_df = filtered_train_df.sort_values(by=['class_label'])
filtered_test_df = filtered_test_df.sort_values(by=['class_label'])

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# Print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())


['Frontend' 'Backend' 'Security']
['Frontend' 'Backend' 'Security']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)


## Feature Exraction

In [29]:
import pandas as pd

def try_ngram_combinations(data, ngram_ranges, sample_size=200):
    """
    Try different combinations of n-grams using TfidfVectorizer.

    Args:
    - data: The input data to be transformed.
    - ngram_ranges: A list of tuples representing the n-gram ranges to try.
    - sample_size: The number of samples to take from each class.

    Returns:
    - A list of tuples containing the transformed data and corresponding vectorizer instances for each n-gram combination.
    """
    results = []

    # Initialize an empty DataFrame to store the sampled data
    sampled_data = pd.DataFrame(columns=data.columns)

    # Sample the specified number of samples from each class
    for class_name in data['class_name'].unique():
        class_samples = data[data['class_name'] == class_name].sample(n=sample_size, random_state=42)
        sampled_data = pd.concat([sampled_data, class_samples])

    for ngrams in ngram_ranges:
        print(f"Processing n-gram range: {ngrams}")

        # Initialize the TfidfVectorizer
        vectorizer = TfidfVectorizer(ngram_range=ngrams)

        # Fit and transform the sampled data
        X_transformed = vectorizer.fit_transform(sampled_data['bug_description'])

        # Append the transformed data and vectorizer to the results list
        results.append((X_transformed, vectorizer))

    return results

# Example usage:

# all n-gram ranges till (6,7)
ngram_ranges = [(1, 1), (1, 2), (1, 3) , (1,4) , (1,5) , (1,6) , (1,7) , (1,8) , (1,9) , (1,10) ,(1,11) , (1,12) , (1,13) , (1,14) ]
#ngram_ranges = [(2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7) , (2,8) , (2,9) , (2,10) , (2,11) , (2,12) , (2,13) , (2,14) , (2,15)]

transformed_data = try_ngram_combinations(filtered_train_df, ngram_ranges, sample_size=350)

# Print the shape of transformed data for each combination
for data, vectorizer in transformed_data:
    print(f"Shape of transformed data for n-gram range {vectorizer.ngram_range}: {data.shape}")


Processing n-gram range: (1, 1)
Processing n-gram range: (1, 2)
Processing n-gram range: (1, 3)
Processing n-gram range: (1, 4)
Processing n-gram range: (1, 5)
Processing n-gram range: (1, 6)
Processing n-gram range: (1, 7)
Processing n-gram range: (1, 8)
Processing n-gram range: (1, 9)
Processing n-gram range: (1, 10)
Processing n-gram range: (1, 11)
Processing n-gram range: (1, 12)
Processing n-gram range: (1, 13)
Processing n-gram range: (1, 14)
Shape of transformed data for n-gram range (1, 1): (1050, 2119)
Shape of transformed data for n-gram range (1, 2): (1050, 7725)
Shape of transformed data for n-gram range (1, 3): (1050, 12830)
Shape of transformed data for n-gram range (1, 4): (1050, 16999)
Shape of transformed data for n-gram range (1, 5): (1050, 20222)
Shape of transformed data for n-gram range (1, 6): (1050, 22575)
Shape of transformed data for n-gram range (1, 7): (1050, 24202)
Shape of transformed data for n-gram range (1, 8): (1050, 25273)
Shape of transformed data for

In [30]:
# print the vector representation of the first report
print(transformed_data[0][0][0])

  (0, 298)	0.46191225500985983
  (0, 1642)	0.40364883059940254
  (0, 389)	0.3822538474978055
  (0, 1696)	0.34798112074686754
  (0, 1016)	0.37376309676775177
  (0, 424)	0.39205569986792155
  (0, 39)	0.25117583800989696


In [31]:
# Print the number of unique class_name in the training data
print(filtered_train_df['class_name'].nunique())

# print their unique values
print(filtered_train_df['class_name'].unique())

# print the number of reports in each class
print(filtered_train_df['class_name'].value_counts())



3
['Frontend' 'Backend' 'Security']
class_name
Backend     7437
Frontend    5799
Security     367
Name: count, dtype: int64


In [32]:
# Create a mapping between cluster labels and class names
cluster_class_mapping = {
    1: 'Backend',  # Example mapping, adjust based on your actual clusters
    0: 'Frontend',
    2: 'Security'
}


In [34]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import pandas as pd

# Initialize the SVM model
model = SVC(C = 100)

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the model
    scores = cross_val_score(model, data, filtered_test_df['class_name'], cv=5)
    print(f"Cross-Validation Scores: {scores}")

    # Fit the model on the entire training data
    model.fit(data, filtered_test_df['class_name'])

    # Predict the class labels for the testing data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy: {accuracy}")


Model trained using n-gram range: (1, 1)


ValueError: Found input variables with inconsistent numbers of samples: [696, 2423]

In [None]:
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import pandas as pd

In [None]:
import pandas as pd

# Initialize the SVM model
svm_model = SVC(C=100)

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, filtered_test_df['class_name'], cv=5)
    print(f"Cross-Validation Scores: {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, filtered_test_df['class_name'])

    # Predict the class labels for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    unlabeled_data = pd.DataFrame(X_unlabeled.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, unlabeled_data])
    y_combined = np.hstack((filtered_test_df['class_name'], pseudo_labels))

    # Convert documents to lowercase for string columns
    X_combined_lower = X_combined.applymap(lambda x: x.lower() if isinstance(x, str) else x)

    # Convert X_combined_lower to a dense numpy array
    X_combined_lower_dense = X_combined_lower.to_numpy()

    # Train an unsupervised model on the combined labeled and pseudo-labeled data
    semi_supervised_model = LabelSpreading(kernel='knn')
    semi_supervised_model.fit(X_combined_lower_dense, y_combined)

    # Evaluate the semi-supervised model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = semi_supervised_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = semi_supervised_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy: {accuracy}")


Model trained using n-gram range: (1, 1)
Cross-Validation Scores: [0.84948454 0.80412371 0.83505155 0.84090909 0.82438017]
               precision    recall  f1-score   support

     Frontend       0.59      0.97      0.73      7437
      Backend       0.50      0.01      0.01       174
     Security       0.84      0.20      0.32      5799
Documentation       0.95      0.05      0.09       367

     accuracy                           0.61     13777
    macro avg       0.72      0.31      0.29     13777
 weighted avg       0.70      0.61      0.53     13777

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           7235              0       202         0
Documentation      167              1         6         0
Frontend          4629              1      1168         1
Security           329              0        20        18
Accuracy: 0.6113087029106482
Model trained using n-gram range: (1, 2)
Cross-Validation

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

     Frontend       0.58      0.98      0.73      7437
      Backend       0.00      0.00      0.00       174
     Security       0.87      0.18      0.30      5799
Documentation       0.89      0.07      0.12       367

     accuracy                           0.61     13777
    macro avg       0.58      0.31      0.29     13777
 weighted avg       0.70      0.61      0.52     13777

Predicted      Backend  Frontend  Security
Actual                                    
Backend           7296       139         2
Documentation      168         6         0
Frontend          4754      1044         1
Security           329        14        24
Accuracy: 0.6070987878347971
Model trained using n-gram range: (1, 3)
Cross-Validation Scores: [0.83298969 0.80824742 0.82061856 0.82231405 0.82231405]


MemoryError: Unable to allocate 3.03 GiB for an array with shape (25123, 16200) and data type float64

In [None]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import pandas as pd

# Initialize the SVM model
svm_model = SVC(C=100)

# Initialize the LabelPropagation model
label_prop_model = LabelPropagation()

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, filtered_test_df['class_name'], cv=5)
    print(f"Cross-Validation Scores (SVM): {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, filtered_test_df['class_name'])

    # Predict the class labels for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    unlabeled_data = pd.DataFrame(X_unlabeled.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, unlabeled_data])
    y_combined = np.hstack((filtered_test_df['class_name'], pseudo_labels))

    # Train the LabelPropagation model on the combined data
    label_prop_model.fit(X_combined, y_combined)

    # Evaluate the LabelPropagation model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = label_prop_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = label_prop_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy (LabelPropagation): {accuracy}")


Model trained using n-gram range: (1, 1)
Cross-Validation Scores (SVM): [0.84948454 0.80412371 0.83505155 0.84090909 0.82438017]




               precision    recall  f1-score   support

     Frontend       0.84      0.90      0.87      7437
      Backend       0.80      0.05      0.09       174
     Security       0.84      0.82      0.83      5799
Documentation       0.92      0.16      0.27       367

     accuracy                           0.84     13777
    macro avg       0.85      0.48      0.52     13777
 weighted avg       0.84      0.84      0.83     13777

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           6717              1       717         2
Documentation      122              8        44         0
Frontend          1028              1      4767         3
Security           156              0       152        59




Accuracy (LabelPropagation): 0.838426362778544
Model trained using n-gram range: (1, 2)
Cross-Validation Scores (SVM): [0.84948454 0.82061856 0.82061856 0.83884298 0.82438017]




               precision    recall  f1-score   support

     Frontend       0.84      0.90      0.87      7437
      Backend       1.00      0.02      0.04       174
     Security       0.84      0.83      0.83      5799
Documentation       0.91      0.14      0.24       367

     accuracy                           0.84     13777
    macro avg       0.90      0.47      0.50     13777
 weighted avg       0.84      0.84      0.83     13777

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           6730              0       703         4
Documentation      131              4        39         0
Frontend          1009              0      4789         1
Security           162              0       155        50




Accuracy (LabelPropagation): 0.8400232271176599
Model trained using n-gram range: (1, 3)
Cross-Validation Scores (SVM): [0.83298969 0.80824742 0.82061856 0.82231405 0.82231405]


MemoryError: Unable to allocate 1.96 GiB for an array with shape (16200, 16200) and data type float64

In [None]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import pandas as pd

# Initialize the SVM model
svm_model = SVC(C=100)

# Initialize the LabelPropagation model
label_prop_model = LabelPropagation()

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, filtered_test_df['class_name'], cv=5)
    print(f"Cross-Validation Scores (SVM): {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, filtered_test_df['class_name'])

    # Predict the class labels for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    unlabeled_data = pd.DataFrame(X_unlabeled.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, unlabeled_data])
    y_combined = np.hstack((filtered_test_df['class_name'], pseudo_labels))

    # Train the LabelPropagation model on the combined data
    label_prop_model.fit(X_combined, y_combined)

    # Evaluate the LabelPropagation model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = label_prop_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = label_prop_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy (LabelPropagation): {accuracy}")


In [35]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
import pandas as pd
from scipy.sparse import csr_matrix

# Initialize the SVM model
svm_model = SVC(C=100)

# Initialize the LabelPropagation model
label_prop_model = LabelPropagation()

# Initialize a DataFrame to store the sampled data
sampled_data = pd.DataFrame(columns=['bug_description', 'class_name'])

# Sample 100 samples from each class
for class_name in filtered_train_df['class_name'].unique():
    class_samples = filtered_train_df[filtered_train_df['class_name'] == class_name].sample(n=100, random_state=42)
    sampled_data = sampled_data.append(class_samples)

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, sampled_data['class_name'], cv=5)
    print(f"Cross-Validation Scores (SVM): {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, sampled_data['class_name'])

    # Predict the class labels for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    unlabeled_data = pd.DataFrame(X_unlabeled.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, unlabeled_data])
    y_combined = np.hstack((sampled_data['class_name'], pseudo_labels))

    # Train the LabelPropagation model on the combined data
    label_prop_model.fit(X_combined, y_combined)

    # Evaluate the LabelPropagation model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = label_prop_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = label_prop_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy (LabelPropagation): {accuracy}")


Model trained using n-gram range: (1, 1)
Cross-Validation Scores (SVM): [0.76190476 0.73809524 0.7047619  0.8        0.71428571]




              precision    recall  f1-score   support

    Frontend       0.88      0.77      0.82      7437
     Backend       0.79      0.79      0.79      5799
    Security       0.28      0.98      0.44       367

    accuracy                           0.78     13603
   macro avg       0.65      0.85      0.68     13603
weighted avg       0.82      0.78      0.80     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5724      1208       505
Frontend       811      4579       409
Security         2         6       359




Accuracy (LabelPropagation): 0.7837976916856576
Model trained using n-gram range: (1, 2)
Cross-Validation Scores (SVM): [0.73809524 0.75238095 0.71904762 0.8        0.74285714]




              precision    recall  f1-score   support

    Frontend       0.88      0.78      0.83      7437
     Backend       0.80      0.81      0.80      5799
    Security       0.31      0.97      0.47       367

    accuracy                           0.80     13603
   macro avg       0.66      0.85      0.70     13603
weighted avg       0.83      0.80      0.81     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5827      1183       427
Frontend       768      4677       354
Security         2         9       356




Accuracy (LabelPropagation): 0.798353304418143
Model trained using n-gram range: (1, 3)
Cross-Validation Scores (SVM): [0.73809524 0.75714286 0.72380952 0.8        0.72857143]




              precision    recall  f1-score   support

    Frontend       0.89      0.78      0.83      7437
     Backend       0.80      0.80      0.80      5799
    Security       0.28      0.97      0.44       367

    accuracy                           0.79     13603
   macro avg       0.66      0.85      0.69     13603
weighted avg       0.83      0.79      0.81     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5790      1156       491
Frontend       743      4647       409
Security         3         8       356




Accuracy (LabelPropagation): 0.793427920311696
Model trained using n-gram range: (1, 4)
Cross-Validation Scores (SVM): [0.73333333 0.75238095 0.72380952 0.78571429 0.73333333]




              precision    recall  f1-score   support

    Frontend       0.89      0.78      0.83      7437
     Backend       0.80      0.79      0.80      5799
    Security       0.27      0.97      0.43       367

    accuracy                           0.79     13603
   macro avg       0.65      0.85      0.68     13603
weighted avg       0.83      0.79      0.81     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5803      1130       504
Frontend       748      4605       446
Security         2         8       357




Accuracy (LabelPropagation): 0.7913695508343748
Model trained using n-gram range: (1, 5)
Cross-Validation Scores (SVM): [0.71904762 0.75238095 0.72380952 0.78095238 0.73333333]


KeyboardInterrupt: 

In [None]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Initialize the SVM model
svm_model = SVC(C=100, probability=True)

# Initialize the LabelPropagation model
label_prop_model = LabelPropagation()

# Initialize a DataFrame to store the sampled data
sampled_data = pd.DataFrame(columns=['bug_description', 'class_name'])

# Sample 100 samples from each class
for class_name in filtered_train_df['class_name'].unique():
    class_samples = filtered_train_df[filtered_train_df['class_name'] == class_name].sample(n=350, random_state=42)
    sampled_data = pd.concat([sampled_data, class_samples])

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, sampled_data['class_name'], cv=5)
    print(f"Cross-Validation Scores (SVM): {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, sampled_data['class_name'])

    # Predict the class labels and probabilities for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)
    pseudo_proba = svm_model.predict_proba(X_unlabeled)

    # Filter high-confidence predictions
    threshold = 0.9
    high_confidence_indices = np.max(pseudo_proba, axis=1) > threshold
    X_unlabeled_filtered = X_unlabeled[high_confidence_indices]
    pseudo_labels_filtered = pseudo_labels[high_confidence_indices]

    # Convert the sparse matrix X_unlabeled_filtered to a DataFrame
    X_unlabeled_filtered_df = pd.DataFrame(X_unlabeled_filtered.toarray(), columns=vectorizer.get_feature_names_out())

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, X_unlabeled_filtered_df])
    y_combined = np.hstack((sampled_data['class_name'], pseudo_labels_filtered))

    # Train the LabelPropagation model on the combined data
    label_prop_model.fit(X_combined, y_combined)

    # Evaluate the LabelPropagation model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = label_prop_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = label_prop_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy (LabelPropagation): {accuracy}")


In [36]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Initialize the SVM model
svm_model = SVC(C=100, probability=True)

# Initialize the LabelPropagation model
label_prop_model = LabelPropagation()

# Initialize a DataFrame to store the sampled data
sampled_data = pd.DataFrame(columns=['bug_description', 'class_name'])

# Sample 100 samples from each class
for class_name in filtered_train_df['class_name'].unique():
    class_samples = filtered_train_df[filtered_train_df['class_name'] == class_name].sample(n=350, random_state=42)
    sampled_data = pd.concat([sampled_data, class_samples])

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the SVM model
    scores = cross_val_score(svm_model, data, sampled_data['class_name'], cv=5)
    print(f"Cross-Validation Scores (SVM): {scores}")

    # Fit the SVM model on the entire training data
    svm_model.fit(data, sampled_data['class_name'])

    # Predict the class labels and probabilities for the unlabeled data
    X_unlabeled = vectorizer.transform(filtered_train_df['bug_description'])
    pseudo_labels = svm_model.predict(X_unlabeled)
    pseudo_proba = svm_model.predict_proba(X_unlabeled)

    # Filter high-confidence predictions
    threshold = 0.9
    high_confidence_indices = np.max(pseudo_proba, axis=1) > threshold
    X_unlabeled_filtered = X_unlabeled[high_confidence_indices]
    pseudo_labels_filtered = pseudo_labels[high_confidence_indices]

    # Convert the sparse matrix X_unlabeled_filtered to a DataFrame
    X_unlabeled_filtered_df = pd.DataFrame(X_unlabeled_filtered.toarray(), columns=vectorizer.get_feature_names_out())

    # Combine the labeled and pseudo-labeled data
    labeled_data = pd.DataFrame(data.todense(), columns=vectorizer.get_feature_names_out())
    X_combined = pd.concat([labeled_data, X_unlabeled_filtered_df])
    y_combined = np.hstack((sampled_data['class_name'], pseudo_labels_filtered))

    # Train the LabelPropagation model on the combined data
    label_prop_model.fit(X_combined, y_combined)

    # Evaluate the LabelPropagation model on the labeled test data
    X_test_transformed = vectorizer.transform(filtered_train_df['bug_description'])
    y_pred = label_prop_model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_train_df['class_name'], y_pred, target_names=filtered_train_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_train_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = label_prop_model.score(X_test_transformed, filtered_train_df['class_name'])
    print(f"Accuracy (LabelPropagation): {accuracy}")


Model trained using n-gram range: (1, 1)
Cross-Validation Scores (SVM): [0.76190476 0.73809524 0.7047619  0.8        0.71428571]




              precision    recall  f1-score   support

    Frontend       0.88      0.77      0.82      7437
     Backend       0.79      0.79      0.79      5799
    Security       0.29      0.98      0.45       367

    accuracy                           0.79     13603
   macro avg       0.65      0.85      0.69     13603
weighted avg       0.82      0.79      0.80     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5725      1218       494
Frontend       804      4602       393
Security         2         6       359




Accuracy (LabelPropagation): 0.7855620083805043
Model trained using n-gram range: (1, 2)
Cross-Validation Scores (SVM): [0.73809524 0.75238095 0.71904762 0.8        0.74285714]




              precision    recall  f1-score   support

    Frontend       0.88      0.78      0.83      7437
     Backend       0.80      0.80      0.80      5799
    Security       0.31      0.97      0.47       367

    accuracy                           0.80     13603
   macro avg       0.66      0.85      0.70     13603
weighted avg       0.83      0.80      0.81     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5832      1175       430
Frontend       772      4654       373
Security         3         8       356




Accuracy (LabelPropagation): 0.7970300668970081
Model trained using n-gram range: (1, 3)
Cross-Validation Scores (SVM): [0.73809524 0.75714286 0.72380952 0.8        0.72857143]




              precision    recall  f1-score   support

    Frontend       0.89      0.78      0.83      7437
     Backend       0.80      0.80      0.80      5799
    Security       0.28      0.97      0.44       367

    accuracy                           0.79     13603
   macro avg       0.66      0.85      0.69     13603
weighted avg       0.83      0.79      0.81     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5783      1162       492
Frontend       736      4653       410
Security         3         8       356




Accuracy (LabelPropagation): 0.7933544071160773
Model trained using n-gram range: (1, 4)
Cross-Validation Scores (SVM): [0.73333333 0.75238095 0.72380952 0.78571429 0.73333333]




              precision    recall  f1-score   support

    Frontend       0.89      0.78      0.83      7437
     Backend       0.80      0.80      0.80      5799
    Security       0.27      0.97      0.43       367

    accuracy                           0.79     13603
   macro avg       0.65      0.85      0.68     13603
weighted avg       0.83      0.79      0.80     13603

Predicted  Backend  Frontend  Security
Actual                                
Backend       5774      1151       512
Frontend       731      4627       441
Security         2         8       357




Accuracy (LabelPropagation): 0.7908549584650445
Model trained using n-gram range: (1, 5)
Cross-Validation Scores (SVM): [0.71904762 0.75238095 0.72380952 0.78095238 0.73333333]


KeyboardInterrupt: 