In [1]:
import pandas as pd
import os
import numpy as np
import string
import re
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# path of training data
train_path = 'train.xlsx'

# path of testing data
test_path = 'test.xlsx'

In [3]:
# show the first 5 rows of the training data
train_df = pd.read_excel(train_path)
print(train_df.head())

# show the first 5 rows of the testing data
test_df = pd.read_excel(test_path)
print(test_df.head())

                                              report class_name  class_index
0  "For any event on my bookmarked projects" opti...    Backend            1
1           Switch to using full l10n id's in urlbar   Frontend            2
2  Consider removing hasicon property to simplify...   Frontend            2
3  Method to obtain current URL from WebBrowserEd...   Frontend            2
4              Fix: migration fails in MS SQL-Server    Backend            1
                                              report class_name  class_index
0  REST API - ability to list sub projects for a ...    Backend            1
1  support selective text on right if set in GNOM...   Frontend            2
2  [meta][userstory] Ship v1 of Pre-populated top...   Frontend            2
3  Include updated_on and passwd_changed_on colum...    Backend            1
4    Problem with email integration to MS Office 365    Backend            1


In [4]:
def convert_lower_case(data):
    return str(data).lower()

In [5]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [6]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [7]:
def remove_numbers(data):
    return re.sub(r'\d+', '', str(data))

In [8]:
def remove_single_characters(tokens):
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [9]:
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [10]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    return data

In [11]:
# print the first report of the training data
print(train_df['report'][0])


"For any event on my bookmarked projects" option not sending notifications for non-member bookmarked projects


In [12]:
# preprocess the first report of the training data
print(preprocess(train_df['report'][0]))


for any event on my bookmarked project option not sending notification for non member bookmarked project


In [13]:
# read the preprocessed data from the new file
preprocessed_train_df = pd.read_csv('preprocessed_train_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_train_df.head())

                                     bug_description class_name
0  for any event on my bookmarked project option ...    Backend
1               switch to using full ln id in urlbar   Frontend
2  consider removing hasicon property to simplify...   Frontend
3  method to obtain current url from webbrowsered...   Frontend
4                fix migration fails in m sql server    Backend


In [14]:
# read the preprocessed data from the new file
preprocessed_test_df = pd.read_csv('preprocessed_test_data2.csv')

# show the first 5 rows of the preprocessed training data
print(preprocessed_test_df.head())

                                     bug_description class_name
0  rest api ability to list sub project for a pro...    Backend
1  support selective text on right if set in gnom...   Frontend
2  meta userstory ship v of pre populated topsite...   Frontend
3  include updated on and passwd changed on colum...    Backend
4         problem with email integration to m office    Backend


In [15]:
def remove_stop_words(data):
    tokens = word_tokenize(data)
    data = ' '.join([i for i in tokens if not i in stop_words])
    return data

# preprocess the first report of the training data
print(preprocess(preprocessed_train_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_train_df['bug_description'][0])))

# preprocess the first report of the testing data
print(preprocess(preprocessed_test_df['bug_description'][0]))

# remove the stop words from the preprocessed data
print(remove_stop_words(preprocess(preprocessed_test_df['bug_description'][0])))

preprocessed_test_df['bug_description'][0]

for any event on my bookmarked project option not sending notification for non member bookmarked project
event bookmarked project option sending notification non member bookmarked project
rest api ability to list sub project for a project
rest api ability list sub project project


'rest api ability to list sub project for a project'

In [16]:
# Convert non-string values to strings in 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: str(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: str(x))

# Remove stop words from 'bug_description' column
preprocessed_train_df['bug_description'] = preprocessed_train_df['bug_description'].apply(lambda x: remove_stop_words(x))
preprocessed_test_df['bug_description'] = preprocessed_test_df['bug_description'].apply(lambda x: remove_stop_words(x))

print( preprocessed_test_df['bug_description'][0] )
print( preprocessed_train_df['bug_description'][0] )


rest api ability list sub project project
event bookmarked project option sending notification non member bookmarked project


In [18]:
# keep only the reports that has class_name of Frontend, Backend, Security, Documentation
# Filter the training data
filtered_train_df = preprocessed_train_df[
    (preprocessed_train_df['class_name'] == 'Frontend') | 
    (preprocessed_train_df['class_name'] == 'Backend') |
    (preprocessed_train_df['class_name'] == 'Security') |
    (preprocessed_train_df['class_name'] == 'Documentation')
]

# Filter the testing data
filtered_test_df = preprocessed_test_df[
    (preprocessed_test_df['class_name'] == 'Frontend') | 
    (preprocessed_test_df['class_name'] == 'Backend') |
    (preprocessed_test_df['class_name'] == 'Security') |
    (preprocessed_test_df['class_name'] == 'Documentation')
]

# Show the first 5 rows of the filtered training data
print("Filtered Training Data:")
print(filtered_train_df.head())

# Show the first 5 rows of the filtered testing data
print("\nFiltered Testing Data:")
print(filtered_test_df.head())


Filtered Training Data:
                                     bug_description class_name
0  event bookmarked project option sending notifi...    Backend
1                     switch using full ln id urlbar   Frontend
2  consider removing hasicon property simplify st...   Frontend
3         method obtain current url webbrowsereditor   Frontend
4                     fix migration fails sql server    Backend

Filtered Testing Data:
                                     bug_description class_name
0          rest api ability list sub project project    Backend
1     support selective text right set gnome setting   Frontend
2  meta userstory ship v pre populated topsites a...   Frontend
3  include updated passwd changed column user api...    Backend
4                   problem email integration office    Backend


In [19]:
# Define the mapping of class names to the desired order
class_name_mapping = {
    'Backend': 1,
    'Frontend': 0,
    'Security': 2
}

# Map class names in both training and testing data to the desired order
filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)

# order them based on the number of class_label
filtered_train_df = filtered_train_df.sort_values(by=['class_label'])
filtered_test_df = filtered_test_df.sort_values(by=['class_label'])

# Print the unique class names in the training data
print(filtered_train_df['class_name'].unique())

# Print the unique class names in the testing data
print(filtered_test_df['class_name'].unique())


['Frontend' 'Backend' 'Security' 'Documentation']
['Frontend' 'Backend' 'Security' 'Documentation']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df['class_label'] = filtered_train_df['class_name'].map(class_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['class_label'] = filtered_test_df['class_name'].map(class_name_mapping)


## Feature Exraction

In [22]:

def try_ngram_combinations(data, ngram_ranges):
    """
    Try different combinations of n-grams using TfidfVectorizer.

    Args:
    - data: The input data to be transformed.
    - ngram_ranges: A list of tuples representing the n-gram ranges to try.

    Returns:
    - A list of tuples containing the transformed data and corresponding vectorizer instances for each n-gram combination.
    """
    results = []

    for ngrams in ngram_ranges:
        print(f"Processing n-gram range: {ngrams}")

        # Initialize the TfidfVectorizer
        vectorizer = TfidfVectorizer(ngram_range=ngrams)

        # Fit and transform the data
        X_transformed = vectorizer.fit_transform(data)

        # Append the transformed data and vectorizer to the results list
        results.append((X_transformed, vectorizer))

    return results

# Example usage:

# all n-gram ranges till (6,7) 
ngram_ranges = [(1, 1), (1, 2), (1, 3) , (1,4) , (1,5) , (1,6) , (1,7) , (1,8) , (1,9) , (1,10) ,(1,11) , (1,12) , (1,13) , (1,14) , (1,15)
                , (2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7) , (2,8) , (2,9) , (2,10) , (2,11) , (2,12) , (2,13) , (2,14) , (2,15) ]


transformed_data = try_ngram_combinations(filtered_train_df['bug_description'], ngram_ranges)

# Print the shape of transformed data for each combination
for data, vectorizer in transformed_data:
    print(f"Shape of transformed data for n-gram range {vectorizer.ngram_range}: {data.shape}")


Processing n-gram range: (1, 1)
Processing n-gram range: (1, 2)
Processing n-gram range: (1, 3)
Processing n-gram range: (1, 4)
Processing n-gram range: (1, 5)
Processing n-gram range: (1, 6)
Processing n-gram range: (1, 7)
Processing n-gram range: (1, 8)
Processing n-gram range: (1, 9)
Processing n-gram range: (1, 10)
Processing n-gram range: (1, 11)
Processing n-gram range: (1, 12)
Processing n-gram range: (1, 13)
Processing n-gram range: (1, 14)
Processing n-gram range: (1, 15)
Processing n-gram range: (2, 2)
Processing n-gram range: (2, 3)
Processing n-gram range: (2, 4)
Processing n-gram range: (2, 5)
Processing n-gram range: (2, 6)
Processing n-gram range: (2, 7)
Processing n-gram range: (2, 8)
Processing n-gram range: (2, 9)
Processing n-gram range: (2, 10)
Processing n-gram range: (2, 11)
Processing n-gram range: (2, 12)
Processing n-gram range: (2, 13)
Processing n-gram range: (2, 14)
Processing n-gram range: (2, 15)
Shape of transformed data for n-gram range (1, 1): (13777, 7

In [23]:
# print the vector representation of the first report
print(transformed_data[0][0][0])

  (0, 6367)	0.396556165255602
  (0, 1009)	0.48935869459498377
  (0, 6719)	0.5879602895701754
  (0, 3518)	0.5075174614834652


In [24]:
# Print the number of unique class_name in the training data
print(filtered_train_df['class_name'].nunique())

# print their unique values
print(filtered_train_df['class_name'].unique())

# print the number of reports in each class
print(filtered_train_df['class_name'].value_counts())



4
['Frontend' 'Backend' 'Security' 'Documentation']
Backend          7437
Frontend         5799
Security          367
Documentation     174
Name: class_name, dtype: int64


In [26]:
# Create a mapping between cluster labels and class names
cluster_class_mapping = {
    1: 'Backend',  # Example mapping, adjust based on your actual clusters
    0: 'Frontend',
    2: 'Security', 
    3: 'Documentation'
}


In [29]:
# let's use supervised learning to predict the class_name of the reports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Fit the model on the training data => try all the transformed data one by one
for data, vectorizer in transformed_data:
    model.fit(data, filtered_train_df['class_name'])
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Predict the class labels for the testing data
    X_test_transformed = vectorizer.transform(filtered_test_df['bug_description'])
    y_pred = model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_test_df['class_name'], y_pred, target_names=filtered_test_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_test_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = model.score(X_test_transformed, filtered_test_df['class_name'])
    print(f"Accuracy: {accuracy}")

    

Model trained using n-gram range: (1, 1)
               precision    recall  f1-score   support

     Frontend       0.88      0.92      0.90      1345
      Backend       1.00      0.10      0.17        21
     Security       0.86      0.88      0.87       987
Documentation       0.87      0.29      0.43        70

     accuracy                           0.87      2423
    macro avg       0.90      0.54      0.59      2423
 weighted avg       0.88      0.87      0.87      2423

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           1232              0       113         0
Documentation       14              2         5         0
Frontend           119              0       865         3
Security            30              0        20        20
Accuracy: 0.8745356995460173
Model trained using n-gram range: (1, 2)
               precision    recall  f1-score   support

     Frontend       0.89      0.92      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 3)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.91      0.83       987
Documentation       0.88      0.10      0.18        70

     accuracy                           0.83      2423
    macro avg       0.63      0.46      0.47      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1109       236         0
Documentation        8        13         0
Frontend            84       902         1
Security            19        44         7
Accuracy: 0.8328518365662402


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 4)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.87      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.83       987
Documentation       1.00      0.09      0.16        70

     accuracy                           0.83      2423
    macro avg       0.67      0.46      0.46      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1108       237         0
Documentation        8        13         0
Frontend            83       904         0
Security            17        47         6
Accuracy: 0.8328518365662402


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 5)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.83       987
Documentation       1.00      0.09      0.16        70

     accuracy                           0.83      2423
    macro avg       0.67      0.46      0.46      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1105       240         0
Documentation        8        13         0
Frontend            82       905         0
Security            16        48         6
Accuracy: 0.8320264135369376


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 6)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.82       987
Documentation       1.00      0.09      0.16        70

     accuracy                           0.83      2423
    macro avg       0.67      0.46      0.46      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1104       241         0
Documentation        8        13         0
Frontend            83       904         0
Security            16        48         6
Accuracy: 0.8312009905076352


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 7)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.82       987
Documentation       1.00      0.09      0.16        70

     accuracy                           0.83      2423
    macro avg       0.66      0.46      0.46      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1102       243         0
Documentation        8        13         0
Frontend            83       904         0
Security            16        48         6
Accuracy: 0.8303755674783326


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 8)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.82       987
Documentation       1.00      0.07      0.13        70

     accuracy                           0.83      2423
    macro avg       0.66      0.45      0.45      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1099       246         0
Documentation        8        13         0
Frontend            83       904         0
Security            17        48         5
Accuracy: 0.8287247214197276


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model trained using n-gram range: (2, 9)
               precision    recall  f1-score   support

     Frontend       0.91      0.82      0.86      1345
      Backend       0.00      0.00      0.00        21
     Security       0.75      0.92      0.82       987
Documentation       1.00      0.07      0.13        70

     accuracy                           0.83      2423
    macro avg       0.66      0.45      0.45      2423
 weighted avg       0.84      0.83      0.82      2423

Predicted      Backend  Frontend  Security
Actual                                    
Backend           1098       247         0
Documentation        8        13         0
Frontend            82       905         0
Security            17        48         5
Accuracy: 0.8287247214197276


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [32]:
# let's use supervised learning to predict the class_name of the reports
# use SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Initialize the 
model = SVC(C = 100)

# Fit the model on the training data => try all the transformed data one by one
for data, vectorizer in transformed_data:
    model.fit(data, filtered_train_df['class_name'])
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Predict the class labels for the testing data
    X_test_transformed = vectorizer.transform(filtered_test_df['bug_description'])
    y_pred = model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_test_df['class_name'], y_pred, target_names=filtered_test_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_test_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = model.score(X_test_transformed, filtered_test_df['class_name'])
    print(f"Accuracy: {accuracy}")

    

KeyboardInterrupt: 

In [33]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import pandas as pd

# Initialize the SVM model
model = SVC(C = 100)

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Perform cross-validation to evaluate the model
    scores = cross_val_score(model, data, filtered_train_df['class_name'], cv=5)
    print(f"Cross-Validation Scores: {scores}")

    # Fit the model on the entire training data
    model.fit(data, filtered_train_df['class_name'])

    # Predict the class labels for the testing data
    X_test_transformed = vectorizer.transform(filtered_test_df['bug_description'])
    y_pred = model.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_test_df['class_name'], y_pred, target_names=filtered_test_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_test_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = model.score(X_test_transformed, filtered_test_df['class_name'])
    print(f"Accuracy: {accuracy}")


Model trained using n-gram range: (1, 1)
Cross-Validation Scores: [0.88134978 0.89187228 0.88421053 0.87876588 0.8907441 ]
               precision    recall  f1-score   support

     Frontend       0.92      0.92      0.92      1345
      Backend       1.00      0.43      0.60        21
     Security       0.87      0.91      0.89       987
Documentation       0.86      0.54      0.67        70

     accuracy                           0.90      2423
    macro avg       0.91      0.70      0.77      2423
 weighted avg       0.90      0.90      0.90      2423

Predicted      Backend  Documentation  Frontend  Security
Actual                                                   
Backend           1235              0       107         3
Documentation        7              9         5         0
Frontend            87              0       897         3
Security            14              0        18        38
Accuracy: 0.8992983904250929
Model trained using n-gram range: (1, 2)
Cross-Validation

KeyboardInterrupt: 

In [34]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import BaggingClassifier
import pandas as pd

In [35]:
# Initialize the SVM model
svm_model = SVC()

# Define the parameter grid for GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear']}

# Iterate over each combination of transformed data and vectorizer
for data, vectorizer in transformed_data:
    print(f"Model trained using n-gram range: {vectorizer.ngram_range}")

    # Apply SMOTE to balance the class distribution
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(data, filtered_train_df['class_name'])

    # Initialize GridSearchCV
    grid_search = GridSearchCV(svm_model, param_grid, cv=5)

    # Fit the GridSearchCV to find the best parameters
    grid_search.fit(X_resampled, y_resampled)

    # Print the best parameters
    print("Best parameters found: ", grid_search.best_params_)

    # Initialize base SVM classifier with the best parameters
    best_svm_model = SVC(C=grid_search.best_params_['C'], gamma=grid_search.best_params_['gamma'], kernel=grid_search.best_params_['kernel'])

    # Initialize BaggingClassifier with base SVM classifier
    bagging_clf = BaggingClassifier(best_svm_model, n_estimators=10, random_state=42)

    # Fit the BaggingClassifier on resampled data
    bagging_clf.fit(X_resampled, y_resampled)

    # Predict the class labels for the testing data
    X_test_transformed = vectorizer.transform(filtered_test_df['bug_description'])
    y_pred = bagging_clf.predict(X_test_transformed)

    # Print the classification report
    print(classification_report(filtered_test_df['class_name'], y_pred, target_names=filtered_test_df['class_name'].unique()))

    # Print the confusion matrix
    print(pd.crosstab(filtered_test_df['class_name'], y_pred, rownames=['Actual'], colnames=['Predicted']))

    # Print the accuracy
    accuracy = bagging_clf.score(X_test_transformed, filtered_test_df['class_name'])
    print(f"Accuracy: {accuracy}")


Model trained using n-gram range: (1, 1)
