<a href="https://colab.research.google.com/github/NaziaToma/BugType-and-BugFix-Predictor/blob/main/BugFix_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from google.colab import drive
# Make Google Drive available inside the Colab filesystem
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Locations of the training and test CSV files on the mounted Drive
TRAIN_CSV_PATH = '/content/drive/MyDrive/BugFix_TrainingSet.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/BugFix_Testset.csv'

# Read both splits into DataFrames
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [12]:
# Download the NLTK resources used by the preprocessing function below.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases, while older releases load 'punkt'; requesting both keeps the
# notebook runnable on either.
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab')
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Initialize nltk's lemmatizer (shared by every call to preprocess_text)
lemmatizer = WordNetLemmatizer()

# Defining a function to encapsulate preprocessing
# Lazily-populated cache of the English stopword list (see _english_stopwords)
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a raw text string for vectorization.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize with NLTK,
    drop English stopwords, lemmatize each remaining token.

    Args:
        text: The string to clean.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords; the set is fetched once per call (and loaded once overall)
    # instead of rebuilding the stopword list for every single token
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize each word
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [13]:
# Replace every missing value with an empty string, in both splits
for frame in (train_df, test_df):
    frame.fillna('', inplace=True)

In [14]:
# Columns that must never be folded into the text feature: the target label
# (it would leak the answer into the features if it sat in the sliced range) and
# the columns this notebook derives itself (the positional slice would pick them
# up when this cell is re-run, duplicating the text).
NON_FEATURE_COLUMNS = ('Resolution', 'combined_text', 'processed_text')


def combine_text_columns(df, first_extra_col=4):
    """Build one text string per row from the description, report and extra columns.

    Args:
        df: DataFrame containing 'Short Description' and 'Bug Report' columns.
        first_extra_col: Positional index of the first additional column to
            append (presumably the comment columns -- confirm against the CSV).

    Returns:
        A Series with 'Short Description', 'Bug Report' and every additional
        column from ``first_extra_col`` onward (minus NON_FEATURE_COLUMNS),
        joined by single spaces.
    """
    extra_cols = [col for col in df.columns[first_extra_col:]
                  if col not in NON_FEATURE_COLUMNS]
    combined = df['Short Description'] + ' ' + df['Bug Report']
    if extra_cols:
        # Cast to str so a non-text cell cannot make the join raise
        extras = df[extra_cols].astype(str).apply(lambda row: ' '.join(row), axis=1)
        combined = combined + ' ' + extras
    return combined


# Combine Short Description, bug report, and comments into a single text feature for each dataset
train_df['combined_text'] = combine_text_columns(train_df)
test_df['combined_text'] = combine_text_columns(test_df)

In [15]:
# Run the text-cleaning routine over the combined text of both splits
for frame in (train_df, test_df):
    frame['processed_text'] = frame['combined_text'].apply(preprocess_text)

In [16]:
# The label to predict is the same column in both splits
TARGET_COLUMN = 'Resolution'
y_train = train_df[TARGET_COLUMN]
y_test = test_df[TARGET_COLUMN]

In [17]:
# Seed for the classifier's internal randomness, so repeated runs give the same model
SEED = 42

# Defining a pipeline with TF-IDF Vectorizer and SVM Classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # probability=True makes SVC fit probability estimates using randomized
    # internal data shuffling; random_state pins that so results are repeatable
    ('clf', SVC(kernel='linear', probability=True, random_state=SEED)),
])

In [18]:
# Inputs for fitting and for evaluation
X_train = train_df['processed_text']
X_test = test_df['processed_text']

# Fit on the training split, then predict labels for the test split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       FIXED       0.70      0.83      0.76        53
     INVALID       0.72      0.63      0.67        46
     WONTFIX       0.80      0.73      0.77        45

    accuracy                           0.74       144
   macro avg       0.74      0.73      0.73       144
weighted avg       0.74      0.74      0.73       144

