In [2]:
import pandas as pd

# Load the CSV file.
# Note: This file uses a semicolon (;) as a delimiter, not a comma.
file_path = r"C:\projects\Zaalima_Development_project\bugreports\Eclipse.csv"
df = pd.read_csv(file_path, delimiter=';')

# See the first 5 rows
print("--- First 5 Rows ---")
print(df.head())

# See all the column names
print("\n--- Column Names ---")
print(df.columns)

# See the data summary (to check for missing values, etc.)
print("\n--- Data Info ---")
print(df.info())

--- First 5 Rows ---
    bugID                                                 sd  \
0  550000            PDE quickfix creates invalid @Since tag   
1  550001  Grant access to projects storage service to th...   
2  550002               Add relation information to REST-API   
3  550003  Provide platform independent plug-in to set th...   
4  550004  Inline method refacting reports "Inaccurate re...   

                   cl         pd          co   rp     os        bs  \
0     Eclipse Project        PDE   API Tools   PC  Linux  VERIFIED   
1  Eclipse Foundation  Community  CI-Jenkins   PC  Linux    CLOSED   
2          Automotive      MDMBL     General  All    All    CLOSED   
3     Eclipse Project   Platform          UI   PC  Linux    CLOSED   
4     Eclipse Project        JDT          UI   PC  Linux  RESOLVED   

           rs  pr     bsr  
0       FIXED  P3  normal  
1   DUPLICATE  P3  normal  
2       FIXED  P3   major  
3   DUPLICATE  P3  normal  
4  WORKSFORME  P3  normal  

--- 

In [3]:

df.rename(columns={
    'sd': 'Summary',
    'pd': 'Product',
    'pr': 'Priority'
}, inplace=True)

# Description column is missing so we will use Summary as our main text
df_cleaned = df[['Summary', 'Product', 'Priority']]


print("\n--- Saaf Kiya Hua Data (Cleaned Data) ---")
print(df_cleaned.head())


print("\n--- Cleaned Data Info ---")
print(df_cleaned.info())


--- Saaf Kiya Hua Data (Cleaned Data) ---
                                             Summary    Product Priority
0            PDE quickfix creates invalid @Since tag        PDE       P3
1  Grant access to projects storage service to th...  Community       P3
2               Add relation information to REST-API      MDMBL       P3
3  Provide platform independent plug-in to set th...   Platform       P3
4  Inline method refacting reports "Inaccurate re...        JDT       P3

--- Cleaned Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8478 entries, 0 to 8477
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Summary   8478 non-null   object
 1   Product   8478 non-null   object
 2   Priority  8478 non-null   object
dtypes: object(3)
memory usage: 198.8+ KB
None


In [4]:
# ==============================================================================
# 1. SETUP: Import Libraries and Download NLTK Data
# ==============================================================================
import pandas as pd
import nltk
import re
import joblib
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Download required NLTK data (only needs to be run once)
print("--- Checking NLTK data ---")
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.word_tokenize('test')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')  # Add this line to download punkt_tab
print("NLTK data is ready.")


# ==============================================================================
# 2. DATA LOADING AND CLEANING
# ==============================================================================
print("\n--- Loading and Cleaning Data ---")
# Load the dataset

# Set the correct file path
file_path = os.path.join(os.path.dirname(os.getcwd()), "bugreports", "Eclipse.csv")
if not os.path.exists(file_path):
    # Try alternate path if first one doesn't exist
    file_path = os.path.join(os.getcwd(), "bugreports", "Eclipse.csv")

if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"The file 'Eclipse.csv' was not found at {file_path}.\n"
        "Please ensure the file exists in the bugreports directory."
    )

print(f"Loading data from: {file_path}")
df = pd.read_csv(file_path, delimiter=';')

# Rename the important columns to be more readable
df.rename(columns={
    'sd': 'Summary',
    'pd': 'Product',
    'pr': 'Priority'
}, inplace=True)

# Select only the columns we need and create a clean copy
# Using .copy() here is important to avoid the SettingWithCopyWarning
df_cleaned = df[['Summary', 'Product', 'Priority']].copy()
print("Data has been loaded and initial cleaning is done.")
print(df_cleaned.head())


# ==============================================================================
# 3. NLP PREPROCESSING
# ==============================================================================
# Define the preprocessing function
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and apply stemming
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(processed_tokens)

# Apply the function to the 'Summary' column
print("\n--- Applying NLP Preprocessing ---")
df_cleaned['processed_text'] = df_cleaned['Summary'].apply(preprocess_text)
print("Preprocessing complete.")


# ==============================================================================
# 4. MODEL 1: TASK CLASSIFIER (Product Prediction)
# ==============================================================================
print("\n--- Training Task Classifier Model ---")
# Define features (X) and target (y)
X = df_cleaned['processed_text']
y = df_cleaned['Product']

# Count samples per class
class_counts = y.value_counts()
print("\nProduct distribution:")
print(class_counts)

# Filter out classes with too few samples (less than 5)
min_samples = 5
valid_classes = class_counts[class_counts >= min_samples].index
mask = y.isin(valid_classes)

# Filter X and y to keep only classes with sufficient samples
X_filtered = X[mask]
y_filtered = y[mask]

print(f"\nRemoved {len(y) - len(y_filtered)} samples from classes with less than {min_samples} instances")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered)

# Create and fit the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=3000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Train the Naive Bayes model
classifier_model = MultinomialNB()
classifier_model.fit(X_train_vec, y_train)
print("Classifier Model has been trained.")

# Evaluate the model
y_pred = classifier_model.predict(X_test_vec)
print("\n--- Classification Report for Product Prediction ---")
print(classification_report(y_test, y_pred, zero_division=0))

# Save the model and vectorizer in the current working directory
model_dir = os.getcwd()
tfidf_path = os.path.join(model_dir, 'tfidf_vectorizer.joblib')
model_path = os.path.join(model_dir, 'classifier_model.joblib')

joblib.dump(tfidf, tfidf_path)
joblib.dump(classifier_model, model_path)
print(f"\nModels saved in: {model_dir}")
print("TF-IDF Vectorizer and Classifier Model have been saved!")

--- Checking NLTK data ---


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A__I\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\A__I\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


NLTK data is ready.

--- Loading and Cleaning Data ---
Loading data from: c:\projects\Zaalima_Development_project\bugreports\Eclipse.csv
Data has been loaded and initial cleaning is done.
                                             Summary    Product Priority
0            PDE quickfix creates invalid @Since tag        PDE       P3
1  Grant access to projects storage service to th...  Community       P3
2               Add relation information to REST-API      MDMBL       P3
3  Provide platform independent plug-in to set th...   Platform       P3
4  Inline method refacting reports "Inaccurate re...        JDT       P3

--- Applying NLP Preprocessing ---
Preprocessing complete.

--- Training Task Classifier Model ---

Product distribution:
Product
Capella             1678
Platform            1455
Community           1107
JDT                  784
Kitalpha             429
                    ... 
WTP Common Tools       1
basyx                  1
STEM                   1
EMFT.Henshin      

In [5]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# --- Model 2: Priority Predictor ---
print("\n--- Training Priority Predictor Model ---")

# The 'df_cleaned' DataFrame is already available from the previous steps

# 1. Define your features (X) and new target (y)
X = df_cleaned['processed_text']
y = df_cleaned['Priority']  # This time the target is Priority

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Load the previously saved TF-IDF Vectorizer
# We use the same vectorizer to keep the features consistent
tfidf = joblib.load('tfidf_vectorizer.joblib')
X_train_vec = tfidf.transform(X_train)
X_test_vec = tfidf.transform(X_test)
print("TF-IDF Vectorizer loaded and data transformed.")

# 4. Train the Random Forest Classifier model
# RandomForest is a good choice for this kind of problem
priority_model = RandomForestClassifier(n_estimators=100, random_state=42)
priority_model.fit(X_train_vec, y_train)
print("Priority Model has been trained.")

# 5. Evaluate the model
y_pred = priority_model.predict(X_test_vec)
print("\n--- Classification Report for Priority Prediction ---")
print(classification_report(y_test, y_pred, zero_division=0))

# 6. Save the priority model
joblib.dump(priority_model, 'priority_model.joblib')
print("\nPriority Model has been saved!")


--- Training Priority Predictor Model ---
TF-IDF Vectorizer loaded and data transformed.
Priority Model has been trained.

--- Classification Report for Priority Prediction ---
              precision    recall  f1-score   support

          P1       0.00      0.00      0.00         8
          P2       1.00      0.08      0.15        24
          P3       0.95      1.00      0.97      1609
          P4       0.20      0.02      0.03        54
          P5       0.00      0.00      0.00         1

    accuracy                           0.95      1696
   macro avg       0.43      0.22      0.23      1696
weighted avg       0.92      0.95      0.93      1696


Priority Model has been saved!
