In [None]:
#Import Python Libraries
import pandas as pd
import pickle
import time
import numpy as np
import matplotlib.pyplot as plt


#Import Self-written Functions
import os
import sys
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
sys.path.append(src_dir)
print(src_dir)

from load_data.data_loader import load_data_from_db,remove_illegal_chars
from clean_data.cleanCommitData import cleanCommitData
from clean_data.cleanJiraData import cleanJiraData
from clean_data.subsetAccordTime import subsetAccordTime
from clean_data.checkValidityTrace import checkValidityTrace
from clean_data.createCorpusFromDocumentList import createCorpusFromDocumentList

from features_engineering.calculateTimeDifference import *
from features_engineering.checkAuthorMatch import checkAuthorMatch

from model_similarity.embedding_choice import *
from model_similarity.createFittedTF_IDF import createFittedTF_IDF,calculateCosineSimilarity
from model_similarity.wordToVec import calculate_word2vec_similarity
from model_similarity.fastText import calculate_fasttext_similarity
from model_similarity.sentenceTransformer import *
from model_similarity.openAI import *


#Display full value of a column (long text cells are not truncated)
pd.set_option('display.max_colwidth', None)

#Display all columns and all rows: None removes the pandas display limits,
#so displaying a large frame renders it in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Marker that the import/configuration cell ran to completion
print("done")

In [2]:
#Project to process. The name is interpolated into the raw-data, intermediate,
#feature and result paths built in later cells, so set it here before running them.
project='derby'        #one of: 'derby','flink','hbase','kafka','pig','switchyard','teiid','zookeeper'

# 1. Load Jira & SVN Data

In [3]:
#Per-project directory for intermediate datasets; created if it does not exist yet
intermediate_dir = f"../data/intermediate/{project}"
os.makedirs(intermediate_dir, exist_ok=True)

In [None]:
#Load the raw commit, issue and issue-commit link tables of the selected project
#from its SEOSS SQLite file.
commit_df, issue_df, link_df = load_data_from_db(f"../data/raw_data/SEOSS/dataverse_files/{project}.sqlite3")

#Persist the raw link table as Excel and pickle.
#Both files are overwritten again by the time-interval subset cell (section 2.3).
link_df.to_excel(excel_writer = f"{intermediate_dir}/link_df.xlsx", index = False)
link_df.to_pickle(path= f"{intermediate_dir}/link_df.pkl")

# 2. Data processing

## 2.1 Clean Raw Data - Commits

Clean the raw data of the SVN table

In [None]:
#Start timer
startTime = time.time() 

#Clean the raw commit table (the cleaning rules live in clean_data.cleanCommitData)
commit_df_clean = cleanCommitData(commit_df)

#Persist the cleaned commits as Excel and pickle.
#Both files are rewritten by the time-interval subset cell (section 2.3).
commit_df_clean.to_excel(excel_writer = f"{intermediate_dir}/commit_df_clean.xlsx", index = False)
commit_df_clean.to_pickle(path= f"{intermediate_dir}/commit_df_clean.pkl")

#Report elapsed time; the value returned by calculateTimeDifference is concatenated to a str below
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished cleaning after " + timeDifference)

## 2.2 Clean Raw Data - JIRA Data
Clean the raw data of the Jira table

In [None]:
#Clean the raw JIRA issue table (the cleaning rules live in clean_data.cleanJiraData)
jira_df_clean = cleanJiraData(issue_df)

#Persist the cleaned issues. The pickle stores the frame exactly as returned by cleanJiraData;
#the Excel copy is written from the output of remove_illegal_chars
#(presumably removes characters the Excel writer rejects - confirm in load_data.data_loader).
jira_df_clean.to_pickle(path= f"{intermediate_dir}/jira_df_clean.pkl")
jira_df_clean_ri = remove_illegal_chars(jira_df_clean)
jira_df_clean_ri.to_excel(excel_writer = f"{intermediate_dir}/jira_df_clean.xlsx", index = False)

## 2.3 Subset according to the time interval
To reduce the size of the data

In [None]:
#Start timer
startTime = time.time() 
#Subset commits, issues and links with subsetAccordTime.
#NOTE: the three inputs are rebound to the subset results, so re-running this cell
#subsets the already-subset frames rather than the original cleaned data.
#NOTE(review): the unit of interval=1000000 is not visible here - confirm in clean_data.subsetAccordTime.
commit_df_clean,jira_df_clean,link_df=subsetAccordTime(commit_df_clean,jira_df_clean,link_df,interval=1000000)

#Overwrite the XLSX copies of all intermediate datasets with the subset versions
#(the JIRA frame goes through remove_illegal_chars before the Excel export)
commit_df_clean.to_excel(excel_writer = f"{intermediate_dir}/commit_df_clean.xlsx", index = False)
jira_df_clean_ri = remove_illegal_chars(jira_df_clean)
jira_df_clean_ri.to_excel(excel_writer = f"{intermediate_dir}/jira_df_clean.xlsx", index = False)
link_df.to_excel(excel_writer = f"{intermediate_dir}/link_df.xlsx", index = False)


#Overwrite the pickle copies of all intermediate datasets with the subset versions
commit_df_clean.to_pickle(path= f"{intermediate_dir}/commit_df_clean.pkl")
jira_df_clean.to_pickle(path= f"{intermediate_dir}/jira_df_clean.pkl")
link_df.to_pickle(path= f"{intermediate_dir}/link_df.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished subset after " + timeDifference)

## 2.4 Clean Raw Data - Create Corpora
Create the corpora for JIRA Issues

In [98]:
# Build one corpus per JIRA text field
jira_corpus_summary = createCorpusFromDocumentList(jira_df_clean["Summary"])
jira_corpus_description = createCorpusFromDocumentList(jira_df_clean["Description"])

# Combined corpus: summary and description of the same issue joined by a single space
jira_corpus_all = [
    " ".join((summary_doc, description_doc))
    for summary_doc, description_doc in zip(jira_corpus_summary, jira_corpus_description)
]

# Persist the three corpora as intermediate pickles (same order: summary, description, all)
for corpus_path, corpus in (
    (f"{intermediate_dir}/jira_corpus_summary.pkl", jira_corpus_summary),
    (f"{intermediate_dir}/jira_corpus_description.pkl", jira_corpus_description),
    (f"{intermediate_dir}/jira_corpus_all.pkl", jira_corpus_all),
):
    with open(corpus_path, 'wb') as f:
        pickle.dump(corpus, f)

Create the corpora for SVN

In [None]:
#Create the corpus for the commit log messages (column Logs of the cleaned commit table)
svn_corpus_log = createCorpusFromDocumentList(commit_df_clean.Logs)

#Save the corpus as an intermediate pickle so it can be reloaded after a kernel restart
with open(f"{intermediate_dir}/svn_corpus_log.pkl", 'wb') as f:
    pickle.dump(svn_corpus_log, f)

Create cartesian product JIRA x Commits

In [None]:
# Pair every JIRA issue with every commit (full cross join)
cartesian_df = pd.merge(jira_df_clean, commit_df_clean, how='cross')
print(cartesian_df.shape)

# Causality filter: a commit cannot belong to an issue that was created after it,
# so remove every pair whose issue creation date is later than the commit date.
acausal_rows = cartesian_df.loc[
    cartesian_df['Jira_created_date'] > cartesian_df['Commit_date']
].index
cartesian_df = cartesian_df.drop(index=acausal_rows)
print(cartesian_df.shape)

# Persist the filtered candidate pairs as an intermediate pickle
cartesian_df.to_pickle(f"{intermediate_dir}/cartesian_df.pkl")

In [None]:
# Reload the filtered JIRA x commit pairs from the pickle written by the previous cell
# (lets you skip recomputing the cross join)
cartesian_df = pd.read_pickle(f'{intermediate_dir}/cartesian_df.pkl')

# Sanity check: (number of candidate pairs, number of columns)
print(cartesian_df.shape)

Create labels

In [12]:
#Per-project directory for labels and feature pickles; created if it does not exist yet
feature_dir = f"../data/features/{project}"
os.makedirs(feature_dir, exist_ok=True)

In [None]:
# Known (issue_id, commit_hash) links, kept in a set for constant-time membership tests
link_set = set(zip(link_df["issue_id"], link_df["commit_hash"]))

# One-column label frame: is_valid marks which candidate pairs are valid traces.
# The frame inherits the row index of cartesian_df.
labels_df = pd.DataFrame().assign(
    is_valid=cartesian_df.apply(
        lambda pair: checkValidityTrace(pair.Issue_key_jira, pair.commit_hash, link_set),
        axis=1,
    )
)

print("Finished creating labels")

# Persist the labels next to the feature files
labels_df.to_pickle(f"{feature_dir}/labels_df.pkl")

In [None]:
# Sanity check: the row count should equal the number of rows in cartesian_df
print(labels_df.shape)

## 2.5 Prepare models - Word2Vec, FastText

In [None]:
from gensim.models import Word2Vec
from gensim.models import FastText
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw text for embedding training.

    Lower-cases the text, deletes square-bracketed spans, replaces remaining
    bracket characters and every character that is not a-z or whitespace with a
    space, then drops stop words and one-character tokens.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    # "[...]" spans are removed entirely, content included
    without_tags = re.sub(r'\[[^\]]*\]', '', lowered)
    # any leftover bracket characters become separators
    without_brackets = re.sub(r'[{}()\[\]<>]', ' ', without_tags)
    # keep letters and whitespace only
    letters_only = re.sub(r'[^a-z\s]', ' ', without_brackets)
    kept_tokens = [
        token
        for token in letters_only.split()
        if len(token) > 1 and token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Every project whose cleaned data feeds the shared embedding-training corpus
projects = ['derby', 'flink', 'hbase', 'kafka', 'pig', 'switchyard', 'teiid', 'zookeeper']
all_texts = []

# The loop deliberately uses its own names (corpus_project, corpus_dir, corpus_jira_df,
# corpus_commit_df). Iterating with `project` / `intermediate_dir` / `commit_df` would rebind
# the notebook-wide variables of the same name and leave them pointing at the last project in
# the list, so every later cell that builds a path from `project` would silently switch project.
for corpus_project in projects:
    corpus_dir = f"../data/intermediate/{corpus_project}"
    jira_path = os.path.join(corpus_dir, "jira_df_clean.pkl")
    commit_path = os.path.join(corpus_dir, "commit_df_clean.pkl")

    corpus_jira_df = pd.read_pickle(jira_path)
    corpus_commit_df = pd.read_pickle(commit_path)

    # NOTE(review): this cell reads the columns 'summary', 'description' and 'Message', while the
    # corpus cells in section 2.4 read .Summary, .Description and .Logs from the same cleaned
    # frames - confirm which column names the cleaning functions actually produce.
    jira_texts = (
        corpus_jira_df['summary'].fillna('') + ' ' + corpus_jira_df['description'].fillna('')
    ).tolist()
    commit_texts = corpus_commit_df['Message'].fillna('').tolist()

    project_texts = jira_texts + commit_texts
    all_texts.extend(project_texts)

print(f"Loaded texts from {len(projects)} projects. Total texts: {len(all_texts)}")
print("Done,all_texts")

# Skip whitespace-only texts, normalise the rest, then tokenise
cleaned_texts = [clean_text(text) for text in all_texts if text.strip()]
tokenized_texts = [word_tokenize(text) for text in cleaned_texts]
model_dir = "../data/models"
# Make sure the target directory exists before the models are saved into it
os.makedirs(model_dir, exist_ok=True)
print("Done,tokenized_texts")

# Skip-gram (sg=1) Word2Vec, 300-dimensional vectors, trained on the combined corpus
model_1   = Word2Vec(sentences=tokenized_texts, vector_size=300, window=5, min_count=10, workers=4,epochs=30, sg=1)  
model_1.save(f"{model_dir}/word2vec_trained.model")
print("Done,word2vec")
# FastText with the same hyper-parameters
model_2 = FastText(sentences=tokenized_texts,vector_size=300, window=5, min_count=10, workers=4,epochs=30, sg=1)
model_2.save(f"{model_dir}/fasttext_trained.model")

# 3. Feature Engineering

In [3]:
#Run this code block when you've restarted the kernel, and want to use previously gained results.
#It needs `project` to be set (and pandas imported) and rebuilds both directory paths from it.
intermediate_dir = f"../data/intermediate/{project}"
feature_dir = f"../data/features/{project}"
#Load JIRA corpora (pickled by the corpus cells of section 2.4)
jira_corpus_summary = pd.read_pickle(f"{intermediate_dir}/jira_corpus_summary.pkl")
jira_corpus_description = pd.read_pickle(f"{intermediate_dir}/jira_corpus_description.pkl")
jira_corpus_all = pd.read_pickle(f"{intermediate_dir}/jira_corpus_all.pkl")

#Load SVN corpora
svn_corpus_log = pd.read_pickle(f"{intermediate_dir}/svn_corpus_log.pkl")

#Load clean datasets.
#Note: the cleaned commit table is bound to svn_df_clean here (it is called commit_df_clean
#in section 2); the embedding cells of sections 3.2.4 and 3.2.5 use the svn_df_clean name.
jira_df_clean = pd.read_pickle(f"{intermediate_dir}/jira_df_clean.pkl")
svn_df_clean = pd.read_pickle(f"{intermediate_dir}/commit_df_clean.pkl")

#Load cartesian product JIRA x Commits and the matching labels
cartesian_df = pd.read_pickle(f"{intermediate_dir}/cartesian_df.pkl")
labels_df=pd.read_pickle(f"{feature_dir}/labels_df.pkl")

## 3.1 Create Non-textual Features

In [None]:
#Create a new dataFrame for the process-related (non-textual) features
features_process_related = pd.DataFrame() 

#f1: check whether the assignee/reporter name (in jira) matches the author name (in svn)
features_process_related['f1_assignee_is_commiter'] = cartesian_df.apply(lambda x: checkAuthorMatch(x.Author, x.Email,x.assignee,x.assignee_username,x.reporter,x.reporter_username), axis=1)

#f2-f4: time difference between the commit date and the issue's created / updated / resolved date
#(calculateTimeDif comes from the star import of features_engineering.calculateTimeDifference)
features_process_related['f2_timedif_issuecreation_and_commitcreation'] = cartesian_df.apply(lambda x: calculateTimeDif(x.Jira_created_date, x.Commit_date), axis=1)
features_process_related['f3_timedif_issueupdated_and_commitcreation'] = cartesian_df.apply(lambda x: calculateTimeDif(x.Jira_updated_date, x.Commit_date), axis=1)
features_process_related['f4_timedif_issueresolved_and_commitcreation'] = cartesian_df.apply(lambda x: calculateTimeDif(x.Jira_resolved_date, x.Commit_date), axis=1)
print("Finished data Processing")

#Save the process-related features as a pickle in the feature directory
features_process_related.to_pickle(path= f"{feature_dir}/features_process_related.pkl")

## 3.2 Create Textual Features - Different Techniques
### 3.2.1 Create TF-IDF (VSM) for the corpus

In [17]:
#CountVectorizer is used by the next cell to build the TF-IDF models of each corpus
from sklearn.feature_extraction.text import CountVectorizer 
#Empty frame that collects the TF-IDF similarity features (f5-f7) column by column.
#Re-running this cell discards any columns already computed in memory.
features_information_retrieval = pd.DataFrame() 

In [18]:
#One CountVectorizer + fitted TF-IDF pair per corpus, built with createFittedTF_IDF.

#SVN log messages
svn_log_countvectorizer = CountVectorizer()
svn_log_tfidf = createFittedTF_IDF(svn_log_countvectorizer, svn_corpus_log)

#JIRA summary + description combined (CountVectorizer defaults, i.e. unigrams)
jira_all_countvectorizer = CountVectorizer()
jira_all_tfidf = createFittedTF_IDF(jira_all_countvectorizer, jira_corpus_all)

#JIRA summary only
jira_summary_countvectorizer = CountVectorizer()
jira_summary_tfidf = createFittedTF_IDF(jira_summary_countvectorizer, jira_corpus_summary)

#JIRA description only
jira_description_countvectorizer = CountVectorizer()
jira_description_tfidf = createFittedTF_IDF(jira_description_countvectorizer, jira_corpus_description)

#### IR Features - Log Message and Summary

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between issue summary and commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the SVN log corpus
features_information_retrieval["f5_log_and_summary_log_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, 
                                                                                                                                 svn_log_countvectorizer, 
                                                                                                                                 svn_log_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")
#features_information_retrieval.to_excel(f"{feature_dir}/features_information_retrieval.xlsx", index=False)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between issue summary and commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the JIRA summary corpus.
#NOTE(review): (x.Summary, x.Logs) is passed in the same order as in the log-as-query cell;
#only the vectorizer/TF-IDF pair differs - confirm calculateCosineSimilarity needs no argument swap.
features_information_retrieval["f5_log_and_summary_summary_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Summary, x.Logs, 
                                                                                                                                    jira_summary_countvectorizer, 
                                                                                                                                    jira_summary_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [21]:
# f5 = arithmetic mean of the two directional summary/log similarities
features_information_retrieval['f5_log_and_summary'] = (
    features_information_retrieval['f5_log_and_summary_log_as_query']
    .add(features_information_retrieval['f5_log_and_summary_summary_as_query'])
    .div(2)
)

# Persist the updated IR feature frame
features_information_retrieval.to_pickle(f"{feature_dir}/features_information_retrieval.pkl")



#### IR Features - Log Message and Description

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between issue description and commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the SVN log corpus
features_information_retrieval["f6_log_and_description_log_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, 
                                                                                                                                 svn_log_countvectorizer, 
                                                                                                                                 svn_log_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between issue description and commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the JIRA description corpus.
#NOTE(review): same argument order as the log-as-query cell; only the vectorizer/TF-IDF pair differs.
features_information_retrieval["f6_log_and_description_description_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Description, x.Logs, 
                                                                                                                                    jira_description_countvectorizer, 
                                                                                                                                    jira_description_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [24]:
# f6 = arithmetic mean of the two directional description/log similarities
features_information_retrieval['f6_log_and_description'] = (
    features_information_retrieval['f6_log_and_description_log_as_query']
    .add(features_information_retrieval['f6_log_and_description_description_as_query'])
    .div(2)
)

# Persist the updated IR feature frame
features_information_retrieval.to_pickle(f"{feature_dir}/features_information_retrieval.pkl")

#### IR Features - Log Message and JIRA All-Natural Text

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between the full JIRA natural text and the commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the SVN log corpus
features_information_retrieval["f7_log_and_jira_all_log_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, 
                                                                                                                                 svn_log_countvectorizer, 
                                                                                                                                 svn_log_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [None]:
#Start timer
startTime = time.time() 

#Cosine similarity between the full JIRA natural text and the commit log for each candidate pair,
#scored with the vectorizer/TF-IDF pair fitted on the combined JIRA corpus.
#NOTE(review): same argument order as the log-as-query cell; only the vectorizer/TF-IDF pair differs.
features_information_retrieval["f7_log_and_jira_all_jira_all_as_query"] = cartesian_df.apply(lambda x: calculateCosineSimilarity(x.Jira_natural_text, x.Logs, 
                                                                                                                                    jira_all_countvectorizer, 
                                                                                                                                    jira_all_tfidf), axis=1)

#Save results in pickle (the same file is rewritten after every new IR feature column)
features_information_retrieval.to_pickle(path= f"{feature_dir}/features_information_retrieval.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

In [27]:
# f7 = arithmetic mean of the two directional JIRA-all/log similarities
features_information_retrieval['f7_log_and_jira_all'] = (
    features_information_retrieval['f7_log_and_jira_all_log_as_query']
    .add(features_information_retrieval['f7_log_and_jira_all_jira_all_as_query'])
    .div(2)
)

# Persist the updated IR feature frame
features_information_retrieval.to_pickle(f"{feature_dir}/features_information_retrieval.pkl")


### 3.2.2 Create Word2Vec Features

In [4]:
#Empty frame that collects the Word2Vec similarity features (f5-f7) column by column.
#Re-running this cell discards any columns already computed in memory.
features_word2vec = pd.DataFrame() 

In [5]:
#Load the Word2Vec model trained and saved in section 2.5.
#NOTE: the name `model` is reused for the FastText model in section 3.2.3, so re-run this
#cell before re-running any Word2Vec feature cell after the FastText section has been executed.
from gensim.models import Word2Vec
model_path = f"../data/models/word2vec_trained.model"
model=Word2Vec.load(model_path)

#### Word2Vec Features - Log Message and Summary

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_word2vec_similarity
batch_size = 8  # Adjust batch size as needed to fit GPU memory

#Similarity between issue summary and commit message for each candidate pair.
#NOTE(review): this section reads the columns 'summary' and 'Message', while the TF-IDF
#section reads x.Summary and x.Logs from the same cartesian_df - confirm the actual column names.
features_word2vec["f5_log_and_summary"] = calculate_word2vec_similarity(cartesian_df['summary'].tolist(), cartesian_df['Message'].tolist(), model,batch_size=batch_size)

#Save results in pickle, under the word2vec/ sub-directory of the feature directory
os.makedirs(f"{feature_dir}/word2vec", exist_ok=True)
features_word2vec.to_pickle(path=f"{feature_dir}/word2vec/features_word2vec.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### Word2Vec Features - Log Message and Description

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_word2vec_similarity
batch_size = 8  # Adjust batch size as needed to fit GPU memory

#Similarity between issue description and commit message for each candidate pair
features_word2vec["f6_log_and_description"] = calculate_word2vec_similarity(cartesian_df['description'].tolist(), cartesian_df['Message'].tolist(), model,batch_size=batch_size)

#Save results in pickle; the word2vec/ sub-directory is created by the f5 cell of this section
features_word2vec.to_pickle(path= f"{feature_dir}/word2vec/features_word2vec.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### Word2Vec Features - Log Message and JIRA All-Natural Text

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_word2vec_similarity
batch_size = 8  # Adjust batch size as needed to fit GPU memory

#Similarity between the full JIRA natural text and the commit message for each candidate pair.
#NOTE(review): the column is spelled 'jira_natual_text' here but Jira_natural_text in the
#TF-IDF section - confirm which spelling the cleaned data actually uses.
features_word2vec["f7_log_and_jira_all"] = calculate_word2vec_similarity(cartesian_df['jira_natual_text'].tolist(), cartesian_df['Message'].tolist(), model,batch_size=batch_size)

#Save results in pickle; the word2vec/ sub-directory is created by the f5 cell of this section
features_word2vec.to_pickle(path= f"{feature_dir}/word2vec/features_word2vec.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

### 3.2.3 Create FastText for the corpus

In [10]:
#Empty frame that collects the FastText similarity features (f5-f7) column by column.
#Re-running this cell discards any columns already computed in memory.
features_fastText = pd.DataFrame() 

In [11]:
#Load the FastText model trained and saved in section 2.5.
#NOTE: this rebinds `model`, which held the Word2Vec model in section 3.2.2.
from gensim.models import FastText
model_path = f"../data/models/fasttext_trained.model"
model=FastText.load(model_path)

#### FastText Features - Log Message and Summary

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_fasttext_similarity
batch_size = 8  # Adjust batch size as needed to fit GPU memory

#Similarity between issue summary and commit message for each candidate pair
features_fastText["f5_log_and_summary"] = calculate_fasttext_similarity(cartesian_df['summary'].tolist(), cartesian_df['Message'].tolist(), model=model,batch_size=batch_size)

#Save results in pickle, under the fast_text/ sub-directory of the feature directory
os.makedirs(f"{feature_dir}/fast_text", exist_ok=True)
features_fastText.to_pickle(path= f"{feature_dir}/fast_text/features_fast_text.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### FastText Features - Log Message and Description

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_fasttext_similarity
batch_size = 8 

#Similarity between issue description and commit message for each candidate pair

features_fastText["f6_log_and_description"] = calculate_fasttext_similarity(cartesian_df['description'].tolist(), cartesian_df['Message'].tolist(), model=model,batch_size=batch_size)

#Save results in pickle; the fast_text/ sub-directory is created by the f5 cell of this section
features_fastText.to_pickle(path= f"{feature_dir}/fast_text/features_fast_text.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### FastText Features - Log Message and JIRA All-Natural Text

In [None]:
#Start timer
startTime = time.time() 

#Batch size forwarded to calculate_fasttext_similarity
batch_size = 8  # Adjust batch size as needed to fit GPU memory

#Similarity between the full JIRA natural text and the commit message for each candidate pair

features_fastText["f7_log_and_jira_all"] = calculate_fasttext_similarity(cartesian_df['jira_natual_text'].tolist(), cartesian_df['Message'].tolist(), model=model,batch_size=batch_size)

#Save results in pickle; the fast_text/ sub-directory is created by the f5 cell of this section
features_fastText.to_pickle(path= f"{feature_dir}/fast_text/features_fast_text.pkl")

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

### 3.2.4 Create SentenceTransformer embeddings

In [39]:
#Empty frame handed to the f5/f6/f7 helpers of this section
features_sentence_transformer = pd.DataFrame() 
#Embedding backend for this section and the directory its embedding pickles are read from.
#NOTE: model_type and embeddings_dir are rebound by the OpenAI section (3.2.5).
model_type='sentence_transformer'
embeddings_dir = f"{feature_dir}/{model_type}"

### Generate embeddings for texts

In [40]:
#Summary embedding (Jira): embeds column 'summary' with the sentence_transformer backend.
#The reload cell below reads the result back from embeddings_dir/embedding_summary.pkl.
embedding_summary_df=process_jira_embeddings(jira_df_clean, 'summary', 'summary_embedding', embeddings_dir,model_type='sentence_transformer')

In [41]:
#Description embedding (Jira): embeds column 'description' with the sentence_transformer backend.
#The reload cell below reads the result back from embeddings_dir/embedding_description.pkl.
embedding_description_df=process_jira_embeddings(jira_df_clean, 'description', 'description_embedding', embeddings_dir,model_type='sentence_transformer')

In [42]:
#All: summary+description (Jira): embeds column 'jira_natual_text' with the sentence_transformer backend.
#The reload cell below reads the result back from embeddings_dir/embedding_jira_natual_text.pkl.
embedding_all_df=process_jira_embeddings(jira_df_clean, 'jira_natual_text', 'jira_natual_text_embedding', embeddings_dir,model_type='sentence_transformer')

In [43]:
#Log Message (SVN): embeds column 'Message' with the sentence_transformer backend.
#svn_df_clean is only defined by the restart/reload cell at the start of section 3.
embedding_message_df=process_svn_embeddings(svn_df_clean, 'Message', 'Message_embedding', embeddings_dir,model_type='sentence_transformer')

In [44]:
#Run this code block when you've restarted the kernel, and want to use previously gained embeddings.
#It reads the four embedding pickles from embeddings_dir, which must point at the
#sentence_transformer directory (set in the setup cell of this section).
embedding_summary_df = pd.read_pickle(f"{embeddings_dir}/embedding_summary.pkl")
embedding_description_df = pd.read_pickle(f"{embeddings_dir}/embedding_description.pkl")
embedding_all_df = pd.read_pickle(f"{embeddings_dir}/embedding_jira_natual_text.pkl")
embedding_message_df = pd.read_pickle(f"{embeddings_dir}/embedding_Message.pkl")

#### SentenceTransformer Features - Log Message and Summary

In [45]:
# Cross join: every JIRA summary embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_summary_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the summary/log feature from the paired embeddings via the f5_log_and_summary helper;
#it receives the feature frame, the feature directory and the backend name
#(presumably it also persists the result - confirm in the helper's module).
f5_log_and_summary(features_sentence_transformer,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### SentenceTransformer Features - Log Message and Description

In [47]:
# Cross join: every JIRA description embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_description_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the description/log feature from the paired embeddings via the f6_log_and_description helper
#(presumably it also persists the result - confirm in the helper's module).
f6_log_and_description(features_sentence_transformer,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### SentenceTransformer - Log Message and JIRA All-Natural Text

In [49]:
# Cross join: every JIRA natural-text embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_all_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the JIRA-all/log feature from the paired embeddings via the f7_log_and_jira_all helper
#(presumably it also persists the result - confirm in the helper's module).
f7_log_and_jira_all(features_sentence_transformer,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

### 3.2.5 Create LLM embeddings

In [52]:
#Empty frame handed to the f5/f6/f7 helpers of this section
features_openai = pd.DataFrame() 
#Embedding backend for this section and the directory its embedding pickles are read from.
#NOTE: this rebinds model_type and embeddings_dir set by the SentenceTransformer section (3.2.4).
model_type='openai'
embeddings_dir = f"{feature_dir}/{model_type}"

### Generate embeddings for texts

In [53]:
#Summary embedding (Jira): embeds column 'summary' with the openai backend.
#The reload cell below reads the result back from embeddings_dir/embedding_summary.pkl.
embedding_summary_df=process_jira_embeddings(jira_df_clean, 'summary', 'summary_embedding', embeddings_dir,model_type='openai')

In [54]:
#Description embedding (Jira): embeds column 'description' with the openai backend.
#The reload cell below reads the result back from embeddings_dir/embedding_description.pkl.
embedding_description_df=process_jira_embeddings(jira_df_clean, 'description', 'description_embedding', embeddings_dir,model_type='openai')

In [55]:
#All: summary+description (Jira): embeds column 'jira_natual_text' with the openai backend.
#The reload cell below reads the result back from embeddings_dir/embedding_jira_natual_text.pkl.
embedding_all_df=process_jira_embeddings(jira_df_clean, 'jira_natual_text', 'jira_natual_text_embedding', embeddings_dir,model_type='openai')

In [56]:
#Log Message (SVN): embeds column 'Message' with the openai backend.
#svn_df_clean is only defined by the restart/reload cell at the start of section 3.
embedding_message_df=process_svn_embeddings(svn_df_clean, 'Message', 'Message_embedding', embeddings_dir,model_type='openai')

In [57]:
#Run this code block when you've restarted the kernel, and want to use previously gained embeddings.
#It reads the four embedding pickles from embeddings_dir, which must point at the
#openai directory (set in the setup cell of this section).
embedding_summary_df = pd.read_pickle(f"{embeddings_dir}/embedding_summary.pkl")
embedding_description_df = pd.read_pickle(f"{embeddings_dir}/embedding_description.pkl")
embedding_all_df = pd.read_pickle(f"{embeddings_dir}/embedding_jira_natual_text.pkl")
embedding_message_df = pd.read_pickle(f"{embeddings_dir}/embedding_Message.pkl")

#### OpenAI Features - Log Message and Summary

In [58]:
# Cross join: every JIRA summary embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_summary_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the summary/log feature from the paired embeddings via the f5_log_and_summary helper
#(presumably it also persists the result - confirm in the helper's module).
f5_log_and_summary(features_openai,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### openAI - Log Message and Description

In [62]:
# Cross join: every JIRA description embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_description_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the description/log feature from the paired embeddings via the f6_log_and_description helper
#(presumably it also persists the result - confirm in the helper's module).
f6_log_and_description(features_openai,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

#### openAI - Log Message and JIRA All-Natural Text

In [65]:
# Cross join: every JIRA natural-text embedding paired with every commit-message embedding
cartesian_embeddings = pd.merge(embedding_all_df, embedding_message_df, how='cross')
# Causality filter: remove pairs whose issue was created after the commit date
acausal_rows = cartesian_embeddings.loc[
    cartesian_embeddings['Jira_created_date'] > cartesian_embeddings['Commit_date']
].index
cartesian_embeddings = cartesian_embeddings.drop(index=acausal_rows)

In [None]:
#Start timer
startTime = time.time() 
#Compute the JIRA-all/log feature from the paired embeddings via the f7_log_and_jira_all helper
#(presumably it also persists the result - confirm in the helper's module).
f7_log_and_jira_all(features_openai,cartesian_embeddings,feature_dir,model_type)

#Report elapsed time
endTime = time.time()
timeDifference = calculateTimeDifference(startTime=startTime, endTime=endTime)
print("Finished after " + timeDifference)

## 3.3 Generate feature sets - Load and transform feature families needed for training
Load features 

In [None]:
#Load Process-Related Features
features_process_related = pd.read_pickle(f"{feature_dir}/features_process_related.pkl")

#Load IR TF-IDF Features (saved directly in feature_dir by section 3.2.1).
#NOTE(review): the evaluation loop in section 4.1 reads every model from
#{feature_dir}/{model_type}/features_{model_type}.pkl, which does not match this location
#for 'information_retrieval' - confirm which layout is intended.
features_information_retrieval = pd.read_pickle(f"{feature_dir}/features_information_retrieval.pkl")

#Load Word2Vec Features - section 3.2.2 saves them under the word2vec/ sub-directory
features_word2vec = pd.read_pickle(f"{feature_dir}/word2vec/features_word2vec.pkl")

#Load fastText Features - section 3.2.3 saves them under the fast_text/ sub-directory
features_fastText= pd.read_pickle(f"{feature_dir}/fast_text/features_fast_text.pkl")

#Load SentenceTransformer Features
#NOTE(review): the save location is decided inside the f5/f6/f7 helpers and is not visible here;
#the evaluation loop expects {feature_dir}/sentence_transformer/features_sentence_transformer.pkl.
features_sentence_transformer= pd.read_pickle(f"{feature_dir}/features_sentence_transformer.pkl")

#Load openAi Features
#NOTE(review): same caveat as above; the evaluation loop expects {feature_dir}/openai/features_openai.pkl.
features_openai= pd.read_pickle(f"{feature_dir}/features_openai.pkl")


# 4. Evaluation

## 4.1 RQ1

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc, fbeta_score,
    f1_score, accuracy_score, precision_score, recall_score, average_precision_score
)
import matplotlib.pyplot as plt
# Input (features) and output (results) directories of the selected project
feature_dir = f"../data/features/{project}"
results_dir = f"../data/results/{project}"
# Embedding/IR techniques to evaluate; each name is used to build the path of its feature pickle
model_types = ['information_retrieval', 'fast_text', 'word2vec', 'sentence_transformer', 'openai']#[
# Similarity feature columns expected in every feature pickle
similarity_cols = ['f5_log_and_summary', 'f6_log_and_description', 'f7_log_and_jira_all']
# Accumulator for evaluation results
results_all = []
def get_best_threshold(y_true, y_scores, beta_list=(1.0,)):
    """Find, for each beta, the score threshold that maximises F-beta on the given data.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_scores : array-like
        Continuous scores (here: similarity values); higher means "more likely positive".
    beta_list : iterable of float, optional
        Beta values to optimise for, e.g. (0.5, 1.0, 2.0). Defaults to F1 only.
        An immutable tuple is used as default so the default can never be shared/mutated.

    Returns
    -------
    dict
        Maps each beta to a dict with the keys 'threshold', 'precision', 'recall' and
        f'F{beta}', all taken at the position of the maximal F-beta value.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    # precision/recall contain one more entry than thresholds (the final point of the curve has
    # no threshold of its own); pad with 1.0 so that all three arrays can share one index.
    thresholds = np.append(thresholds, 1.0)

    best_results = {}
    for beta in beta_list:
        beta_sq = beta ** 2
        # The small constant keeps the division defined where precision and recall are both 0.
        f_beta = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall + 1e-8)
        best_idx = np.argmax(f_beta)
        best_results[beta] = {
            'threshold': thresholds[best_idx],
            'precision': precision[best_idx],
            'recall': recall[best_idx],
            f'F{beta}': f_beta[best_idx]
        }
    return best_results

# Create the output folder up front so a long evaluation run cannot fail at the final save.
os.makedirs(results_dir, exist_ok=True)

# The ground-truth labels are the same for every embedding model, so read them only once.
labels_df = pd.read_pickle(f"{feature_dir}/labels_df.pkl")
# Use the single 1-D target column when the labels are stored as a table.
if isinstance(labels_df, pd.DataFrame) and 'is_valid' in labels_df.columns:
    labels = labels_df['is_valid']
else:
    labels = labels_df

for model_type in model_types:
    print(f"\n====== Evaluating model: {model_type} ======")

    # Load the similarity features of this embedding model.
    # NOTE(review): this path contains a "{model_type}/" sub-folder, while the other cells of the
    # notebook read "{feature_dir}/features_{model_type}.pkl" -- confirm which layout is correct.
    features_df = pd.read_pickle(f"{feature_dir}/{model_type}/features_{model_type}.pkl")

    # Add the row-wise mean of the similarity features as a fourth candidate score
    features_df['avg_similarity'] = features_df[similarity_cols].mean(axis=1)
    all_cols = similarity_cols + ['avg_similarity']

    # Stratified train/test split; the fixed seed gives every model the same split
    X_train, X_test, y_train, y_test = train_test_split(
        features_df[all_cols], labels, test_size=0.2, stratify=labels, random_state=42
    )
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    y_train_values = np.asarray(y_train).ravel()
    y_true = np.asarray(y_test).ravel()

    fig, ax = plt.subplots()

    for col in all_cols:
        print(f"\n--- Evaluating similarity feature: {col} ---")

        # Thresholds are tuned on the training part only (one per beta of get_best_threshold)
        best_thresholds = get_best_threshold(y_train_values, X_train[col].values)

        y_scores = X_test[col].values

        # Ranking metrics are threshold independent and must be computed on the continuous
        # scores; on thresholded 0/1 predictions they collapse to a single operating point.
        ap_score = average_precision_score(y_true, y_scores)
        roc_auc = roc_auc_score(y_true, y_scores)

        for beta, result in best_thresholds.items():
            threshold = result['threshold']
            preds = (y_scores >= threshold).astype(int)

            acc = accuracy_score(y_true, preds)
            prec = precision_score(y_true, preds)
            rec = recall_score(y_true, preds)
            f1 = f1_score(y_true, preds)
            f2 = fbeta_score(y_true, preds, beta=2)
            f05 = fbeta_score(y_true, preds, beta=0.5)

            results_all.append({
                'model_type': model_type,
                'similarity_feature': col,
                'F_type': f'F{beta}',
                'threshold': round(threshold, 4),
                'precision': round(prec, 4),
                'recall': round(rec, 4),
                'accuracy': round(acc, 4),
                'F1': round(f1, 4),
                'F2': round(f2, 4),
                'F0.5': round(f05, 4),
                'avg_precision_score': round(ap_score, 4),
                'ROC_AUC': round(roc_auc, 4),
                'PR_AUC': round(ap_score, 4),
            })

        # PR curve of this feature on the test set; the label reports the matching score-based AP
        precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_scores)
        ax.plot(recall_curve, precision_curve, label=f"{col} (PR AUC={ap_score:.2f})")

    ax.set_title(f"PR Curves - {model_type}")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.legend()
    ax.grid(True)
    plt.show()

# Save results to Excel
results_df = pd.DataFrame(results_all)
results_df.to_excel(f"{results_dir}/local_rq1_test_results.xlsx", index=False)
print(f"All model evaluation results have been saved to: {results_dir}/local_rq1_test_results.xlsx")

## 4.2 RQ2 & RQ3

### ALL: similarity + process-related features

In [None]:
# Folder for trained models of the current project
models_dir = f"../data/models/{project}"
os.makedirs(models_dir, exist_ok=True)
print(models_dir)

# Embedding model whose similarity features are used
# (one of 'information_retrieval', 'fast_text', 'word2vec', 'sentence_transformer', 'openai')
model_type = 'sentence_transformer'

# Folder for the results of the "all features" setting of that embedding model
results_dir = f"../data/results/{project}/all/{model_type}"
os.makedirs(results_dir, exist_ok=True)
print(results_dir)

# Read the three tables and give each a fresh 0..n-1 index so that they line up row by row
features_similarity = pd.read_pickle(f"{feature_dir}/features_{model_type}.pkl").reset_index(drop=True)
features_process = pd.read_pickle(f"{feature_dir}/features_process_related.pkl").reset_index(drop=True)
labels = pd.read_pickle(f"{feature_dir}/labels_df.pkl").reset_index(drop=True)
print(features_similarity.shape,features_process.shape,labels.shape)

# Columns that make up the combined feature set
process_cols = [
    'f1_assignee_is_commiter',
    'f2_timedif_issuecreation_and_commitcreation',
    'f3_timedif_issueupdated_and_commitcreation',
    'f4_timedif_issueresolved_and_commitcreation',
]
similarity_cols = [
    'f5_log_and_summary',
    'f6_log_and_description',
    'f7_log_and_jira_all',
]

# Similarity columns first, process-related columns second, joined side by side
selected_parts = [features_similarity[similarity_cols], features_process[process_cols]]
features_df = pd.concat(selected_parts, axis=1)

### Similarity features only

In [None]:
# Folder for trained models of the current project
models_dir = f"../data/models/{project}"
os.makedirs(models_dir, exist_ok=True)

# Embedding model whose similarity features are used
# (one of 'information_retrieval', 'fast_text', 'word2vec', 'sentence_transformer', 'openai')
model_type = 'openai'

# Folder for the results of the "similarity only" setting of that embedding model
results_dir = f"../data/results/{project}/similarity/{model_type}"
os.makedirs(results_dir, exist_ok=True)

# Read features and labels and give both a fresh 0..n-1 index
features_similarity = pd.read_pickle(f"{feature_dir}/features_{model_type}.pkl").reset_index(drop=True)
labels = pd.read_pickle(f"{feature_dir}/labels_df.pkl").reset_index(drop=True)

# Only the three similarity columns form the feature set in this setting
similarity_cols = [
    'f5_log_and_summary',
    'f6_log_and_description',
    'f7_log_and_jira_all',
]
features_df = features_similarity[similarity_cols]

### Process-related features only

In [None]:
# Folder for trained models of the current project
models_dir = f"../data/models/{project}"
os.makedirs(models_dir, exist_ok=True)

# Folder for the results of the "process-related only" setting
results_dir = f"../data/results/{project}/process"
os.makedirs(results_dir, exist_ok=True)

# Read features and labels and give both a fresh 0..n-1 index
features_process = pd.read_pickle(f"{feature_dir}/features_process_related.pkl").reset_index(drop=True)
labels = pd.read_pickle(f"{feature_dir}/labels_df.pkl").reset_index(drop=True)

print(features_process.shape,labels.shape)

# Only the four process-related columns form the feature set in this setting
process_cols = [
    'f1_assignee_is_commiter',
    'f2_timedif_issuecreation_and_commitcreation',
    'f3_timedif_issueupdated_and_commitcreation',
    'f4_timedif_issueresolved_and_commitcreation',
]
features_df = features_process[process_cols]

In [None]:
# Missing feature values are treated as 0
features_all_df = features_df.fillna(0)
# Column names are kept so the feature importances can be labelled later
feature_name_df = features_all_df.columns.tolist()
# The classifiers are fed plain numpy arrays: one feature matrix and one label vector
features_all_array = features_all_df.to_numpy(copy=True)
labels_array = labels["is_valid"].to_numpy(copy=True)
print(feature_name_df)

In [2]:
#Import Python Libraries
import pandas as pd
import numpy as np


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score

from imblearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
#import lightgbm as lgb

import matplotlib.pyplot as plt
import shap


#Method to show the different model evaluation metrics
def showModelPerformance(trainedModel, testFeatures, testLabels):
    """Evaluate a fitted classifier on a held-out test set.

    Parameters
    ----------
    trainedModel : fitted estimator or pipeline
        Must provide ``predict``; ``predict_proba`` is used as well when it is available.
    testFeatures : array-like
        Feature matrix of the test set.
    testLabels : array-like
        Ground-truth labels of the test set; they are interpreted as booleans.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'Accuracy', 'Precision', 'Recall', 'F1', 'F2',
        'F0.5' and 'Average Precision'.
    """
    # Convert the labels once instead of once per metric
    trueLabels = np.asarray(testLabels).astype(bool)

    # Hard class predictions drive all threshold-based metrics
    predictionLabels = trainedModel.predict(testFeatures)

    # Average precision summarises the whole precision-recall curve and therefore needs
    # ranking scores. Use the positive-class probability when the model offers one and fall
    # back to the hard labels (the previous behaviour) otherwise.
    positiveScores = predictionLabels
    if hasattr(trainedModel, "predict_proba"):
        probabilities = np.asarray(trainedModel.predict_proba(testFeatures))
        if probabilities.ndim == 2 and probabilities.shape[1] == 2:
            positiveScores = probabilities[:, 1]

    #Calculate the different metrics for the test vs predicted labels
    performanceData = {'Accuracy':  [accuracy_score(trueLabels, predictionLabels)],
                       'Precision': [precision_score(trueLabels, predictionLabels, average='binary')],
                       'Recall': [recall_score(trueLabels, predictionLabels)],
                       'F1': [f1_score(trueLabels, predictionLabels)],
                       'F2': [fbeta_score(trueLabels, predictionLabels, beta=2.0)],
                       'F0.5': [fbeta_score(trueLabels, predictionLabels, beta=0.5)],
                       'Average Precision': [average_precision_score(trueLabels, positiveScores)]
                      }
    return pd.DataFrame(performanceData)

#Method to define the Pipeline steps based on the given rebalancing strategy and classification algorithm
def define_steps(rebalancing_strategy, classification_algorithm):
    """Build the pipeline steps for a rebalancing strategy and a classification algorithm.

    Parameters
    ----------
    rebalancing_strategy : str
        'none'  - no resampling,
        'over'  - SMOTE,
        'under' - RandomUnderSampler,
        '5050'  - SMOTE(sampling_strategy=0.5) followed by RandomUnderSampler.
    classification_algorithm : str
        'random_forests' or 'xg_boost'.

    Returns
    -------
    list of [name, estimator] lists
        The resampling steps (if any) followed by the final 'classifier' step. The inner
        items are lists (not tuples) so that callers can replace an estimator in place.

    Raises
    ------
    ValueError
        If the strategy or the algorithm is not one of the supported names. Previously such a
        call silently returned None and only failed later when the pipeline was built.
    """
    supported_strategies = ('none', 'over', 'under', '5050')
    supported_algorithms = ('random_forests', 'xg_boost')

    # Validate first so that a typo is reported with a clear message
    if rebalancing_strategy not in supported_strategies:
        raise ValueError(
            f"Unknown rebalancing_strategy {rebalancing_strategy!r}; expected one of {supported_strategies}"
        )
    if classification_algorithm not in supported_algorithms:
        raise ValueError(
            f"Unknown classification_algorithm {classification_algorithm!r}; expected one of {supported_algorithms}"
        )

    # Resampling steps come first
    steps = []
    if rebalancing_strategy == 'over':
        steps.append(['smote', SMOTE()])
    elif rebalancing_strategy == 'under':
        steps.append(['under', RandomUnderSampler()])
    elif rebalancing_strategy == '5050':
        steps.append(['smote', SMOTE(sampling_strategy = 0.5)])
        steps.append(['under', RandomUnderSampler()])

    # The classifier is always the last step; n_jobs=-1 uses all available cores
    if classification_algorithm == 'random_forests':
        classifier = RandomForestClassifier(n_jobs=-1)
    else:
        classifier = xgb.XGBClassifier(n_jobs=-1)
    steps.append(['classifier', classifier])

    return steps

def get_param_space(algorithm_name):
    """Return the hyper-parameter search space for the 'classifier' pipeline step.

    The keys use the '<step>__<parameter>' form, i.e. they address parameters of the step
    named 'classifier'. For an algorithm name other than "random_forests" or "xg_boost"
    the function returns None.

    NOTE(review): every candidate list is empty as committed. A hyper-parameter search
    needs at least one candidate per parameter, so these lists presumably have to be
    filled in before the search can run -- confirm the intended values.
    """
    # Built on every call so that each caller receives its own, independent lists
    search_spaces = {
        "random_forests": {
            'classifier__n_estimators': [],
            'classifier__max_depth': [],
            'classifier__min_samples_split': [],
        },
        "xg_boost": {
            'classifier__n_estimators': [],
            'classifier__learning_rate': [],
            'classifier__max_depth': [],
        },
    }
    return search_spaces.get(algorithm_name)


#Method to generate the f1, f2, f0.5, accuracy, precision, recall, and average precision
def generate_evaluation_metrics(rebalancing_strategy, classification_algorithm, data, labels, is_normalized, n_runs, feature_names, output_dir=None):
    """Run repeated tune/train/evaluate rounds and write the results to CSV files.

    Each of the ``n_runs`` rounds
      1. draws a new stratified 80/20 train/test split,
      2. tunes the pipeline of ``define_steps`` with a randomized search
         (10-fold stratified CV, scoring 'f1') over ``get_param_space``,
      3. fits a fresh pipeline with the best parameters on the training part,
      4. evaluates it on the test part with ``showModelPerformance`` and records the
         feature importances of the final classifier.

    Parameters
    ----------
    rebalancing_strategy : str
        Passed to ``define_steps``.
    classification_algorithm : str
        Passed to ``define_steps`` and ``get_param_space``.
    data : numpy.ndarray
        Feature matrix; ``data.shape[1]`` is the number of features.
    labels : array-like
        Target labels; also used for stratifying the split.
    is_normalized : bool
        Currently unused; kept so that existing calls keep working.
    n_runs : int
        Number of independent rounds.
    feature_names : list of str
        Column names for the feature-importance table (one per feature).
    output_dir : str, optional
        Folder for the two CSV files. Defaults to the notebook-level ``results_dir``.

    Returns
    -------
    None
        Two files are written to ``output_dir``:
        ``{algorithm}_{strategy}_{n_runs}_fnew_results.csv`` and
        ``{algorithm}_{strategy}_{n_runs}_fnew_feature_importance_results.csv``.
    """
    # Fall back to the results folder configured by the notebook's setup cells
    if output_dir is None:
        output_dir = results_dir

    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'F0.5', 'Average Precision']

    # One single-row frame per run; concatenated once after the loop
    run_evaluations = []

    #Create a np array to put the importances per feature in
    importance_array = np.empty(shape=(n_runs, data.shape[1]))

    #Perform the described pipeline steps to produce the results for the defined number of runs
    for i in range(n_runs):

        # No random_state on purpose: every run gets a different split
        X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        stratify=labels)

        #Create the pipeline according to the rebalancing strategy and classification algorithm
        model_pipeline = Pipeline(steps=define_steps(rebalancing_strategy, classification_algorithm))

        #hyper-parameters
        space = get_param_space(classification_algorithm)

        stratified_kfold = StratifiedKFold(n_splits=10,shuffle=True)
        randomized_search = RandomizedSearchCV(
            estimator=model_pipeline,
            param_distributions=space,
            n_iter=10,  # The number of search rounds
            n_jobs=1,
            cv=stratified_kfold,
            scoring='f1',
            verbose=1,
            random_state=42
        )
        print("model_pipeline",model_pipeline)

        print(f"\n====== The {i+1}th round of model parameter adjustment... ======")
        randomized_search.fit(X_train, y_train)
        best_params = randomized_search.best_params_
        print("Best parameters:", best_params)

        # Build a fresh pipeline and apply the tuned values with set_params. This keeps the
        # constructor arguments chosen in define_steps (e.g. n_jobs=-1), which were lost when
        # the classifier was re-created from the tuned parameters only.
        model = Pipeline(steps=define_steps(rebalancing_strategy, classification_algorithm))
        model.set_params(**best_params)

        #Fit the model on the training data
        fitted_model = model.fit(X_train, y_train)

        #Evaluate the fitted model
        fitted_model_evaluation_df = showModelPerformance(trainedModel = fitted_model,
                         testFeatures = X_test,
                         testLabels = y_test)
        fitted_model_evaluation_df['Best Params'] = str(best_params)
        run_evaluations.append(fitted_model_evaluation_df)

        #Find the feature importances of the fitted model
        if(classification_algorithm == "light_gbm"):
            current_importances = fitted_model.named_steps['classifier'].booster_.feature_importance(importance_type='gain')
        else:
            current_importances = fitted_model.named_steps['classifier'].feature_importances_

        #Add the feature importances of the current fitted model to the results of the previous runs
        importance_array[i] = current_importances

    # Combine all runs at once; with zero runs an empty table with the metric columns is written
    if run_evaluations:
        evaluation_df = pd.concat(run_evaluations, ignore_index=True)
    else:
        evaluation_df = pd.DataFrame({column: [] for column in metric_columns})

    #Set the index as the run number (1-based)
    evaluation_df.index += 1
    evaluation_df.index.name = "run"

    #Output the evaluation data to a csv file
    evaluation_df.to_csv(f"{output_dir}/{classification_algorithm}_{rebalancing_strategy}_{n_runs}_fnew_results.csv")

    #Transform the importance array to a data frame
    importance_df = pd.DataFrame(data=importance_array,
                                 columns= feature_names,
                                 index=list(range(1, n_runs +1)))

    #Set the index as the run number
    importance_df.index.name = "run"

    #Output the importance data to a csv file
    importance_df.to_csv(f"{output_dir}/{classification_algorithm}_{rebalancing_strategy}_{n_runs}_fnew_feature_importance_results.csv")
    

In [None]:
# Random forest without rebalancing: 10 tune/train/evaluate rounds on the selected feature set
generate_evaluation_metrics(
    rebalancing_strategy='none',
    classification_algorithm='random_forests',
    data=features_all_array,
    labels=labels_array,
    is_normalized=False,
    n_runs=10,
    feature_names=feature_name_df,
)

In [None]:
# XGBoost without rebalancing: 10 tune/train/evaluate rounds on the selected feature set
generate_evaluation_metrics(
    rebalancing_strategy='none',
    classification_algorithm='xg_boost',
    data=features_all_array,
    labels=labels_array,
    is_normalized=False,
    n_runs=10,
    feature_names=feature_name_df,
)