# Set the target repository

In [None]:
# Short project name (it becomes part of the result file names) and the URL of the
# git repository that is cloned and analyzed.
project_name = "elasticsearch"
git_repo_url = "https://github.com/elastic/elasticsearch"

# Installing the required libraries

In [None]:
# Upgrade pip itself first, then install the third-party packages imported below.
!pip install --upgrade pip
# nltk provides the tokenizer and the English stopword list.
!pip install --upgrade nltk
# pathvalidate provides is_valid_filepath, used when tagging file names.
!pip install --upgrade pathvalidate

# Importing libraries

In [None]:
import os
import re
import csv
import nltk
import subprocess
from datetime import datetime
from nltk.corpus import stopwords
from pathvalidate import is_valid_filepath

# Define analysis settings

In [None]:
threshold = 0.4  # Commits whose keyword score does not exceed this value are not saved.
save_details = False  # When True, the per-commit score CSV and the details CSV are written as well.

# 1. Preparing data for analysis

## 1.1. Retrieve all commit messages from the specified repository

In [None]:
# Decide whether an existing local clone can be reused or a new clone is needed.
fresh = True
if os.path.exists("repo"):
    print("local 'repo' folder found. reuse [Y/n]? ", end=" ")
    # Surrounding whitespace is ignored; an empty answer selects the default ("yes").
    user_input = input().strip()
    if user_input.lower() in ("y", "yes", ""):
        fresh = False

# Clone the repository when there is no local copy to reuse.
if fresh:
    print("clone into 'repo'")
    if not os.path.exists("repo"):
        os.makedirs("repo")
    # check=True turns a failed clone (network error, non-empty target folder, ...)
    # into a CalledProcessError here, instead of silently continuing and analyzing
    # an empty or stale history in the following cells.
    subprocess.run(["git", "clone", git_repo_url, "repo"], check=True)
else:
    print("reuse local 'repo'")

In [None]:
# Marker strings placed into the `git log` output so that it can later be cut into
# records (line_separator) and into hash / message fields (field_separator).
field_separator = "^@@@^"
line_separator = "^&_@&_@&_@^"
# Per commit: full hash, field separator, subject, newline, body, record separator.
git_format = "".join(["--pretty=format:%H", field_separator, "%s%n%b", line_separator])

In [None]:
# Run `git log` inside the clone and capture its output. check=True makes a failing
# git call raise immediately instead of yielding an empty result.
proc = subprocess.run(["git", "log", git_format], cwd='repo/', stdout=subprocess.PIPE, check=True)
# git writes log output as UTF-8 by default, so decode it as UTF-8; bytes that are
# not valid UTF-8 (e.g. commits stored in a legacy encoding) are replaced rather
# than raising, so decoding can never fail.
raw_data = proc.stdout.decode("utf-8", errors="replace")
# Split the output into one element per commit (hash and commit message).
item_list = raw_data.split(line_separator)

In [None]:
# Build a list of [commit_hash, message] pairs from the raw log records.
res = []
for item in item_list:
    item = item.strip()
    # Split on the FIRST field separator only, so a message that happens to contain
    # the separator text is kept whole instead of being cut off.
    commit_hash, sep, msg = item.partition(field_separator)
    # Skip records that have no separator or an empty hash (e.g. the empty chunk
    # after the last record separator).
    if sep and commit_hash:
        res.append([commit_hash.strip(), msg.strip()])

In [None]:
# Report how many commit messages were loaded from the git log.
dumped_count = len(res)
print("* %d commit messages dumped" % dumped_count)

## 1.2. Filtering some patterns from commit messages

In [None]:
# A line is dropped from a message when it contains any of these (in any letter case):
#   "-by:"              lines such as "Signed-off-by:" that name a person;
#   "cc:"               "Cc:" lines;
#   "http:" / "https:"  lines that carry a link.
# One compiled, case-insensitive pattern replaces three searches per line and also
# covers upper-case "HTTPS:", which the previous character-class pattern missed.
noise_pattern = re.compile(r"-by:|cc:|https?:", re.IGNORECASE)

# Each entry becomes [commit_hash, original message, list of kept lines].
res_filtered = []
for id_, msg_ in res:
    filtered = []
    for sent_ in msg_.split("\n"):
        sent_ = sent_.strip()
        # Keep only non-empty lines that contain none of the noise patterns.
        if sent_ and not noise_pattern.search(sent_):
            filtered.append(sent_)
    res_filtered.append([id_, msg_, filtered])

## 1.3. Tokenize and clean the filtered messages

In [None]:
# Register the local download folder with NLTK (only once, so re-running this cell
# does not keep appending the same entry).
if "nltk_data" not in nltk.data.path:
    nltk.data.path.append("nltk_data")
nltk.download('stopwords', download_dir="nltk_data")
nltk.download('punkt', download_dir="nltk_data")
# Newer NLTK releases load the word tokenizer from the 'punkt_tab' resource instead
# of 'punkt'. This notebook upgrades nltk to the latest version, so fetch it as well;
# otherwise nltk.word_tokenize raises LookupError in the tokenization cell.
nltk.download('punkt_tab', download_dir="nltk_data")

In [None]:
# Loading the stopword list reads the corpus from disk, so build the set once here
# instead of once per commit inside the loop.
stop_words = set(stopwords.words("english"))

# Tokenize the kept lines of every message except the first kept line.
# Each entry becomes [commit_hash, original message, list of tokens].
filtered_token = []
for id_, old, new in res_filtered:
    if len(new) < 2:
        continue  # only one (or no) line survived the filtering: nothing to tokenize
    other_sentence = " ".join(new[1:]).strip().lower()
    # Tokenize the string using the NLTK library.
    tokens = nltk.word_tokenize(other_sentence)
    # Keep purely alphabetic tokens (no punctuation or numbers) that are not
    # stopwords and are at least three characters long.
    tokens = [tok for tok in tokens
              if tok.isalpha() and tok not in stop_words and len(tok) > 2]
    if tokens:
        filtered_token.append([id_, old, tokens])

# 2. Analysis

## 2.1. Read keywords from the file

In [None]:
# keyword_dict maps class name -> {keyword: score}. Scores are stored as the raw CSV
# text; they are converted to float where they are used.
keyword_dict = dict()
# newline="" is the mode the csv module asks for when it is given a file object.
with open("keywords.csv", encoding="utf8", newline="") as csvfile:
    csv_reader = csv.reader(csvfile)
    birth_header = next(csv_reader)  # the first row is the header, not data
    # Start without a class so a stale value from an earlier cell run cannot be reused.
    cur_class = None
    for row in csv_reader:
        if not row:
            continue  # blank line in the file
        if len(row) < 3:
            raise ValueError("keywords.csv line %d: expected class, words and score columns, got %r"
                             % (csv_reader.line_num, row))
        class_, words, score = row[0], row[1], row[2]
        # A non-empty class column starts a new class; rows with an empty class
        # column add more keywords to the most recently started class.
        if class_ != "":
            cur_class = class_
            keyword_dict[cur_class] = dict()
        if cur_class is None:
            raise ValueError("keywords.csv line %d: the first data row must name a class"
                             % csv_reader.line_num)
        # Several keywords sharing one score are separated by "/".
        for each in words.split("/"):
            keyword_dict[cur_class][each] = score

## 2.2. Build the sentences from the pre-processed data

In [None]:
# Join each commit's tokens back into one space-separated string.
# Each entry is [commit_hash, joined token text, original message].
sentences = [
    [commit_id, " ".join(token_list), original_msg]
    for commit_id, original_msg, token_list in filtered_token
]

## 2.3. Compute an initial score for each sentence

In [None]:
# Score every sentence: each keyword that occurs in the token text (substring match)
# adds its score, and the matching class and keyword are recorded.
scores = []
all_original_msgs = []
for commit_id, token_text, original_msg in sentences:
    total_score = 0
    matched_classes = []
    matched_keywords = []
    for class_name, class_keywords in keyword_dict.items():
        for keyword, raw_weight in class_keywords.items():
            weight = float(raw_weight)  # scores are stored as CSV text
            if keyword in token_text:
                total_score += weight
                matched_classes.append(class_name)
                matched_keywords.append(keyword)
    scores.append([commit_id, total_score, matched_classes, matched_keywords, token_text, original_msg])
    all_original_msgs.append(original_msg)

## 2.4. Tag the tokens of each sentence using the keywords

In [None]:
def _tag_token(tok, keyword_dict):
    """
    Wraps a single cleaned, non-empty token in the tag that matches its kind.

    :param tok: Token to classify;
    :param keyword_dict: Mapping of category name to a dictionary of its keywords.
    :return: The token wrapped in a keyword, function, file_name or variable tag,
        or the unchanged token when none of the kinds applies.
    """
    if tok.isalpha():
        # Purely alphabetic tokens can only be keywords.
        category = None
        for each_cate, cur_dict in keyword_dict.items():
            if tok in cur_dict:
                category = each_cate  # the last matching category wins
        if category:
            return "<keyword category=" + category + ">" + tok + "</keyword>"
        return tok
    if "(" in tok and tok.endswith(")"):
        # Looks like a call: an identifier followed by "(...)".
        if tok[: -1].split("(")[0].isidentifier():
            return "<function>" + tok + "</function>"
        return tok
    if "." in tok:
        # Only valid paths with a C/C++ source or header extension count as files.
        if is_valid_filepath(tok):
            _, f_ext = os.path.splitext(tok)
            if f_ext in (".c", ".h", ".cpp", ".hpp"):
                return "<file_name>" + tok + "</file_name>"
        return tok
    if tok.isidentifier():
        return "<variable>" + tok + "</variable>"
    return tok


# all_tagged[i] is a list with one token list per "\n"-separated line of
# all_original_msgs[i]. The lines are kept in a local loop expression so that the
# module-level `sentences` list built earlier is not overwritten by this cell.
all_tagged = []
for each in all_original_msgs:
    new_msg = []
    for each_sentence in each.split("\n"):
        each_sentence = each_sentence.strip().lower()
        new_tokens = []
        for tok in each_sentence.split(" "):
            # Trim whitespace and surrounding punctuation from the token.
            tok = tok.strip("\t\n\r\f\v.:',;")
            if tok == "":
                continue
            new_tokens.append(_tag_token(tok, keyword_dict))
        new_msg.append(new_tokens)

    all_tagged.append(new_msg)

## 2.5. We collect tags and combine them for each message

In [None]:
def get_nearest(key_index, cur_set, set_indexes):
    """
    Returns the element whose index is closest to the specified index.

    The previous implementation compared every distance with the distance of the
    FIRST element only (the running minimum was never updated), so it returned the
    last element that was closer than the first one rather than the nearest one.

    :param key_index: Key element index;
    :param cur_set: Set of elements;
    :param set_indexes: List of indexes of elements from the set (parallel to cur_set).
    :return: The closest element to the specified index, or None when cur_set is
        empty. When several elements are equally close, the first of them is returned.
    """
    if len(cur_set) == 0:
        return None
    if len(cur_set) == 1:
        return cur_set[0]
    # min() keeps the first position on ties.
    nearest = min(range(len(set_indexes)), key=lambda i: abs(key_index - set_indexes[i]))
    return cur_set[nearest]

In [None]:
def clean_variable(tok):
    """Returns the text between the leading '<variable>' tag and the first '</' in tok."""
    start = len('<variable>')
    end = tok.find('</')
    return tok[start:end]
def clean_function(tok):
    """Returns the text between the leading '<function>' tag and the first '</' in tok."""
    start = len('<function>')
    end = tok.find('</')
    return tok[start:end]
def clean_filename(tok):
    """Returns the text between the leading '<file_name>' tag and the first '</' in tok."""
    start = len('<file_name>')
    end = tok.find('</')
    return tok[start:end]

def compute_pattern(key, key_index, variables, variable_indexes, functions, function_indexes, files,
                    file_indexes):
    """
    Builds the pattern for one tagged keyword of a sentence.

    The category and the bare keyword are cut out of the keyword tag, and the
    variable, function and file tokens nearest to the keyword position are looked
    up and stripped of their tags.

    :param key: Tagged keyword token;
    :param key_index: Index of the keyword in the sentence;
    :param variables: List of tagged variable tokens;
    :param variable_indexes: Indexes of the variables in the sentence;
    :param functions: List of tagged function tokens;
    :param function_indexes: Indexes of the functions in the sentence;
    :param files: List of tagged file tokens;
    :param file_indexes: Indexes of the files in the sentence.
    :return: key, category, nearest variable, nearest function, nearest file
        (each of the last three is None when the sentence has no such token).
    """
    open_tag_end = key.find('>')
    category = key[len('<keyword category='): open_tag_end]
    bare_key = key[open_tag_end + 1: key.find('</')]

    # Nearest variable, with its tag removed.
    near_var = get_nearest(key_index, variables, variable_indexes)
    near_var = None if near_var is None else clean_variable(near_var)

    # Nearest function, with its tag removed.
    near_fun = get_nearest(key_index, functions, function_indexes)
    near_fun = None if near_fun is None else clean_function(near_fun)

    # Nearest file, with its tag removed.
    near_file = get_nearest(key_index, files, file_indexes)
    near_file = None if near_file is None else clean_filename(near_file)

    return bare_key, category, near_var, near_fun, near_file

In [None]:
def combine_collected_tags(collected_tags):
    """
    Merges consecutive tags that share the same category and sentence.

    :param collected_tags: List of (keyword, category, sentence, variable, function, file) tags.
    :return: List of [category, sentence, keywords, variables, functions, files]
        entries, one per run of consecutive tags with equal category and sentence.
    """
    merged = []
    group = None  # the entry currently being extended (also the last item of merged)
    for key_word, category, sentence, variable, function, file in collected_tags:
        same_run = group is not None and group[0] == category and group[1] == sentence
        if same_run:
            group[2].append(key_word)
            group[3].append(variable)
            group[4].append(function)
            group[5].append(file)
        else:
            # Category or sentence changed (or this is the first tag): start a new entry.
            group = [category, sentence, [key_word], [variable], [function], [file]]
            merged.append(group)
    return merged

In [None]:
def compute_strings(new_combined):
    """
    Generates strings based on combined tags.

    Duplicates and None entries are removed from the keyword, variable, function
    and file lists while keeping first-seen order. The previous implementation
    de-duplicated through set(), so the order of the items in the generated string
    could change from one interpreter run to the next (string hash randomization).

    :param new_combined: List of [category, sentence, keywords, variables, functions, files] entries.
    :return: List of strings, one per entry. Variable, Function and File sections are
        only included when the corresponding list is non-empty.
    """
    def unique_present(items):
        # dict.fromkeys keeps insertion order, unlike set().
        return [item for item in dict.fromkeys(items) if item is not None]

    string_list = []
    for category, sentence, kws, varis, funs, files in new_combined:
        kws = unique_present(kws)
        varis = unique_present(varis)
        funs = unique_present(funs)
        files = unique_present(files)
        # Category, sentence and keywords are always present.
        cur_str = "<Category: " + category + "> " + "<Sentence: " + sentence + "> " + "<Keyword: " + " ".join(kws) + "> "
        if varis:
            cur_str += "<Variable: " + " ".join(varis) + "> "
        if funs:
            cur_str += "<Function: " + " ".join(funs) + "> "
        if files:
            cur_str += "<File: " + " ".join(files) + ">"
        string_list.append(cur_str)
    return string_list

In [None]:
# Counters of sentences by what they contain: a keyword ("key") and/or any
# variable, function or file token ("other").
statistics = {
    "all_none": 0,
    "nonkey_withother": 0,
    "withkey_nonother": 0,
    "withkey_withother": 0
}
# collected_tags[i] is the list of pattern strings for message i.
collected_tags = []
for msg_index, each_msg in enumerate(all_tagged):
    # all_tagged[msg_index] holds one token list per "\n"-separated line of the
    # original message, so the same split yields the text of each tagged sentence.
    # (Indexing the message string directly would return a single character.)
    original_sentences = all_original_msgs[msg_index].split("\n")
    cur_collected_tags = []
    for sent_index, each_sentence in enumerate(each_msg):
        keywords = []
        variables, functions, files = [], [], []
        keyword_indexes, variable_indexes, function_indexes, file_indexes = [], [], [], []
        # Sort the tagged tokens of the sentence by kind and remember their positions.
        for index, each_tok in enumerate(each_sentence):
            if "<keyword" in each_tok:
                keywords.append(each_tok)
                keyword_indexes.append(index)
            elif "<variable>" in each_tok:
                variables.append(each_tok)
                variable_indexes.append(index)
            elif "<function>" in each_tok:
                functions.append(each_tok)
                function_indexes.append(index)
            elif "<file_name>" in each_tok:
                files.append(each_tok)
                file_indexes.append(index)

        has_other = bool(variables or functions or files)
        if keywords:
            if not has_other:
                statistics["withkey_nonother"] += 1
            else:
                statistics["withkey_withother"] += 1
                ori_sent = original_sentences[sent_index].strip()
                # One pattern per keyword; a single keyword is just the one-element case.
                for key, key_index in zip(keywords, keyword_indexes):
                    key, cate, var, fun, fil = compute_pattern(key, key_index, variables,
                                                                variable_indexes,
                                                                functions, function_indexes, files,
                                                                file_indexes)
                    cur_collected_tags.append((key, cate, ori_sent, var, fun, fil))
        elif has_other:
            statistics["nonkey_withother"] += 1
        else:
            statistics["all_none"] += 1
    new_combined = combine_collected_tags(cur_collected_tags)  # Combining collected tags.
    string_represts = compute_strings(new_combined)  # Getting a string representation of tags.
    collected_tags.append(string_represts)  # Adding a string representation to the list of collected tags.

## 2.6. Save the analysis results

In [None]:
# Date stamp (YYYYMMDD) used as the prefix of every result file name.
timestamp = datetime.now().strftime("%Y%m%d")
output_path = "result"  # Folder for saving results.
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Write the hash of every commit whose score is above the threshold, one per line.
commit_only_path = os.path.join(output_path, "%s_%s_commit_only.txt" % (timestamp, project_name))
selected_hashes = [entry[0] for entry in scores if entry[1] > threshold]
with open(commit_only_path, "w") as fp:
    for selected_hash in selected_hashes:
        fp.write(selected_hash + "\n")
commit_cnt = len(selected_hashes)
print("* %d commits with positive scores identified" % commit_cnt)

In [None]:
# Saving commit hashes and their scores to a CSV file (only when details are requested).
if save_details:
    score_path = os.path.join(output_path, "%s_%s_commit_score.csv" % (timestamp, project_name))
    # newline="" lets the csv module control the line endings itself; without it
    # every row is followed by an empty line on Windows.
    with open(score_path, "w", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(["commit_hash", "score"])
        # Each scores entry starts with the commit hash and its score.
        writer.writerows([entry[0], entry[1]] for entry in scores)

In [None]:
# If the save_details flag is set to True, additional analysis details are saved.
if save_details:
    def remove_repeat(old_list):
        """Returns the elements of old_list without duplicates, in first-seen order."""
        return list(dict.fromkeys(old_list))

    details_path = os.path.join(output_path, "%s_%s_details.csv" % (timestamp, project_name))
    # newline="" lets the csv module control line endings (no blank rows on Windows);
    # an explicit UTF-8 encoding keeps commit messages with non-ASCII text writable
    # regardless of the platform's default encoding.
    with open(details_path, "w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(
            ["hashcode", "score", "class", "keyword", "msg", "original msg", "tagged", "pattern"])
        for index, each in enumerate(scores):
            id_, sc, classes, key_words, each_s, each_o = each
            # One line per tagged sentence, tokens separated by spaces.
            tagged_new = "\n".join(" ".join(each_sentence) for each_sentence in all_tagged[index])
            # Always write the pattern column (empty when no pattern was collected)
            # so that every row has as many fields as the header.
            pattern_new = "\n".join(collected_tags[index])
            writer.writerow([id_, sc, "**".join(remove_repeat(classes)),
                             "**".join(remove_repeat(key_words)), each_s, each_o,
                             tagged_new, pattern_new])