Import necessary **modules**:

In [68]:
%matplotlib inline
import csv
import os
import subprocess
from collections import Counter
from datetime import datetime, timedelta
from glob import escape, glob
from operator import itemgetter
from typing import List, Tuple

import artm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tree_sitter
from tqdm import tqdm
from tree_sitter import Language, Parser

Write a function to get a list of evenly spaced dates, counting back from the current moment.

In [69]:
def get_dates(number: int, delta: int) -> List:
    """
    Build a sorted list of datetime objects that starts at the current moment
    and steps away from it by a fixed number of days.
    :param number: how many dates to generate.
    :param delta: the step between neighbouring dates, in days.
    :return: a list of datetime objects in ascending order.
    """
    now = datetime.now()
    step = timedelta(days=delta)
    # The k-th date lies k steps before the current moment.
    timestamps = [now - k * step for k in range(number)]
    return sorted(timestamps)

In [70]:
# Generate five dates that are 31 days apart.
dates = get_dates(number=5, delta=31)
print(dates)

[datetime.datetime(2019, 10, 24, 20, 23, 12, 426345), datetime.datetime(2019, 11, 24, 20, 23, 12, 426345), datetime.datetime(2019, 12, 25, 20, 23, 12, 426345), datetime.datetime(2020, 1, 25, 20, 23, 12, 426345), datetime.datetime(2020, 2, 25, 20, 23, 12, 426345)]


Write a function to check out a repository as of a given date.

In [19]:
def checkout_by_date(repository: str, directory: str, date: datetime) -> None:
    """
    Copy a given repository into a folder and check out the last commit made before a given date.

    The external commands are run without a shell and receive the paths as separate
    arguments, so paths containing spaces or shell metacharacters are taken literally.
    The git commands run with the copy as their working directory, so they can never
    act on the repository that the current process happens to be in.
    :param repository: address of processed project.
    :param directory: address of target directory for a checkout.
    :param date: date of the checkout; only its year, month and day are used.
    :return: None.
    :raises subprocess.CalledProcessError: if copying or one of the git commands fails.
    """
    subprocess.run(['cp', '-r', repository, directory], check=True)
    # TODO: consider non-master branches
    revision = subprocess.run(
        ['git', 'rev-list', '-n', '1', '--before=' + date.strftime('%Y-%m-%d'), 'master'],
        cwd=directory, check=True, stdout=subprocess.PIPE, universal_newlines=True
    ).stdout.strip()
    # When no commit precedes the date there is nothing to check out,
    # so the copy is left at the revision it was copied with.
    if revision:
        subprocess.run(['git', 'checkout', revision], cwd=directory, check=True)

In [20]:
# Copy the repository and check it out as of the third of the generated dates.
checkout_by_date(repository='/home/diannao/java-design-patterns/',
                 directory='/home/diannao/java-design-patterns_new',
                 date=dates[2])

Write a function to get the file extensions for various languages.

In [22]:
def get_extensions(lang: str) -> str:
    """
    Returns the extension for a given language. TODO: more than one extension.
    :param lang: language name, one of 'c', 'c-sharp', 'cpp', 'java', 'python'.
    :return: the extension, without the leading dot.
    :raises KeyError: if the language is not listed.
    """
    extensions = {'c': 'c',
                  'c-sharp': 'cs',
                  'cpp': 'cpp',
                  'java': 'java',
                  'python': 'py'}
    return extensions[lang]

Write a function to get a list of files with a given extension from a directory.

In [23]:
def get_a_list_of_files(directory: str, extension: str) -> List[str]:
    """
    Get a list of files with a given extension, searching the directory recursively.

    Every visited directory name is escaped before it becomes part of the glob pattern,
    so characters such as "[", "*" or "?" in directory names are matched literally
    instead of being interpreted as wildcards.
    :param directory: the root directory that is studied.
    :param extension: extension of the listed files, without the leading dot.
    :return: list of paths matching "*.<extension>" in the directory and its subdirectories.
    """
    pattern = '*.' + extension
    return [path
            for root, _, _ in os.walk(directory)
            for path in glob(os.path.join(escape(root), pattern))]

In [72]:
# List the Java files of the test directory and show how many were found.
files = get_a_list_of_files(directory='/home/diannao/PycharmProjects/topic-dynamics/topic_dynamics/tests/test_files/',
                            extension='java')
len(files)

1

Write a function to read the contents of the file.

In [28]:
def read_file(file: str) -> bytes:
    """
    Read the contents of the file.

    The file is decoded explicitly as UTF-8 rather than with the locale-dependent default
    encoding, so the result does not depend on the machine the code runs on. It is opened
    in text mode, so line endings are translated to "\\n" before the text is re-encoded.
    :param file: address of the file.
    :return: bytes with the contents of the file, encoded as UTF-8.
    :raises UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(file, 'r', encoding='utf-8') as fin:
        return fin.read().encode('utf-8')

Write a function to get the positional bytes of the node.

In [31]:
def get_positional_bytes(node: 'tree_sitter.Node') -> Tuple[int, int]:
    """
    Extract start and end byte.
    :param node: node on the AST.
    :return: (start byte, end byte)
    """
    # The annotation is a string so that it is not evaluated when the function is
    # defined; defining the function therefore never fails on an unbound module name.
    return node.start_byte, node.end_byte

Write the utility functions for parsing.

In [49]:
# Cache of initialized parsers keyed by language name; filled by get_parser().
PARSERS = dict()


def get_tree_sitter_dir() -> str:
    """
    Locate the tree-sitter directory.
    :return: absolute path to the directory.
    """
    parsers_root = '/home/diannao/PycharmProjects/topic-dynamics/topic_dynamics/parsers/'
    return parsers_root


def get_tree_sitter_so() -> str:
    """
    Locate the built tree-sitter `.so` library.
    :return: path to `build/langs.so` inside the tree-sitter directory.
    """
    return os.path.join(get_tree_sitter_dir(), "build/langs.so")


def main_parse() -> None:
    """
    Build the tree-sitter library from the grammars stored in the tree-sitter directory.
    :return: None
    """
    parsers_root = get_tree_sitter_dir()
    # grammars to include, given relative to the tree-sitter directory
    grammar_dirs = (
        "vendor/tree-sitter-c",
        "vendor/tree-sitter-c-sharp",
        "vendor/tree-sitter-cpp",
        "vendor/tree-sitter-java",
        "vendor/tree-sitter-python",
    )
    grammar_locations = [os.path.join(parsers_root, grammar) for grammar in grammar_dirs]
    # build all the grammars into a single library stored at the `.so` location
    Language.build_library(get_tree_sitter_so(), grammar_locations)


def get_parser(lang: str) -> Parser:
    """
    Fetch the parser for a specific language, creating and caching it on first use.
    :param lang: language to use.
    :return: parser.
    """
    if lang in PARSERS:
        return PARSERS[lang]
    # First request for this language: load it from the built library and remember the parser.
    new_parser = Parser()
    new_parser.set_language(Language(get_tree_sitter_so(), lang))
    PARSERS[lang] = new_parser
    return new_parser


Initialize the parsing.

In [41]:
# Build the tree-sitter library at the location that get_parser() loads languages from.
main_parse()

Write a function to get the identifiers of a file.

In [74]:
def get_identifiers(file: str, lang: str) -> List[Tuple[str, int]]:
    """
    Gather a sorted list of identifiers in the file and their count.

    The AST is walked iteratively, with an explicit stack, so deeply nested code cannot
    exhaust Python's recursion limit.
    :param file: address of the file.
    :param lang: the language of file.
    :return: a list of tuples, identifier and count, sorted by descending count; identifiers
             with equal counts keep the order of their first occurrence in the traversal.
    """
    code = read_file(file)
    tree = get_parser(lang).parse(code)
    node_types = {'c': ['identifier', 'type_identifier'],
                  'c-sharp': ['identifier', 'type_identifier'],
                  'cpp': ['identifier', 'type_identifier'],
                  'java': ['identifier', 'type_identifier'],
                  'python': ['identifier', 'type_identifier']}
    wanted_types = node_types[lang]
    identifiers = []

    # Pre-order traversal of every descendant of the root. Children are pushed in
    # reverse so that they are popped, and therefore visited, from first to last.
    pending = list(reversed(tree.root_node.children))
    while pending:
        node = pending.pop()
        if node.type in wanted_types:
            start, end = get_positional_bytes(node)
            identifier = code[start:end].decode('utf-8')
            if '\n' not in identifier:  # Will break output files. Can add other bad characters later
                identifiers.append(identifier)
        pending.extend(reversed(node.children))

    return sorted(Counter(identifiers).items(), key=itemgetter(1), reverse=True)

In [75]:
# Extract the identifiers of the test Java file together with their counts.
identifiers = get_identifiers(file='../../tests/test_files/test.java', lang='java')
print(identifiers)

[('i', 9), ('anArray', 6), ('length', 2), ('System', 2), ('out', 2), ('ArrayDemo', 1), ('main', 1), ('String', 1), ('args', 1), ('print', 1), ('println', 1)]


Write a function to transform the identifiers into a writable format.

In [76]:
def transform_identifiers(identifiers: List) -> List[str]:
    """
    Convert the list of identifiers and counts into strings that can be written to a file.
    :param identifiers: list of tuples, identifier and count.
    :return: a list of strings in the form "identifier:count"; entries whose identifier
             is empty once trailing whitespace is stripped are left out.
    """
    stripped_pairs = ((pair[0].rstrip(), pair[1]) for pair in identifiers)
    return [name + ':' + str(count).rstrip() for name, count in stripped_pairs if name != '']

In [77]:
# Turn the (identifier, count) pairs into "identifier:count" strings.
formatted = transform_identifiers(identifiers=identifiers)
print(formatted)

['i:9', 'anArray:6', 'length:2', 'System:2', 'out:2', 'ArrayDemo:1', 'main:1', 'String:1', 'args:1', 'print:1', 'println:1']


Write the main function that slices and tokenizes the repository.

In [79]:
def main(repository: str, number: int, delta: int, lang: str, output: str, output_info: str) -> None:
    """
    Split the repository, parse the files, write the data into a file.

    A "project_slices" directory is created next to the repository and the repository is
    copied into it once per date. Every line of the output file has the form
    "index;path;identifier:count,identifier:count,...", and every line of the info file
    has the form "date;first_index,last_index" for the documents of one slice. Both files
    are written as UTF-8. Files that raise UnicodeDecodeError while being processed are
    skipped.
    :param repository: path to the repository to process.
    :param number: the amount of dates.
    :param delta: the time step between dates
    :param lang: language of parsing.
    :param output: an output file.
    :param output_info: a file for information about output (slice indexes).
    :return: None.
    :raises KeyError: if the language has no known extension.
    :raises FileExistsError: if the "project_slices" directory already exists.
    """
    # Resolve the extension first so that an unsupported language fails
    # before any directory is created or any copy of the repository is made.
    extension = get_extensions(lang)
    directory = os.path.abspath(os.path.join(repository, os.pardir, 'project_slices'))
    os.mkdir(directory)
    dates = get_dates(number, delta)
    lists_of_files = {}
    for date in dates:
        slice_name = date.strftime('%Y-%m-%d')
        subdirectory = os.path.abspath(os.path.join(directory, slice_name))
        checkout_by_date(repository, subdirectory, date)
        lists_of_files[slice_name] = get_a_list_of_files(subdirectory, extension)
    indexes_of_slices = {}
    count = 0
    # The encoding is explicit so that non-ASCII identifiers and paths can always be written.
    with open(output, 'w', encoding='utf-8') as fout:
        for date in dates:
            slice_name = date.strftime('%Y-%m-%d')
            starting_index = count + 1
            for file in lists_of_files[slice_name]:
                if not os.path.isfile(file):  # TODO: implement a better file-checking mechanism
                    continue
                try:
                    identifiers = get_identifiers(file, lang)
                except UnicodeDecodeError:
                    continue
                if len(identifiers) != 0:
                    count += 1
                    formatted_identifiers = transform_identifiers(identifiers)
                    fout.write(str(count) + ';' + file + ';' + ','.join(formatted_identifiers) + '\n')
            # A slice without documents ends up with a last index one below its first index.
            indexes_of_slices[slice_name] = (starting_index, count)
    with open(output_info, 'w', encoding='utf-8') as fout:
        for slice_name, (starting_index, ending_index) in indexes_of_slices.items():
            fout.write(slice_name + ';' + str(starting_index) + ',' + str(ending_index) + '\n')

In [84]:
# Slice the repository into 12 snapshots that are 31 days apart and tokenize its Java files.
main(repository='/home/diannao/Documents/java-design-patterns/',
     number=12,
     delta=31,
     lang='java',
     output='/home/diannao/Documents/java-design-patterns.txt',
     output_info='/home/diannao/Documents/java-design-patterns_info.txt')