# git log/history
<br>  

### References
- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)
- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)
- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)

In [None]:
import os
import numpy as np
import pandas as pd
#pd.options.mode.copy_on_write = True

In [None]:
from neo4j import GraphDatabase
from plotly import graph_objects as plotly_graph_objects
from plotly.express import colors as plotly_colors

In [None]:
# To distinguish between command line execution and interactive Jupyter notebook execution,
# we check whether the environment variable NBCONVERT_PATH is set.
# The command line execution is responsible for setting NBCONVERT_PATH.

# Note: Even though it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)
#       for notebooks executed from the command line (via nbconvert),
#       that would require executing the notebook twice: once with interactivity and once for static Markdown and PDF.
#       Therefore, notebooks executed from the command line (nbconvert) contain static graphics (here using svg).
def is_command_line_execution():
    """
    Tells whether the notebook is executed from the command line.
    return : bool : True if the environment variable "NBCONVERT_PATH" is set (even if it is empty)
    """
    return os.environ.get('NBCONVERT_PATH') is not None

def get_offline_path():
    """
    Reads the output path for offline (static) files from the environment.
    return : str : The value of the environment variable "NBCONVERT_PATH" or an empty string if it is not set
    """
    return os.getenv('NBCONVERT_PATH', '')

# Select the default renderer depending on the execution mode and report the detected mode.
if is_command_line_execution():
    # SVG is the default renderer for static (non interactive) pictures for command line execution
    default_renderer = 'svg'
    print("Command line execution (CLI mode): Yes")
else:
    # No explicit renderer for interactive notebook execution
    default_renderer = None
    print("Command line execution (CLI mode): No")

In [None]:
# Please set the environment variable "NEO4J_INITIAL_PASSWORD" in your shell 
# before starting jupyter notebook to provide the password for the user "neo4j". 
# It is not recommended to hardcode the password into jupyter notebook for security reasons.

# Connect to the local Neo4j instance as user "neo4j".
# The password is read from the environment and never stored in the notebook.
driver = GraphDatabase.driver(
    uri="bolt://localhost:7687",
    auth=("neo4j", os.getenv("NEO4J_INITIAL_PASSWORD")),
)
# Fail early in case the database can't be reached.
driver.verify_connectivity()

In [None]:
def get_cypher_query_from_file(cypher_file_name : str):
    """
    Reads the Cypher query from the given file.
    The file is read as UTF-8 so that the result doesn't depend on the default encoding of the platform.
    cypher_file_name : str : The name (path) of the file containing the Cypher query
    return : str : All lines of the file (including their line breaks) joined by a single space
    """
    with open(cypher_file_name, encoding='utf-8') as file:
        return ' '.join(file.readlines())


def query_cypher_to_data_frame(filename : str, limit: int = -1):
    """
    Runs the Cypher query stored in the given file and returns its result as DataFrame.
    filename : str : The name of the file containing the Cypher query
    limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit
    return : pd.DataFrame : One row per record, one column per key of the query result
    """
    cypher_query = get_cypher_query_from_file(filename)
    if limit > 0:
        # Only positive limits are appended to the query. Everything else means "no limit".
        cypher_query = f"{cypher_query}\nLIMIT {limit}"

    records, _summary, keys = driver.execute_query(cypher_query)
    rows = [record.values() for record in records]
    return pd.DataFrame(rows, columns=keys)


def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):
    """
    Executes the Cypher queries of the given files in order and returns the first result that is not empty.
    If all given file names lead to empty results, the last (empty) result will be returned.
    Without any file name, an empty DataFrame is returned.
    By additionally specifying "limit=" the "LIMIT" keyword will be appended to the query so that only the first results get returned.
    """
    latest_result = pd.DataFrame()
    for filename in filenames:
        latest_result = query_cypher_to_data_frame(filename, limit)
        if latest_result.empty:
            continue
        return latest_result
    return latest_result

In [None]:
# The following cell uses the built-in %%html "magic" to override the CSS style for tables to a much smaller size.
# This is especially needed for the PDF export of tables with multiple columns.

In [None]:
%%html
<style>
/* CSS style for smaller dataframe tables: */
/* header cells (th) and data cells (td) both get the same reduced font size. */
.dataframe th {
    font-size: 8px;
}
.dataframe td {
    font-size: 8px;
}
</style>

In [None]:
# Pandas DataFrame Display Configuration:
# Show up to 500 characters per column before the content gets truncated.
pd.options.display.max_colwidth = 500

## Git History - Directory Commit Statistics

In [None]:
# --------------------------------------------------------------------------------
# Shared Treemap Layout Functions and Constants
# --------------------------------------------------------------------------------

In [None]:
# Base settings for Plotly Treemap

# Layout settings shared by all figures: top margin 50, other margins 15.
plotly_main_layout_base_settings = {
    'margin': {'t': 50, 'l': 15, 'r': 15, 'b': 15},
}
# Treemap and bar charts currently start with (shallow) copies of the shared layout settings.
plotly_treemap_layout_base_settings = {**plotly_main_layout_base_settings}
plotly_bar_layout_base_settings = {**plotly_main_layout_base_settings}
# Settings for "figure.show":
# Command line execution gets a smaller, static svg picture without interactive controls.
# Interactive notebook execution gets a larger figure with default renderer and configuration.
plotly_treemap_figure_show_settings = (
    {
        'renderer': "svg",
        'width': 680,
        'height': 680,
        'config': {'scrollZoom': False, 'displaylogo': False, 'displayModeBar': False},
    }
    if is_command_line_execution()
    else {
        'renderer': None,
        'width': 1080,
        'height': 1080,
        'config': {},
    }
)

# Marker style shared by all treemaps (rounded corners).
plotly_treemap_marker_base_style = {
    'cornerradius': 5,
}

# Marker style for treemaps that are colored by a continuous value.
# Colorscale candidates: Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r
plotly_treemap_marker_base_colorscale = {
    **plotly_treemap_marker_base_style,
    'colorscale': 'Hot_r',
}

In [None]:
def get_plotly_figure_write_image_settings(name: str):
    """
    Provides the keyword arguments for the "write_image" method of a plotly figure.
    The file is located in the offline path and named after the figure with the extension ".svg".
    :param name: Name of the figure
    :return: Dictionary with settings for the write_image method
    """
    svg_file_path = "/".join((get_offline_path(), name + ".svg"))
    return {
        'file': svg_file_path,
        'format': "svg",
        'width': 1080,
        'height': 1080,
    }

In [None]:
def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame):
    """
    Prepares a Plotly Treemap for the directory commit statistics of the given data frame.
    Directories are identified by 'directoryPath', labeled by 'directoryName' and nested by 'directoryParentPath'.
    data_frame : pd.DataFrame : The input data frame
    return : plotly_graph_objects.Treemap : The prepared Plotly Treemap
    """
    # The position of each column within this list is the index used for "customdata" in the hover template.
    hover_columns = [
        'fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount',
        'mainAuthor', 'secondAuthor',
        'lastCommitDate', 'daysSinceLastCommit',
        'lastCreationDate', 'daysSinceLastCreation',
        'lastModificationDate', 'daysSinceLastModification',
        'directoryPath',
    ]
    hover_template = (
        '<b>%{label}</b>'
        '<br>Files: %{customdata[0]} (%{customdata[1]})'
        '<br>Commits: %{customdata[2]}'
        '<br>Authors: %{customdata[4]},  %{customdata[5]},.. (%{customdata[3]})'
        '<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)'
        '<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)'
        '<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)'
        '<br>Path: %{customdata[12]}'
    )
    return plotly_graph_objects.Treemap(
        labels=data_frame['directoryName'],
        parents=data_frame['directoryParentPath'],
        ids=data_frame['directoryPath'],
        customdata=data_frame[hover_columns],
        hovertemplate=hover_template,
        maxdepth=-1,
        root_color="lightgrey",
        marker={**plotly_treemap_marker_base_style},
    )

In [None]:
def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):
    """
    Creates the marker settings for a plotly graph_objects.Treemap with a colorbar representing ranked names.
    data_frame : pd.DataFrame : The DataFrame that contains the name and the rank column
    name_column : str : The name of the column containing the names that are shown as colorbar tick labels
    rank_column : str : The name of the column containing the ranking
    return : dict : The marker settings (cornerradius, colors, colorscale, colorbar)
    """
    ranks = data_frame[rank_column]
    # Invert the rank so that the first rank gets the highest value and is shown on the top of the colorbar.
    top_first_rank_values = ranks.max() + 1 - ranks

    rank_colorbar = {
        'title': "Rank",
        'tickmode': "array",
        'ticktext': data_frame[name_column],
        'tickvals': top_first_rank_values,
        'tickfont_size': 10,
    }
    return {
        'cornerradius': 5,
        'colors': top_first_rank_values,
        # favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot
        'colorscale': plotly_colors.qualitative.G10,
        'colorbar': rank_colorbar,
    }

In [None]:
# --------------------------------------------------------------------------------
# Shared Visualization Data Preparation Functions
# --------------------------------------------------------------------------------

In [None]:
def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:
    """
    Caps the values of the given column at the value of the given quantile and stores them in a new column.
    Values above the quantile aren't filtered out. They are replaced by the quantile value.
    The input data frame is left untouched.
    input_data_frame : pd.DataFrame : The input data frame
    column_name : str : The name of the column to limit
    quantile : float : The quantile to limit the values to (default: 0.95)
    return : pd.DataFrame : A copy of the data frame with the added column (column_name + '_limited')
    """
    limited_column_name = column_name + '_limited'
    values = input_data_frame[column_name]
    upper_limit = values.quantile(quantile)

    result = input_data_frame.copy()
    result[limited_column_name] = np.where(values > upper_limit, upper_limit, values)
    return result

In [None]:
def add_rank_column(input_data_frame : pd.DataFrame, column_name : str) -> pd.DataFrame:
    """
    Ranks the values of the given column in ascending order ("dense" mode) and stores the rank in a new column.
    The input data frame is left untouched.
    input_data_frame : pd.DataFrame : The input data frame
    column_name : str : The name of the column to rank
    return : pd.DataFrame : A copy of the data frame with the added rank column (column_name + '_rank')
    """
    rank_column_name = column_name + '_rank'
    result = input_data_frame.copy()
    ranks = result[column_name].rank(method='dense', ascending=True)
    result[rank_column_name] = ranks
    return result

In [None]:
# --------------------------------------------------------------------------------
# Shared File Data Preparation Functions
# --------------------------------------------------------------------------------

In [None]:
def get_last_entry(values):
    """
    Get the last element of a sequence and therefore converts a sequence to a single element.
    Meant to be used as an aggregation or mapping function, e.g. on the lists produced by "str.split".
    values : list | Series : The list (or any sequence) or the pandas Series of values
    return : any : The last entry
    """
    if isinstance(values, pd.Series):
        # "values[-1]" would be a label lookup for a Series and fail for the default integer index.
        return values.iloc[-1]
    return values[-1]


def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):
    """
    Adds a fileExtension column to the input DataFrame based on the file path column.
    The new column is inserted (in place) directly after the file path column.
    Missing file paths lead to missing file extensions instead of an error.
    input_dataframe : pd.DataFrame : The input DataFrame
    file_path_column : str : The name of the file path column
    file_extension_column : str : The name of the file extension column to be added
    return : pd.DataFrame : The DataFrame with added file extension column
    """
    if file_extension_column in input_dataframe.columns:
        return input_dataframe # Column already exists

    # What is the correct extension in the following cases?
    #  - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition
    #  - MyReactComponent.test.tsx
    # Currently, it would be
    #  - HandlerEnhancerDefinition
    #  - tsx
    # which is not perfect but good enough to start with.

    # The file name is the last '/' separated element, the extension the last '.' separated element of it.
    # The vectorized ".str[-1]" tolerates missing values, whereas mapping a Python function over them would fail.
    file_names = input_dataframe[file_path_column].str.split('/').str[-1]
    file_extensions = file_names.str.split('.').str[-1]

    file_path_column_position = input_dataframe.columns.get_loc(file_path_column)
    input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)
    return input_dataframe

def remove_last_file_path_element(file_path_elements: list) -> list:
    """
    Drops the last element (the file name) of the file path so that only the directory names remain.
    A path without any directory results in a list containing only an empty string.
    file_path_elements : list : The elements of the file path
    return : list : The list of the directories
    """
    if len(file_path_elements) <= 1:
        return ['']
    return file_path_elements[:-1]

def convert_path_elements_to_directories(file_path_elements: list) -> list:
    """
    Converts the file path elements into the paths of all directories that contain the file.
    Example: ['src', 'main', 'App.java'] leads to ['src', 'src/main'].
    file_path_elements : list : The elements of the file path
    return : list : The list of directory paths, one for every directory level
    """
    directory_paths = []
    for directory_name in remove_last_file_path_element(file_path_elements):
        if directory_paths:
            # Extend the path of the previous level by the current directory name.
            directory_paths.append(directory_paths[-1] + '/' + directory_name)
        else:
            directory_paths.append(directory_name)
    return directory_paths

def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath'):
    """
    Adds a directory column as first column to the input DataFrame based on the file path column.
    Every row is repeated for each directory level of its file path.
    input_dataframe : pd.DataFrame : The input DataFrame (the column gets inserted in place)
    file_path_column : str : The name of the file path column
    directory_column : str : The name of the directory column to be added
    return : pd.DataFrame : The DataFrame with added directory column and one row per file and directory level
    """
    if directory_column in input_dataframe.columns:
        return input_dataframe # Column already exists

    path_elements = input_dataframe[file_path_column].str.split('/')
    directories_per_file = path_elements.apply(convert_path_elements_to_directories)
    input_dataframe.insert(0, directory_column, directories_per_file)
    # Turn the list of directories of every row into one row per directory.
    return input_dataframe.explode(directory_column)

def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName'):
    """
    Adds a directory name column (last '/' separated element of the directory) as second column to the input DataFrame.
    input_dataframe : pd.DataFrame : The input DataFrame (the column gets inserted in place)
    directory_column : str : The name of the directory column
    directory_name_column : str : The name of the directory name column to be added
    return : pd.DataFrame : The DataFrame with added directory name column
    """
    if directory_name_column in input_dataframe.columns:
        return input_dataframe # Column already exists

    def last_path_element(path_elements):
        return path_elements[-1]

    directory_names = input_dataframe[directory_column].str.rsplit('/', n=1).apply(last_path_element)
    input_dataframe.insert(1, directory_name_column, directory_names)
    return input_dataframe

def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath'):
    """
    Adds a directory parent column as second column to the input DataFrame based on the directory column.
    Directories without a parent (no '/' in their path) get an empty string as parent.
    input_dataframe : pd.DataFrame : The input DataFrame (the column gets inserted in place)
    directory_column : str : The name of the directory column
    directory_parent_column : str : The name of the directory parent column to be added
    return : pd.DataFrame : The DataFrame with added directory parent column
    """
    if directory_parent_column in input_dataframe.columns:
        return input_dataframe # Column already exists

    def first_split_element(split_elements):
        return split_elements[0]

    directories = input_dataframe[directory_column]
    # Everything before the last '/' is the parent. Without a '/', the directory itself is returned.
    parent_directories = directories.str.rsplit('/', n=1).apply(first_split_element)
    # A parent that equals its directory means "no parent" and is therefore cleared.
    parent_directories = parent_directories.mask(parent_directories == directories, '')

    input_dataframe.insert(1, directory_parent_column, parent_directories)
    return input_dataframe


def collect_as_array(values: pd.Series):
    """
    Collects all values as they are (no operation, "noop") into a numpy array.
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of values
    return : ndarray : The numpy array containing all values
    """
    collected_values = values.to_list()
    return np.asanyarray(collected_values)

def second_entry(values: pd.Series):
    """
    Picks the second entry (by position) of the given values.
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of values
    return : any : The second entry or None if there are less than two values
    """
    if len(values) < 2:
        return None
    return values.iloc[1]

def get_flattened_unique_values(values: pd.Series):
    """
    Flattens an array of arrays of strings and returns its sorted unique values.
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of arrays
    return : ndarray : The numpy array of unique values
    """
    flattened_values = np.concatenate(values.to_list())
    return np.unique(flattened_values)

def count_unique_aggregated_values(values: pd.Series):
    """
    Flattens an array of arrays of strings and counts its unique values.
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of arrays
    return : int : The number of unique values
    """
    flattened_values = np.concatenate(values.to_list())
    unique_values = np.unique(flattened_values)
    return len(unique_values)


def get_most_frequent_entry(input_values: pd.Series):
    """
    Flattens the array of arrays and returns the entry that occurs most often.
    In case of a tie, the first of the sorted unique entries wins.
    Meant to be used as an aggregation function for dataframe grouping.
    input_values : Series : The pandas Series of arrays
    return : str : The most frequent entry
    """
    flattened_values = np.concatenate(input_values.to_list())
    # Sorted unique entries together with the number of their occurrences
    unique_entries, occurrences = np.unique(flattened_values, return_counts=True)
    most_frequent_index = np.argmax(occurrences)
    return unique_entries[most_frequent_index]

In [None]:
# --------------------------------------------------------------------------------
# Main File Data Preparation
# --------------------------------------------------------------------------------

In [None]:
git_files_with_commit_statistics = query_cypher_to_data_frame(
    "../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher"
)

# Separate dataframe with all authors, the sum of their commit counts and the rank derived from it.
# It is needed later to visualize the (main) author for each directory.
git_file_authors = (
    git_files_with_commit_statistics[['author', 'commitCount']]
    .groupby('author')
    .agg(authorCommitCount=('commitCount', 'sum'))
    .sort_values(by='authorCommitCount', ascending=False)
    .reset_index()
)
# Rank 1 = author with the highest commit count. Authors with the same count share a rank ("dense").
git_file_authors['authorCommitCountRank'] = (
    git_file_authors['authorCommitCount'].rank(method='dense', ascending=False).astype(int)
)

# Debug
# display(git_file_authors)

# Debug
# display("1. query result ---------------------")
# display(git_files_with_commit_statistics)

# Derive the new column 'fileExtension' from every 'filePath'
git_files_with_commit_statistics = add_file_extension_column(
    git_files_with_commit_statistics,
    file_path_column='filePath',
    file_extension_column='fileExtension',
)

# Separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.
git_file_extensions = (
    git_files_with_commit_statistics['fileExtension']
    .value_counts()
    .rename_axis('fileExtension')
    .reset_index(name='fileExtensionCount')
)
# Rank 1 = most frequent file extension. Extensions with the same count share a rank ("dense").
git_file_extensions['fileExtensionCountRank'] = (
    git_file_extensions['fileExtensionCount'].rank(method='dense', ascending=False).astype(int)
)

# Debug
# display(git_file_extensions)

# Repeat the row of every file path for each of its directory paths, given in the new column 'directoryPath'
git_files_with_commit_statistics = add_directory_column(
    git_files_with_commit_statistics,
    file_path_column='filePath',
    directory_column='directoryPath',
)

# Debug
# display("2. added directoryPath and fileExtension --------------")
# display(git_files_with_commit_statistics)

# Aggregations for the columns that are treated the same way in every grouping step below:
# the smallest number of days and the latest date (most recent activity) are kept.
# Hint: maxCommitSha might not seem very useful, but it helps to group similar directories in the final step
common_named_aggregation = {
    column_name: pd.NamedAgg(column=column_name, aggfunc=aggregation_function)
    for column_name, aggregation_function in (
        ("daysSinceLastCommit", "min"),
        ("daysSinceLastCreation", "min"),
        ("daysSinceLastModification", "min"),
        ("lastCommitDate", "max"),
        ("lastCreationDate", "max"),
        ("lastModificationDate", "max"),
        ("maxCommitSha", "max"),
    )
}

# Group the git files by their directory and author (one row per directory/author combination)
# and collect the files, file extensions and commit hashes of each group.
git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(
    # Unique file paths of the group (counted per directory in the next grouping step)
    filePaths=pd.NamedAgg(column="filePath", aggfunc=np.unique),
    firstFile=pd.NamedAgg(column="filePath", aggfunc="first"),
    # All file extensions including duplicates, so that the most frequent one can be determined later
    fileExtensions=pd.NamedAgg(column="fileExtension", aggfunc=collect_as_array),
    # Flattened unique commit hashes -- assumes the "commitHashes" column contains arrays of hashes; TODO confirm against the query
    commitHashes=pd.NamedAgg(column="commitHashes", aggfunc=get_flattened_unique_values),
    # NOTE(review): "count" counts the non-null "commitHashes" entries (rows) of the group,
    # not the distinct hashes within them -- confirm that this is the intended measure for the ordering below.
    intermediateCommitCount=pd.NamedAgg(column="commitHashes", aggfunc="count"),
    **common_named_aggregation
)

# Sort the grouped and aggregated entries by the name of the directory ascending and by 'intermediateCommitCount' descending.
# The author with the highest 'intermediateCommitCount' will then be listed first for each directory.
git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])
# Turn the group keys 'directoryPath' and 'author' back into regular columns.
git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()

# Debug
# display("3. grouped by 'directoryPath' and 'author' -----")
# display(git_files_with_commit_statistics)

# Group the entries again, now only by their directory path, to get the aggregated number of authors, the main author and the second author.
# The rows are expected to be sorted by directory and descending 'intermediateCommitCount' (see sort_values above),
# so that the first row of each directory belongs to the main author and the second row to the second author.
# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step
git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(
    # Number of unique files across all authors of the directory
    fileCount=pd.NamedAgg(column="filePaths", aggfunc=count_unique_aggregated_values),
    firstFile=pd.NamedAgg(column="firstFile", aggfunc="first"),
    mostFrequentFileExtension=pd.NamedAgg(column="fileExtensions", aggfunc=get_most_frequent_entry),
    authorCount=pd.NamedAgg(column="author", aggfunc="nunique"),
    mainAuthor=pd.NamedAgg(column="author", aggfunc="first"),
    # None for directories with only one author
    secondAuthor=pd.NamedAgg(column="author", aggfunc=second_entry),
    # Number of unique commit hashes across all authors of the directory
    commitCount=pd.NamedAgg(column="commitHashes", aggfunc=count_unique_aggregated_values),
    **common_named_aggregation
)
# Turn the group key 'directoryPath' back into a regular column.
git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()

# Debug
# display("4. grouped by 'directoryPath' ----------------------")
# display(git_files_with_commit_statistics)

# Enrich the table with the name of the directory (last '/' separated element) and the path of its parent directory.
git_files_with_commit_statistics = (
    git_files_with_commit_statistics
    .pipe(add_directory_name_column, 'directoryPath', 'directoryName')
    .pipe(add_parent_directory_column, 'directoryPath', 'directoryParentPath')
)

# Debug
# display("5. added parent and name columns ------------")
# display(git_files_with_commit_statistics)

# Group finally by all columns except for the directory path, parent and name (first 3 columns).
# Directories with identical statistics are merged into one entry:
# - their names are joined with '/'
# - the parent path of the first directory of the group is kept
# - the path of the last directory of the group is kept
# "dropna=False" is essential: By default, groupby drops every row that has a missing value in one of the group keys.
# Directories with only one author have no 'secondAuthor' and would otherwise silently disappear from all results.
all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]
git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path, dropna=False).aggregate(
   directoryName=pd.NamedAgg(column="directoryName", aggfunc=lambda names: '/'.join(names)),
   directoryParentPath=pd.NamedAgg(column="directoryParentPath", aggfunc="first"),
   directoryPath=pd.NamedAgg(column="directoryPath", aggfunc="last"),
)
# Reorder the column positions so that the directory path is again the first column.
all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path
git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]

# Debug
# display("6. grouped by all except for directory path, name and parent columns (max) ----------------------")
# display(git_files_with_commit_statistics)

### Data Preview

In [None]:
git_files_with_commit_statistics.describe()

In [None]:
git_files_with_commit_statistics.head(30)

In [None]:
# Print prepared data frame to CSV file
# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)

### Number of files per directory

In [None]:
# Treemap with the directories sized by their number of files
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
        values=git_files_with_commit_statistics['fileCount'],
    )
)
figure.update_layout(title='Directories and their file count', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("NumberOfFilesPerDirectory"))

### Most frequent file extension per directory

In [None]:
# Look up the overall rank of the most frequent file extension of every directory.
# The left join keeps exactly one row per directory in the original order.
git_files_with_commit_statistics_and_file_extension_rank = git_files_with_commit_statistics.merge(
    git_file_extensions,
    how="left",
    left_on='mostFrequentFileExtension',
    right_on="fileExtension",
    validate="m:1",
)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_files_with_commit_statistics['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
        marker=create_rank_colorbar_for_graph_objects_treemap_marker(
            git_files_with_commit_statistics_and_file_extension_rank,
            'fileExtension',
            'fileExtensionCountRank',
        ),
    )
)
figure.update_layout(title='Most frequent file extension per directory', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("MostFrequentFileExtensionPerDirectory"))

### Number of commits per directory

In [None]:
# Cap the commit count at the 98% quantile so that outliers don't dominate the color scale.
git_commit_count_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, "commitCount", 0.98)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_commit_count_per_directory['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_commit_count_per_directory),
        marker={
            **plotly_treemap_marker_base_colorscale,
            'colors': git_commit_count_per_directory['commitCount_limited'],
            'colorbar': {'title': "Commits"},
        },
    )
)
figure.update_layout(title='Number of git commits', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("NumberOfGitCommits"))

### Number of distinct authors per directory

In [None]:
# Cap the author count at the 98% quantile so that outliers don't dominate the color scale.
git_commit_authors_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, "authorCount", 0.98)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_commit_authors_per_directory['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_commit_authors_per_directory),
        marker={
            **plotly_treemap_marker_base_colorscale,
            'colors': git_commit_authors_per_directory['authorCount_limited'],
            'colorbar': {'title': "Authors"},
        },
    )
)
figure.update_layout(title='Number of distinct commit authors', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("NumberOfDistinctCommitAuthors"))

### Directories with very few different authors

In [None]:
# Cap the author count at the 33% quantile to focus on directories with very few authors.
git_commit_authors_per_directory_low_focus = add_quantile_limited_column(git_files_with_commit_statistics, "authorCount", 0.33)

# The highest (capped) value stands for all directories with that many authors or more and is labeled accordingly.
author_count_top_limit = git_commit_authors_per_directory_low_focus['authorCount_limited'].max().astype(int).astype(str)
author_count_top_limit_label_alias = {author_count_top_limit: author_count_top_limit + ' or more'}

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_commit_authors_per_directory_low_focus['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_commit_authors_per_directory_low_focus),
        marker={
            **plotly_treemap_marker_base_colorscale,
            'colors': git_commit_authors_per_directory_low_focus['authorCount_limited'],
            'colorbar': {
                'title': "Authors",
                'tickmode': "auto",
                'labelalias': author_count_top_limit_label_alias,
            },
            'reversescale': True,
        },
    )
)
figure.update_layout(
    title='Number of distinct commit authors (red/black = only one or very few authors)',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("NumberOfDistinctCommitAuthorsLowFocus"))

### Main author per directory

In [None]:
# Look up the overall commit count rank of the main author of every directory.
# The left join keeps exactly one row per directory in the original order.
git_files_with_commit_statistics_and_main_author_rank = git_files_with_commit_statistics.merge(
    git_file_authors,
    how="left",
    left_on='mainAuthor',
    right_on="author",
    validate="m:1",
)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_files_with_commit_statistics['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
        marker=create_rank_colorbar_for_graph_objects_treemap_marker(
            git_files_with_commit_statistics_and_main_author_rank,
            'mainAuthor',
            'authorCommitCountRank',
        ),
    )
)
figure.update_layout(title='Main authors with highest number of commits', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("MainAuthorsWithHighestNumberOfCommits"))

### Second author per directory

In [None]:
# Look up the overall commit count rank of the second author of every directory.
# The left join keeps exactly one row per directory in the original order.
git_files_with_commit_statistics_and_second_author_rank = git_files_with_commit_statistics.merge(
    git_file_authors,
    how="left",
    left_on='secondAuthor',
    right_on="author",
    validate="m:1",
)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_files_with_commit_statistics['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
        marker=create_rank_colorbar_for_graph_objects_treemap_marker(
            git_files_with_commit_statistics_and_second_author_rank,
            'secondAuthor',
            'authorCommitCountRank',
        ),
    )
)
figure.update_layout(title='Second author with the second highest number of commits', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("SecondAuthorWithTheSecondHighestNumberOfCommits"))

### Days since last commit per directory

In [None]:
# Cap the days since the last commit at the 98% quantile so that outliers don't dominate the color scale.
git_commit_days_since_last_commit_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, "daysSinceLastCommit", 0.98)

# Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
# (values = git_commit_days_since_last_commit_per_directory['fileCount'])
figure = plotly_graph_objects.Figure(
    data=plotly_graph_objects.Treemap(
        arg=create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),
        marker={
            **plotly_treemap_marker_base_colorscale,
            'colors': git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_limited'],
            'colorbar': {'title': "Days"},
        },
    )
)
figure.update_layout(title='Days since last commit', **plotly_treemap_layout_base_settings)
figure.show(**plotly_treemap_figure_show_settings)

# Command line execution additionally writes the figure into a svg file.
if is_command_line_execution():
    figure.write_image(**get_plotly_figure_write_image_settings("DaysSinceLastCommit"))

### Days since last commit per directory (ranked)

In [None]:
# Colour by the rank of "daysSinceLastCommit" (column "daysSinceLastCommit_rank"
# added by the helper) instead of the raw number of days.
git_commit_days_since_last_commit_per_directory = add_rank_column(
    git_files_with_commit_statistics,
    "daysSinceLastCommit",
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = git_commit_days_since_last_commit_per_directory['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'],
    colorbar={"title": "Rank"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)

figure.update_layout(
    title='Rank of days since last commit',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("DaysSinceLastCommitRanked")
    figure.write_image(**image_export_settings)

### Days since last file creation per directory

In [None]:
# Colour by "daysSinceLastCreation", using the "_limited" column that the helper
# adds for the 0.98 quantile (presumably capping outliers so they don't dominate
# the colour scale - confirm in add_quantile_limited_column).
git_commit_days_since_last_file_creation_per_directory = add_quantile_limited_column(
    git_files_with_commit_statistics,
    "daysSinceLastCreation",
    0.98,
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = git_commit_days_since_last_file_creation_per_directory['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_limited'],
    colorbar={"title": "Days"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)
figure.update_layout(
    title='Days since last file creation',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("DaysSinceLastFileCreation")
    figure.write_image(**image_export_settings)

### Days since last file creation per directory (ranked)

In [None]:
# Colour by the rank of "daysSinceLastCreation" (column
# "daysSinceLastCreation_rank" added by the helper) instead of the raw days.
git_commit_days_since_last_file_creation_per_directory = add_rank_column(
    git_files_with_commit_statistics,
    "daysSinceLastCreation",
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = git_commit_days_since_last_file_creation_per_directory['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'],
    colorbar={"title": "Rank"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)
figure.update_layout(
    title='Rank of days since last file creation',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("DaysSinceLastFileCreationRanked")
    figure.write_image(**image_export_settings)

### Days since last file modification per directory

In [None]:
# Colour by "daysSinceLastModification", using the "_limited" column that the
# helper adds for the 0.98 quantile (presumably capping outliers so they don't
# dominate the colour scale - confirm in add_quantile_limited_column).
git_commit_days_since_last_file_modification_per_directory = add_quantile_limited_column(
    git_files_with_commit_statistics,
    "daysSinceLastModification",
    0.98,
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = git_commit_days_since_last_file_modification_per_directory['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_limited'],
    colorbar={"title": "Days"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)
figure.update_layout(
    title='Days since last file modification',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("DaysSinceLastFileModification")
    figure.write_image(**image_export_settings)

### Days since last file modification per directory (ranked)

In [None]:
# Colour by the rank of "daysSinceLastModification" (column
# "daysSinceLastModification_rank" added by the helper) instead of the raw days.
git_commit_days_since_last_file_modification_per_directory = add_rank_column(
    git_files_with_commit_statistics,
    "daysSinceLastModification",
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = git_commit_days_since_last_file_modification_per_directory['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'],
    colorbar={"title": "Rank"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)
figure.update_layout(
    title='Rank of days since last file modification',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("DaysSinceLastFileModificationRanked")
    figure.write_image(**image_export_settings)

### Files changed frequently with other files

In [None]:
# Step 1: Query the git files that were changed together with another file.
pairwise_changed_git_files = query_cypher_to_data_frame("../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher")

# Debug
# display("1. pairwise changed files --------------")
# display(pairwise_changed_git_files)

# Step 2: Add multiple rows for each file path, one per directory it is located in,
# with the directory path in the new column 'directoryPath'.
pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')

# Debug
# display("2. added directories --------------")
# display(pairwise_changed_git_files)

# Step 3: Aggregate per directory (across all levels): sum up the pairwise change
# commit counts and count the contributing file rows. The grouping key becomes a
# regular column again afterwards.
pairwise_changed_git_files = (
    pairwise_changed_git_files
    .groupby(['directoryPath'])
    .aggregate(
        pairwiseChangeCommitCount=pd.NamedAgg(column="commitCount", aggfunc="sum"),
        pairwiseChangeFileCount=pd.NamedAgg(column="filePath", aggfunc="count"),
    )
    .reset_index()
)

# Debug
# display("3. after grouping --------------")
# display(pairwise_changed_git_files)

# Step 4: Attach the pairwise change numbers to the general directory statistics.
# The left join keeps directories without any pairwise changes, and "m:1" makes
# pandas raise if a directory occurs more than once in the aggregated data.
pairwise_changed_git_files = git_files_with_commit_statistics.merge(
    pairwise_changed_git_files,
    on='directoryPath',
    how="left",
    validate="m:1",
)

# Debug
# display("4. after merging --------------")
# display(pairwise_changed_git_files)

# Step 5: Directories without pairwise changes got NaN from the left join.
# Treat them as zero and restore the integer type.
for pairwise_count_column in ('pairwiseChangeCommitCount', 'pairwiseChangeFileCount'):
    pairwise_changed_git_files[pairwise_count_column] = (
        pairwise_changed_git_files[pairwise_count_column].fillna(0).astype(int)
    )
# Keeps the previous row index as an additional column.
pairwise_changed_git_files = pairwise_changed_git_files.reset_index()

# Debug
# display("5. after NaN fill --------------")
# display(pairwise_changed_git_files)

In [None]:
# Colour by "pairwiseChangeCommitCount", using the "_limited" column that the
# helper adds for the 0.98 quantile (presumably capping outliers so they don't
# dominate the colour scale - confirm in add_quantile_limited_column).
pairwise_changed_git_files = add_quantile_limited_column(
    pairwise_changed_git_files,
    "pairwiseChangeCommitCount",
    0.98,
)

# No explicit `values` are passed on purpose: without them many more squares are
# shown, which gives a much better overview. The drawback is that the fileCount
# isn't visible. To size the squares by file count, add:
#   values = pairwise_changed_git_files['fileCount'],
treemap_trace_settings = create_treemap_commit_statistics_settings(pairwise_changed_git_files)
treemap_trace_marker = dict(
    **plotly_treemap_marker_base_colorscale,
    colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'],
    colorbar={"title": "Changes"},
)
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(treemap_trace_settings, marker=treemap_trace_marker)
)
figure.update_layout(
    title='Pairwise file changes',
    **plotly_treemap_layout_base_settings,
)
figure.show(**plotly_treemap_figure_show_settings)

# A static image is only exported when the notebook runs from the command line.
if is_command_line_execution():
    image_export_settings = get_plotly_figure_write_image_settings("PairwiseFileChanges")
    figure.write_image(**image_export_settings)

## Filecount per commit

Shows how many commits changed one file, how many changed two files, and so on.
The chart is limited to the first 30 rows of this distribution for improved readability.
The data preview also includes overall statistics, including the number of commits that changed more than 30 files.

### Preview data

In [None]:
# Distribution of commits by the number of files they changed
# (columns used below: "filesPerCommit" and "commitCount").
git_file_count_per_commit = query_cypher_to_data_frame("../cypher/GitLog/List_git_files_per_commit_distribution.cypher")

# Number of commits that changed more than 30 files each.
commits_changing_more_than_30_files = git_file_count_per_commit.loc[
    git_file_count_per_commit['filesPerCommit'] > 30, 'commitCount'
].sum()
print(f"Sum of commits that changed more than 30 files (each) = {commits_changing_more_than_30_files!s}")

# Largest number of files changed by a single commit.
print(f"Max changed files with one commit = {git_file_count_per_commit['filesPerCommit'].max()!s}")

# Overall statistics followed by the first 30 rows of the distribution.
display(git_file_count_per_commit.describe())
display(git_file_count_per_commit.head(30))

### Bar chart with the number of files per commit distribution

In [None]:
if not git_file_count_per_commit.empty:
    # Only the first 30 rows of the distribution are plotted to keep the chart readable.
    first_rows_of_file_count_per_commit = git_file_count_per_commit.head(30)
    files_per_commit_bar_trace = plotly_graph_objects.Bar(
        x=first_rows_of_file_count_per_commit['filesPerCommit'],
        y=first_rows_of_file_count_per_commit['commitCount'],
    )
    figure = plotly_graph_objects.Figure(files_per_commit_bar_trace)
    figure.update_layout(
        title='Changed files per commit',
        xaxis_title='file count',
        yaxis_title='commit count',
        **plotly_bar_layout_base_settings,
    )
    figure.show(**plotly_treemap_figure_show_settings)

    # A static image is only exported when the notebook runs from the command line.
    if is_command_line_execution():
        image_export_settings = get_plotly_figure_write_image_settings("ChangedFilesPerCommit")
        figure.write_image(**image_export_settings)
else:
    print("No data to plot")

## Pairwise Changed Files vs. Dependency Weight

This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.

### Considerations
- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.
- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.
- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.
- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes.

#### Data Preview

In [None]:
# Query the pairwise changed files together with their dependencies.
# Later cells read the columns "commitCount" and "dependencyWeight" from this frame.
pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame("../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher")
# The trailing expression makes the notebook render the first 20 rows as cell output.
pairwise_changed_git_files_with_dependencies.head(20)

#### Data Statistics

In [None]:
# Descriptive statistics first, then one correlation matrix per method.
display("Pairwise changed git files compared to dependency weights - Overall statistics")
display(pairwise_changed_git_files_with_dependencies.describe())

# Each entry pairs the heading to show with the pandas correlation method to use.
correlation_headings_and_methods = (
    ("Pairwise changed git files compared to dependency weights - Pearson Correlation", 'pearson'),
    ("Pairwise changed git files compared to dependency weights - Spearman Correlation", 'spearman'),
)
for correlation_heading, correlation_method in correlation_headings_and_methods:
    display(correlation_heading)
    display(pairwise_changed_git_files_with_dependencies.corr(method=correlation_method))

In [None]:
# The p-values are only calculated when at least 5 samples (rows) are available.
if len(pairwise_changed_git_files_with_dependencies) >= 5:
    from scipy.stats import pearsonr, spearmanr

    # Each entry pairs the heading to show with the scipy test that returns
    # the correlation coefficient together with its p-value.
    correlation_headings_and_tests = (
        ("Pearson Correlation with p-value for commitCount and dependencyWeight", pearsonr),
        ("Spearman Correlation with p-value for commitCount and dependencyWeight", spearmanr),
    )
    for correlation_heading, correlation_test in correlation_headings_and_tests:
        display(correlation_heading)
        display(correlation_test(
            pairwise_changed_git_files_with_dependencies['commitCount'],
            pairwise_changed_git_files_with_dependencies['dependencyWeight'],
        ))
else:
    print("Less than 5 samples are not enough to calculate p-values")

In [None]:
# Scatter plot with one marker per row of the pairwise changed files:
# commit count on the x axis and dependency weight on the y axis.

if not pairwise_changed_git_files_with_dependencies.empty:
    # Optionally scale the markers by the number of occurrences with:
    #   marker=dict(size=pairwise_changed_git_files_with_dependencies['occurrences'] + 8)
    pairwise_changes_scatter_trace = plotly_graph_objects.Scatter(
        x=pairwise_changed_git_files_with_dependencies['commitCount'],
        y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],
        mode='markers',
    )
    figure = plotly_graph_objects.Figure(pairwise_changes_scatter_trace)
    figure.update_layout(
        title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',
        xaxis_title='commit count',
        yaxis_title='dependency weight',
        **plotly_bar_layout_base_settings,
    )
    figure.show(**plotly_treemap_figure_show_settings)

    # A static image is only exported when the notebook runs from the command line.
    if is_command_line_execution():
        image_export_settings = get_plotly_figure_write_image_settings("PairwiseChangedFilesVsDependencyWeight")
        figure.write_image(**image_export_settings)
else:
    print("No data to plot")

## WordCloud of git authors

In [None]:
# Query the words of the git author names and their frequency from the graph database.
# The word cloud cell expects the words in the first and the frequency in the second column.
git_author_words_with_frequency = query_cypher_to_data_frame("../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher")

# Preview of the 10 most frequent words. The sorted copy is only displayed as cell output;
# git_author_words_with_frequency itself is left unchanged.
git_author_words_with_frequency.sort_values(by='frequency', ascending=False).reset_index(drop=True).head(10)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plot

# Renders a word cloud of the git author words, sized by their frequency.
if git_author_words_with_frequency.empty:
    # Report missing data like the other chart cells instead of silently showing nothing.
    print("No data to plot")
else:
    # Expects the first column of the DataFrame to contain the words/text
    # and the second column to contain the count/frequency.
    word_column = git_author_words_with_frequency.columns[0]
    frequency_column = git_author_words_with_frequency.columns[1]
    words_with_frequency_dict = git_author_words_with_frequency.set_index(word_column).to_dict()[frequency_column]

    wordcloud = WordCloud(
        width=800,
        height=800,
        max_words=600,
        collocations=False,
        background_color='white',
        colormap='viridis',
    ).generate_from_frequencies(words_with_frequency_dict)

    # Plot the word cloud without axes since it is an image, not a chart.
    plot.figure(figsize=(15, 15))
    plot.imshow(wordcloud, interpolation='bilinear')
    plot.axis("off")
    plot.title('Wordcloud of git authors')
    plot.show()