# git log/history
<br>  

### References
- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)
- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)
- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)

In [None]:
import os
import pandas as pd
pd.options.mode.copy_on_write = True

from neo4j import GraphDatabase

In [None]:
import numpy as np
from plotly import graph_objects as plotly_graph_objects
from plotly.subplots import make_subplots

In [None]:
# To be able to distinguish between command line execution and Jupyter notebook execution
# we need to check if the environment variable NBCONVERT is set.
# The command line execution is required to take care of setting NBCONVERT.
def is_command_line_execution():
    """
    Tells whether the notebook is executed from the command line (CLI mode).
    return : bool : True if the environment variable NBCONVERT is set (to any value, even an empty one)
    """
    return os.environ.get('NBCONVERT') is not None

# SVG is the default renderer for static (non interactive) pictures for command line execution.
# Without command line execution no renderer is preselected (None).
default_renderer = 'svg' if is_command_line_execution() else None

print(
    "Command line execution (CLI mode): Yes"
    if default_renderer is not None
    else "Command line execution (CLI mode): No"
)

In [None]:
# Please set the environment variable "NEO4J_INITIAL_PASSWORD" in your shell
# before starting jupyter notebook to provide the password for the user "neo4j".
# It is not recommended to hardcode the password into jupyter notebook for security reasons.
# Note: os.environ.get returns None if the variable is not set. In that case None is passed as password.

# Connects to a local Neo4j instance using the Bolt protocol on port 7687.
driver = GraphDatabase.driver(uri="bolt://localhost:7687", auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")))
# NOTE(review): presumably raises right here if the database cannot be reached - confirm with the Neo4j driver documentation.
driver.verify_connectivity()

In [None]:
def get_cypher_query_from_file(cypher_file_name : str) -> str:
    """
    Reads the Cypher query contained in the given file.
    cypher_file_name : str : The path of the file that contains the Cypher query
    return : str : All lines of the file (including their line breaks) joined by a single space
    """
    # The encoding is given explicitly so that the result doesn't depend on the platform's default encoding.
    with open(cypher_file_name, encoding='utf-8') as file:
        return ' '.join(file.readlines())

In [None]:
def query_cypher_to_data_frame(filename : str, limit: int = 10_000):
    """
    Executes the Cypher query of the given file and returns its result as a data frame.
    filename : str : The path of the file that contains the Cypher query
    limit : int : The maximum number of rows. It gets appended to the query as "LIMIT" clause (default: 10_000)
    return : pd.DataFrame : One row per returned record and one column per result key
    """
    limited_query = f"{get_cypher_query_from_file(filename)}\nLIMIT {limit}"
    # The query summary (second entry of the result) is not needed here.
    records, _summary, keys = driver.execute_query(limited_query)
    rows = [record.values() for record in records]
    return pd.DataFrame(rows, columns=keys)

In [None]:
def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):
    """
    Runs the Cypher queries of the given files one after another and returns the first non-empty result.
    If every query leads to an empty result, the last (empty) result is returned.
    Without any file name an empty data frame is returned.
    The optional "limit=" gets appended to each query as "LIMIT" so that only the first results are returned.
    """
    latest_result = pd.DataFrame()
    for filename in filenames:
        latest_result = query_cypher_to_data_frame(filename, limit)
        if latest_result.empty:
            continue
        # First query with at least one row found: no need to run the remaining ones.
        break
    return latest_result

In [None]:
#The following cell uses the built-in %%html "magic" to override the CSS style for tables to a much smaller size.
#This is especially needed for PDF export of tables with multiple columns.

In [None]:
%%html
<style>
/* CSS style for smaller dataframe tables. */
.dataframe th {
    font-size: 8px;
}
.dataframe td {
    font-size: 8px;
}
</style>

In [None]:
# Pandas DataFrame Display Configuration
# Sets the maximum width of displayed column contents to 300 characters.
pd.set_option('display.max_colwidth', 300)

## Git History - Directory Commit Statistics

In [None]:
# This first part defines helper functions that the following parts build upon.

### Treemap Layout Functions and Constants

In [None]:
# Shared default settings for Plotly Treemap charts

# Chart settings: color scale, hierarchy columns and depth
plotly_treemap_base_settings = {
    'color_continuous_scale': 'Hot_r',  # Alternatives: Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r
    'path': ['gitRepositoryName', 'directoryParentName', 'directoryName'],
    'maxdepth': -1,
}

# Trace settings: root color, tile text and rounded corners
plotly_treemap_traces_base_settings = {
    'root_color': "lightgrey",
    'textinfo': "label+value",
    'marker': {'cornerradius': 5},
}

# Layout settings: margins around the chart
plotly_treemap_layout_base_settings = {
    'margin': {'t': 50, 'l': 15, 'r': 15, 'b': 15},
}

# Figure settings: static "svg" renderer for command line execution, fixed size
plotly_treemap_figure_base_settings = {
    'renderer': "svg" if is_command_line_execution() else None,
    'width': 1000,
    'height': 550,
}

In [None]:
# Shared settings for Plotly Treemaps that show commit statistics of git file directories

# Base settings extended by the columns that are referenced in the hover template (by index)
plotly_treemap_commit_statistics_settings = {
    **plotly_treemap_base_settings,
    'custom_data': [
        'commitCount',
        'authorCount',
        'lastCommitDate',
        'daysSinceLastCommit',
        'lastCreationDate',
        'daysSinceLastCreation',
        'lastModificationDate',
        'daysSinceLastModification',
        'directoryPath',
    ],
}

# Variant with a diverging color scale
plotly_treemap_commit_statistics_settings_with_diverging_color_scale = {
    **plotly_treemap_commit_statistics_settings,
    'color_continuous_scale': 'RdBu',
}

# Variant with the reversed color scale
plotly_treemap_commit_statistics_settings_with_reverse_color_scale = {
    **plotly_treemap_commit_statistics_settings,
    'color_continuous_scale': 'Hot',
}

# Trace settings with a hover text whose "customdata" indices follow the order of "custom_data" above
plotly_treemap_traces_commit_statistics_settings = {
    **plotly_treemap_traces_base_settings,
    'hovertemplate': '<b>%{label}</b><br>Commits: %{customdata[0]}<br>Authors: %{customdata[1]}<br>Last Commit: %{customdata[2]} (%{customdata[3]} days ago)<br>Last Created: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Modified: %{customdata[6]} (%{customdata[7]} days ago)<br>Path: %{customdata[8]}',
}

In [None]:
def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame):
    """
    Creates a Plotly Treemap trace for the directories of the given data frame.
    It contains labels, parents, ids and hover data but neither values nor colors.
    data_frame : pd.DataFrame : The input data frame with one row per directory
    return : plotly_graph_objects.Treemap : The prepared Plotly Treemap
    """
    # The position of each column in this list is the index used for "customdata" in the hover template below.
    hover_columns = [
        'fileCount',
        'commitCount',
        'authorCount',
        'lastCommitDate',
        'daysSinceLastCommit',
        'lastCreationDate',
        'daysSinceLastCreation',
        'lastModificationDate',
        'daysSinceLastModification',
        'directoryPath',
    ]
    hover_template = '<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Last Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>Path: %{customdata[9]}'

    return plotly_graph_objects.Treemap(
        labels=data_frame['directoryName'],
        parents=data_frame['directoryParentPath'],
        ids=data_frame['directoryPath'],
        customdata=data_frame[hover_columns],
        hovertemplate=hover_template,
        maxdepth=-1,
        root_color="lightgrey",
        marker={'cornerradius': 5},
    )

### Data Preparation Functions

In [None]:
def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:
    """
    Adds a column that contains the values of the given column capped at the given quantile.
    Values above the quantile value are not filtered out. They are replaced by the quantile value.
    The input data frame is left untouched.
    input_data_frame : pd.DataFrame : The input data frame
    column_name : str : The name of the column to limit
    quantile : float : The quantile to limit the values to (default: 0.95)
    return : pd.DataFrame : A copy of the data frame with the added column (column_name + '_limited')
    """
    limited_data_frame = input_data_frame.copy()
    values = limited_data_frame[column_name]
    upper_limit = values.quantile(quantile)
    exceeds_limit = values > upper_limit
    limited_data_frame[column_name + '_limited'] = np.where(exceeds_limit, upper_limit, values)
    return limited_data_frame

In [None]:
def add_rank_column(input_data_frame : pd.DataFrame, column_name : str) -> pd.DataFrame:
    """
    Adds a column with the ascending "dense" rank of the values of the given column.
    Equal values share the same rank and there are no gaps between the ranks.
    The input data frame is left untouched.
    input_data_frame : pd.DataFrame : The input data frame
    column_name : str : The name of the column to rank
    return : pd.DataFrame : A copy of the data frame with the added column (column_name + '_rank')
    """
    ranks = input_data_frame[column_name].rank(ascending=True, method='dense')
    return input_data_frame.copy().assign(**{column_name + '_rank': ranks})

In [None]:
def filter_out_non_existing_parent_ids(data_frame: pd.DataFrame, parent_column: str, id_column: str):
    """
    Keeps only those rows whose parent ID is empty or exists as a value in the ID column.
    The removed rows (if any) are displayed for debugging purposes.
    data_frame : pd.DataFrame : The input data frame
    parent_column : str : The name of the parent column
    id_column : str : The name of the ID column
    return : pd.DataFrame : The filtered data frame
    """
    # An empty parent ID is always accepted in addition to the existing IDs.
    known_ids = data_frame[id_column].tolist() + ['']
    has_known_parent = data_frame[parent_column].isin(known_ids)

    # For Debugging
    rows_with_unknown_parent = data_frame[~has_known_parent]
    if not rows_with_unknown_parent.empty:
        print('\033[31mFiltered out rows with non-existing parent IDs. See the entries in the table below.\033[0m')
        display(rows_with_unknown_parent)
    else:
        display("No problems with non-existing parent IDs found.")

    return data_frame[has_known_parent]

def replace_empty_parent_by_repository_name(data_frame: pd.DataFrame, column_name: str, repository_column_name: str = ''):
    """
    Replaces empty (or missing) values in the given column by the repository name of the same row.
    The input data frame is left untouched.
    data_frame : pd.DataFrame : The input data frame
    column_name : str : The name of the column whose empty values get replaced
    repository_column_name : str : The name of the column that contains the value to be used instead of an empty one.
        It needs to be the name of an existing column, so the default '' only works if such a column exists.
    return : pd.DataFrame : A copy of the data frame with the replaced values
    """
    # Work on a copy so that the data frame of the caller isn't modified.
    modified_data_frame = data_frame.copy()
    repository_names = modified_data_frame[repository_column_name]
    # Empty strings are turned into missing values first so that "fillna" replaces both.
    # "np.nan" is used since the alias "np.NaN" has been removed in NumPy 2.0.
    modified_data_frame[column_name] = modified_data_frame[column_name].replace("", np.nan).fillna(repository_names)

    return modified_data_frame

def prepare_treemap_commit_statistics_data(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares the directory commit statistics for the treemap visualization.
    Rows with a parent directory that doesn't exist are removed and
    empty parent directories are replaced by the repository name.
    data_frame : pd.DataFrame : The input data frame
    return : pd.DataFrame : The data frame prepared for treemap visualization
    """
    prepared_data = data_frame
    # Removing a row can leave its child directories without an existing parent.
    # Therefore the filter is repeated until no more rows get removed.
    # This ends for sure since the number of rows shrinks with every repetition.
    while True:
        row_count_before = len(prepared_data)
        prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'directoryPath')
        if len(prepared_data) == row_count_before:
            break
    prepared_data = replace_empty_parent_by_repository_name(prepared_data, 'directoryParentPath', 'gitRepositoryName')
    return prepared_data

### Function to split file path levels

In [None]:
# TODO Still needed?

def fill_array_to_length(length: int, fill_value=''):
    """
    Creates a function that pads a list with the given fill value up to the given length.
    Lists that already have that length (or are longer) are returned with unchanged content.
    length : int : The length to fill the list to
    fill_value : any : The value to fill the list with (default: '')
    return : function : Takes a list and returns a new list that is filled up at its end
    """
    def pad_to_length(array: list):
        # A negative number of missing entries leads to an empty padding.
        padding = [fill_value] * (length - len(array))
        return array + padding

    return pad_to_length

def add_file_path_levels(input_dataframe: pd.DataFrame, file_path_column: str, delimiter: str = '/') -> tuple:
    """
    Adds hierarchical levels to a DataFrame based on a file path column.
    Every path is split by the delimiter into its segments. Each segment gets its own column
    ('level_1', 'level_2', ...). Shorter paths are filled up with empty strings at the end.
    The delimiter is taken literally (not as regular expression).
    Values that aren't strings (e.g. missing values) lead to empty strings in all level columns.
    input_dataframe : pd.DataFrame : The input DataFrame (left untouched)
    file_path_column : str : The name of the file path column
    delimiter : str : The delimiter used to split the file path (default: '/')
    return : tuple : A copy of the DataFrame with the added level columns and the list of their names
    """
    # Split every path into its segments
    split_paths = [
        path.split(delimiter) if isinstance(path, str) else []
        for path in input_dataframe[file_path_column]
    ]

    # Get the longest path length. It is derived from the split result to be independent of regular expressions.
    max_path_length = max((len(segments) for segments in split_paths), default=0)
    if max_path_length == 0:
        # No rows or no splittable path: there are no levels to add.
        return input_dataframe.copy(), []

    # Fill up shorter paths at their end so that every row has the same number of levels
    level_column_names = [f'level_{level}' for level in range(1, max_path_length + 1)]
    filled_paths = [segments + [''] * (max_path_length - len(segments)) for segments in split_paths]
    dataframe_split = pd.DataFrame(filled_paths, index=input_dataframe.index, columns=level_column_names)

    # Join the input with the level columns (aligned by index)
    return input_dataframe.copy().join(dataframe_split), level_column_names

### Data Preview

In [None]:
# Query the commit statistics of the git file directories and prepare them for the treemap visualization
git_file_directories_with_commit_statistics = query_cypher_to_data_frame("../cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher")
git_file_directories_with_commit_statistics = prepare_treemap_commit_statistics_data(git_file_directories_with_commit_statistics)

# Show a preview of the 10 directories with the highest file count
display("Data Preview ------------------")
git_file_directories_with_commit_statistics.sort_values(by="fileCount", ascending=False).head(10)

### Null Checks

In [None]:
# Number of null (missing) values per column of the DataFrame
git_file_directories_with_commit_statistics.isnull().sum() 

### Check for multiple root directories

In [None]:
# Take the dataframe "git_file_directories_with_commit_statistics" and find values (=directories) in column "directoryPath" that occur in more than one row.
# NOTE(review): The intention is to find directories with multiple parents (column "directoryParentPath").
# The filter below only counts the rows per "directoryPath" and doesn't compare the parent values - confirm that this is sufficient.

# Find directories that occur more than once
directories_with_multiple_parents = git_file_directories_with_commit_statistics.groupby('directoryPath').filter(lambda x: len(x) > 1)
directories_with_multiple_parents

### TODO solve recursive missing parent directories issues
`WHERE NOT git_file.relativePath STARTS WITH 'docs/deadlines-guide/modules/ROOT/pages'`

### Directories by file count

In [None]:
# Treemap of all directories where the tile size reflects the number of files
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),
    values = git_file_directories_with_commit_statistics['fileCount'],
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Directories and their file count'
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

### Number of commits per directory

In [None]:
# Cap the commit count at its 0.96 quantile (column "commitCount_limited"), which is used for the colors below.
git_commit_count_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, "commitCount", 0.96)

# Treemap: tile size = number of files, tile color = (limited) number of commits
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_count_per_directory),
    values = git_commit_count_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_count_per_directory['commitCount_limited'], 
        colorscale='Hot_r',
        colorbar=dict(title="Commits"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Number of git commits',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

### Number of distinct authors per directory

In [None]:
# Cap the author count at its 0.96 quantile (column "authorCount_limited"), which is used for the colors below.
git_commit_authors_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, "authorCount", 0.96)

# Treemap: tile size = number of files, tile color = (limited) number of distinct authors
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_authors_per_directory),
    values = git_commit_authors_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_authors_per_directory['authorCount_limited'], 
        colorscale='Hot_r',
        colorbar=dict(title="Authors"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Number of distinct commit authors',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

### Days since last commit per directory

In [None]:
# Add the dense rank of the days since the last commit (column "daysSinceLastCommit_rank") to the data.
git_commit_days_since_last_commit_per_directory = add_rank_column(git_file_directories_with_commit_statistics, "daysSinceLastCommit")

# Treemap: tile size = number of files, tile color = days since the last commit
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),
    values = git_commit_days_since_last_commit_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit'], 
        colorscale='Hot_r',
        colorbar=dict(title="Days"),
    ),
))

figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Days since last commit',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

In [None]:
# Treemap: tile size = number of files, tile color = rank of the days since the last commit
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),
    values = git_commit_days_since_last_commit_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'], 
        colorscale='Hot_r',
        colorbar=dict(title="Rank"),
    ),
))

figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Rank of days since last commit',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

### Days since last file creation per directory

In [None]:
# Add the dense rank of the days since the last file creation (column "daysSinceLastCreation_rank") to the data.
git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_file_directories_with_commit_statistics, "daysSinceLastCreation")

# Treemap: tile size = number of files, tile color = days since the last file creation
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),
    values = git_commit_days_since_last_file_creation_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation'], 
        colorscale='Hot_r',
        colorbar=dict(title="Days"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Days since last file creation',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

In [None]:
# Treemap: tile size = number of files, tile color = rank of the days since the last file creation
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),
    values = git_commit_days_since_last_file_creation_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'], 
        colorscale='Hot_r',
        colorbar=dict(title="Rank"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Rank of days since last file creation',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

### Days since last file modification per directory

In [None]:
# Add the dense rank of the days since the last file modification (column "daysSinceLastModification_rank") to the data.
git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_file_directories_with_commit_statistics, "daysSinceLastModification")

# Treemap: tile size = number of files, tile color = days since the last file modification
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),
    values = git_commit_days_since_last_file_modification_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification'], 
        colorscale='Hot_r',
        colorbar=dict(title="Days"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Days since last file modification',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)

In [None]:
# Treemap: tile size = number of files, tile color = rank of the days since the last file modification
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),
    values = git_commit_days_since_last_file_modification_per_directory['fileCount'],
    marker=dict(
        cornerradius=5, 
        colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'], 
        colorscale='Hot_r',
        colorbar=dict(title="Rank"),
    ),
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Rank of days since last file modification',
)
# Apply the shared figure settings so that the static "svg" renderer is used for command line execution.
figure.show(**plotly_treemap_figure_base_settings)