# git log/history
<br>  

### References
- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)
- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)
- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plot
from matplotlib.colors import ListedColormap
from neo4j import GraphDatabase

In [None]:
# The password of the Neo4j user "neo4j" is read from the environment variable
# "NEO4J_INITIAL_PASSWORD". Please set it in your shell before starting jupyter notebook.
# Hardcoding the password into the notebook is not recommended for security reasons.

driver = GraphDatabase.driver(
    uri="bolt://localhost:7687",
    auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")),
)
# Check right away that the database can be reached
driver.verify_connectivity()

In [None]:
def get_cypher_query_from_file(cypher_file_name : str) -> str:
    """
    Reads the given file and returns all of its lines joined by a single space.
    Line breaks are kept, since every line still ends with its original newline character.
    cypher_file_name : str : The name (path) of the file containing the Cypher query
    """
    # Explicit encoding: without it, open() falls back to the platform dependent
    # locale encoding, which may garble non-ASCII characters in the query.
    with open(cypher_file_name, encoding="utf-8") as file:
        return ' '.join(file.readlines())


def query_cypher_to_data_frame(filename : str, limit: int = -1):
    """
    Runs the Cypher query stored in the given file and returns the result as a pandas DataFrame.
    filename : str : The name of the file containing the Cypher query
    limit : int : Optional maximum number of rows. If greater than zero, "LIMIT" is appended to the query. Default = -1 = no limit
    """
    query_text = get_cypher_query_from_file(filename)
    if limit > 0:
        query_text = f"{query_text}\nLIMIT {limit}"
    # The query summary (second entry of the result) is not needed here
    records, _, keys = driver.execute_query(query_text)
    rows = [record.values() for record in records]
    return pd.DataFrame(rows, columns=keys)


def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):
    """
    Runs the Cypher queries of the given files one after another and returns the first non-empty result.
    If every query leads to an empty result, the last (empty) result is returned.
    Without any file name, an empty DataFrame is returned.
    The optional "limit=" is passed on to every query to restrict the number of returned rows.
    """
    latest_result = pd.DataFrame()
    for cypher_file in filenames:
        latest_result = query_cypher_to_data_frame(cypher_file, limit)
        if not latest_result.empty:
            break  # first non-empty result found
    return latest_result

In [None]:
# The following cell uses the built-in %%html "magic" to override the CSS style for tables to a much smaller size.
# This is especially needed for PDF export of tables with multiple columns.

In [None]:
%%html
<style>
/* Smaller font size for the header and data cells of dataframe tables. */
.dataframe th,
.dataframe td {
    font-size: 8px;
}
</style>

In [None]:
# Main Colormap: Name of the Matplotlib colormap.
# NOTE(review): "main_color_map" isn't referenced in the cells visible here - confirm where it is used.
# Alternative: main_color_map = 'nipy_spectral'
main_color_map = 'viridis'

In [None]:
# Pandas DataFrame Display Configuration
# Allow up to 500 characters per column before the displayed cell content gets truncated.
pd.set_option('display.max_colwidth', 500)

## Pairwise Changed Files vs. Dependency Weight

This section explores the correlation between how often pairs of files are changed together (common commit count) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.

### Considerations
- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.
- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.
- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.
- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes.

#### Data Preview

In [None]:
# Run the query and preview the first 10 rows of the result
pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(
    filename="../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher"
)
pairwise_changed_git_files_with_dependencies.head(10)

#### Data Statistics

In [None]:
# Show the descriptive statistics (pandas "describe") of all columns of the query result
display("Pairwise changed git files compared to dependency weights - Overall statistics")
display(pairwise_changed_git_files_with_dependencies.describe())

# The correlation matrix plot can be found further below.
# The following lines are kept for reference and would display the correlation matrices as tables instead:
# display("Pairwise changed git files compared to dependency weights - Pearson Correlation")
# display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))

# display("Pairwise changed git files compared to dependency weights - Spearman Correlation")
# display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))

In [None]:
# Display the Pearson and Spearman correlation (including p-value) between each
# co-change column and the dependency weight. Skipped for less than 5 samples.
if len(pairwise_changed_git_files_with_dependencies) >= 5:
    for change_column in ('commitCount', 'updateCommitMinConfidence'):
        for correlation_name, correlation_function in (("Pearson", pearsonr), ("Spearman", spearmanr)):
            display(f"{correlation_name} Correlation with p-value for {change_column} and dependencyWeight")
            display(correlation_function(
                pairwise_changed_git_files_with_dependencies[change_column],
                pairwise_changed_git_files_with_dependencies['dependencyWeight'],
            ))
else:
    print("Less than 5 samples are not enough to calculate p-values")

In [None]:
def plot_correlation_matrix(correlation_matrix: pd.DataFrame, title_suffix: str = "") -> None:
    """
    Draws the given correlation matrix as a heatmap with the value written into every cell.

    :param correlation_matrix: DataFrame containing the values to plot. Its column and index labels are used as tick labels.
    :param title_suffix: Optional text that is appended to the title "Correlation Matrix".
    """
    column_labels = correlation_matrix.columns
    row_labels = correlation_matrix.index

    figure, axis = plot.subplots(figsize=(6, 6))
    heatmap = axis.matshow(correlation_matrix, cmap="coolwarm")
    figure.colorbar(heatmap)

    # One tick per column/row, labelled with the column/index names
    axis.set_xticks(range(len(column_labels)))
    axis.set_xticklabels(column_labels, rotation=90, fontsize=8)
    axis.set_yticks(range(len(row_labels)))
    axis.set_yticklabels(row_labels, fontsize=8)

    # Write every value (2 decimal places) into the center of its cell
    for row_position, row_values in enumerate(correlation_matrix.values):
        for column_position, cell_value in enumerate(row_values):
            axis.text(
                column_position,
                row_position,
                f"{cell_value:.2f}",
                ha='center',
                va='center',
                color='black',
                fontsize=8,
                bbox=dict(facecolor='white', alpha=0.2, edgecolor='none'),
            )

    plot.title(f"Correlation Matrix {title_suffix}", fontsize=10)
    plot.tight_layout()
    plot.show()

In [None]:
# Pearson correlation of all columns of the query result
plot_correlation_matrix(
    correlation_matrix=pairwise_changed_git_files_with_dependencies.corr(method="pearson"),
    title_suffix="(Pearson)",
)

In [None]:
# Spearman correlation of all columns of the query result
plot_correlation_matrix(
    correlation_matrix=pairwise_changed_git_files_with_dependencies.corr(method="spearman"),
    title_suffix="(Spearman)",
)

In [None]:
def pvalue_matrix(data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the p-values of the pairwise Pearson correlation between all columns.

    A cell is set to 1.0 (= no significance) when:
    - it is on the diagonal (a column compared with itself),
    - the calculated p-value is NaN,
    - the calculated p-value is (nearly) zero,
    - there are less than 2 rows, so that no correlation can be calculated at all.

    :param data: DataFrame with the columns to compare. Assumes all columns are numeric.
    :return: Symmetric DataFrame with the column names as columns and index, containing the p-values.
    """
    columns = data.columns
    column_count = len(columns)
    # Fill everything (including the diagonal) with value 1.0.
    # A p-value for 2 identical columns would lead to warnings and not reveal any valuable insights.
    p_values = pd.DataFrame(np.ones((column_count, column_count)), columns=columns, index=columns)

    # pearsonr raises a ValueError for less than 2 samples. Keep "no significance" everywhere instead.
    if len(data) < 2:
        return p_values

    # Only the upper triangle is calculated and then mirrored since the matrix is symmetric.
    for i in range(column_count):
        for j in range(i + 1, column_count):
            # Positional column access also works when column names aren't unique.
            _, p_value = pearsonr(data.iloc[:, i], data.iloc[:, j])
            # Replace nan and values too close to zero by 1.0 = no significance
            if np.isnan(p_value) or np.isclose(p_value, 0.0, rtol=1e-15, atol=1e-15):
                p_value = 1.0
            p_values.iloc[i, j] = p_value
            p_values.iloc[j, i] = p_value

    return p_values

def plot_p_value_matrix(p_value_matrix: pd.DataFrame) -> None:
    """
    Plots the given p-values as a two-colored heatmap with the value written into every cell.
    Cells with a p-value below 0.05 are drawn in green, all others in white.

    :param p_value_matrix: DataFrame containing the p-values. Its column and index labels are used as tick labels.
    """
    # Map values to 0 (green) and 1 (white)
    data_for_plot = np.where(p_value_matrix < 0.05, 0, 1)

    # Make a colormap: green for low p-values, white for others
    color_map = ListedColormap(["limegreen", "white"])

    # Plot heatmap. No colorbar is added: the plot only distinguishes the two colors above.
    _, axis = plot.subplots(figsize=(6, 5))
    axis.imshow(data_for_plot, cmap=color_map, vmin=0, vmax=1)

    # Show all ticks
    axis.set_xticks(np.arange(len(p_value_matrix.columns)))
    axis.set_yticks(np.arange(len(p_value_matrix.index)))
    axis.set_xticklabels(p_value_matrix.columns, fontsize=8)
    axis.set_yticklabels(p_value_matrix.index, fontsize=8)

    # Rotate tick labels
    plot.setp(axis.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor", fontsize=8)

    # Annotate with values. Iterating over the actual shape of the matrix ensures
    # that the row index always addresses rows and the column index always columns.
    for (row, column), cell_value in np.ndenumerate(p_value_matrix.values):
        if cell_value < 0.001:
            cell_text = f"{cell_value:.1e}"  # scientific notation
        else:
            cell_text = f"{cell_value:.4f}"  # normal 4-decimal format
        axis.text(column, row, cell_text, ha="center", va="center", color="black", fontsize=6)

    plot.title("p-value Matrix (< 0.05 in Green)", pad=20, fontsize=10)
    plot.tight_layout()
    plot.show()

# Plot p-values showing statistical significance
plot_p_value_matrix(
    p_value_matrix=pvalue_matrix(data=pairwise_changed_git_files_with_dependencies)
)

In [None]:
def plot_git_changes_vs_dependencies(pairwise_changes: pd.DataFrame, title: str, x_column: str, y_column: str):
    """
    Draws a scatter plot of two columns of the given DataFrame.
    Prints a message and plots nothing if the DataFrame is empty.

    :param pairwise_changes: DataFrame containing the data to plot.
    :param title: Title of the plot.
    :param x_column: Name of the column for the x axis. Also used as x axis label.
    :param y_column: Name of the column for the y axis. Also used as y axis label.
    """
    if not pairwise_changes.empty:
        x_values = pairwise_changes[x_column]
        y_values = pairwise_changes[y_column]
        plot.scatter(x=x_values, y=y_values, s=3)
        plot.xlabel(x_column)
        plot.ylabel(y_column)
        plot.title(title, pad=20)
        plot.show()
    else:
        print("No projected data to plot.")

In [None]:
# Scatter plot of all pairs of files: commit count (x axis) vs. dependency weight (y axis)

plot_git_changes_vs_dependencies(
    pairwise_changes=pairwise_changed_git_files_with_dependencies,
    title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',
    x_column='commitCount',
    y_column='dependencyWeight',
)

In [None]:
# Scatter plot of all pairs of files: min confidence (normalized update commit count, x axis) vs. dependency weight (y axis)

plot_git_changes_vs_dependencies(
    pairwise_changes=pairwise_changed_git_files_with_dependencies,
    title='Pairwise changed files: Min confidence co-change rate vs. dependency weight',
    x_column='updateCommitMinConfidence',
    y_column='dependencyWeight',
)