# Git Analysis
This notebook contains ways of visualizing git repositories in ways that are separate than the files in them. This is focused on visualizing the flow of commits, contributors, size of commits, etc. over time.

In [46]:
# Install statements for things not present in Anaconda
%pip install plotly
%pip install pydriller

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [47]:
# Imports

from pydriller import Repository

import pandas as pd
import plotly.express as px

account = 'IntegerMan'
repository = 'VisualizingCode'

path = 'https://github.com/' + account + '/' + repository

print('Using repository ' + path)

Using repository https://github.com/IntegerMan/VisualizingCode


In [48]:
repo = Repository(path)

commits = []

for commit in repo.traverse_commits():
    record = {
        'hash': commit.hash,
        'message': commit.msg,
        'author_name': commit.author.name,
        'author_email': commit.author.email,
        'author_date': commit.author_date,
        'author_tz': commit.author_timezone,
        'committer_name': commit.committer.name,
        'committer_email': commit.committer.email,
        'committer_date': commit.committer_date,
        'committer_tz': commit.committer_timezone,
        'in_main': commit.in_main_branch,
        'is_merge': commit.merge,
        'num_deletes': commit.deletions,
        'num_inserts': commit.insertions,
        'net_lines': commit.lines,
        'num_files': commit.files,
        'branches': ', '.join(commit.branches), # Comma separated list of branches the commit is found in
        #'files': ', '.join(commit.modified_files), # Comma separated list of files the commit modifies
        'parents': ', '.join(commit.parents), # Comma separated list of parents
        # PyDriller Open Source Delta Maintainability Model (OS-DMM) stat. See https://pydriller.readthedocs.io/en/latest/deltamaintainability.html for metric definitions
        'dmm_unit_size': commit.dmm_unit_size,
        'dmm_unit_complexity': commit.dmm_unit_complexity,
        'dmm_unit_interfacing': commit.dmm_unit_interfacing,
    }
    # Omitted: modified_files (list), project_path, project_name
    commits.append(record)

In [49]:
df_commits = pd.DataFrame(commits)
df_commits.to_csv('RawCommits.csv')
df_commits.head()

Unnamed: 0,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,committer_tz,...,is_merge,num_deletes,num_inserts,net_lines,num_files,branches,parents,dmm_unit_size,dmm_unit_complexity,dmm_unit_interfacing
0,262f4fbfa31cc7e1d9bb7b9871be1c9ac0242c57,Initial commit,Matt Eland,Matt.Eland@GMail.com,2021-10-07 00:50:53-04:00,14400,GitHub,noreply@github.com,2021-10-07 00:50:53-04:00,14400,...,False,0,131,131,2,main,,,,
1,436cece5340eb314fe5fe0eef94e423388755b23,Added code analysis export results from Visual...,Matt Eland,Matt.Eland@GMail.com,2021-10-07 00:53:05-04:00,14400,Matt Eland,Matt.Eland@GMail.com,2021-10-07 00:53:05-04:00,14400,...,False,0,0,0,1,main,262f4fbfa31cc7e1d9bb7b9871be1c9ac0242c57,,,
2,1b7c69a77563a9845cee2e5a55affea4c590bf17,Early work on file mining,Matt Eland,Matt.Eland@GMail.com,2021-10-07 14:33:19-04:00,14400,Matt Eland,Matt.Eland@GMail.com,2021-10-07 14:33:19-04:00,14400,...,False,0,109,109,9,main,436cece5340eb314fe5fe0eef94e423388755b23,1.0,1.0,1.0
3,5d59ceb5390057755c3648307324e30d5ea0dc92,Mining Progress,Matt Eland,Matt.Eland@GMail.com,2021-10-07 17:39:49-04:00,14400,Matt Eland,Matt.Eland@GMail.com,2021-10-07 17:39:49-04:00,14400,...,False,32,217,249,3,main,1b7c69a77563a9845cee2e5a55affea4c590bf17,1.0,1.0,1.0
4,c745f4f151247070e3d5ee9c3467cd3dec763cfd,Additional Mining,Matt Eland,Matt.Eland@GMail.com,2021-10-07 19:05:52-04:00,14400,Matt Eland,Matt.Eland@GMail.com,2021-10-07 19:05:52-04:00,14400,...,False,3,3,6,1,main,5d59ceb5390057755c3648307324e30d5ea0dc92,,,


In [50]:
df_commits.describe()

Unnamed: 0,author_tz,committer_tz,num_deletes,num_inserts,net_lines,num_files,dmm_unit_size,dmm_unit_complexity,dmm_unit_interfacing
count,30.0,30.0,30.0,30.0,30.0,30.0,6.0,6.0,6.0
mean,10560.0,10560.0,74.1,2413.233333,2487.333333,2.233333,0.5,0.72549,0.833333
std,6476.78081,6476.78081,306.707808,7216.892777,7406.393833,2.160513,0.547723,0.439671,0.408248
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,3600.0,3600.0,0.25,1.0,2.0,1.0,0.0,0.514706,1.0
50%,14400.0,14400.0,1.0,1.5,4.5,1.0,0.5,1.0,1.0
75%,14400.0,14400.0,11.25,99.0,106.0,2.75,1.0,1.0,1.0
max,14400.0,14400.0,1661.0,33703.0,34060.0,9.0,1.0,1.0,1.0


In [51]:
# Histogram of net lines of code
fig = px.histogram(df_commits,
                   x="net_lines",
                   title='Net Lines of Code',
                   labels={'net_lines': 'Net Lines of Code'},
                   color_discrete_sequence=px.colors.qualitative.Dark24)
fig.show()

In [52]:
# Scatter of net lines of code by number of files modified
fig = px.scatter(df_commits, x='num_files', y='net_lines', hover_name='message', hover_data=['author_name', 'author_date', 'hash', 'branches'])
fig.show()