# GitStractor Jupyter Notebook Data Visualization

This notebook serves as a set of examples for visualizing codebases using [GitStractor](https://github.com/integerman/gitstractor) and Jupyter Notebooks.

Contact [Matt Eland](https://MattEland.dev) ([@IntegerMan](https://twitter.com/IntegerMan)) with questions.

## Requirements

This notebook currently requires:

- CSV files generated by [GitStractor](https://github.com/integerman/gitstractor)
- Jupyter Notebooks running some version of Python (tested using Python 3.8.8)
- The following Python libraries:
  - pandas
  - plotly (both `plotly.express` and `plotly.graph_objects` are used)

In [1]:
# Load Dependencies
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

## Data Loading

In [2]:
# Project Name shows up in some visualizations
project_name = 'Accessible AI Blog'

# This should point to the location containing the GitStractor CSV files
data_dir = 'C:\\tools\\gitstractor'

# These are the default GitStractor file names and shouldn't need to be customized.
# os.path.join inserts the path separator of the OS running the notebook, so
# pointing data_dir at a non-Windows location works without editing these lines.
author_file = os.path.join(data_dir, 'Authors.csv')
commits_file = os.path.join(data_dir, 'Commits.csv')
file_commits_file = os.path.join(data_dir, 'FileCommits.csv')
files_file = os.path.join(data_dir, 'Files.csv')
final_structure_file = os.path.join(data_dir, 'FinalStructure.csv')

### Load Authors

In [3]:
# Read the GitStractor authors CSV and preview its first rows
df_authors = pd.read_csv(filepath_or_buffer=author_file)

df_authors.head(n=5)

Unnamed: 0,Name,Email,NumCommits,TotalBytes
0,Matt Eland,Matt.Eland@GMail.com,531,316359646
1,repo-visualizer,repo-visualizer@users.noreply.github.com,304,41592641


### Load Commits

In [81]:
# Read the commit log; both timestamp columns are parsed into datetimes
df_commits = pd.read_csv(commits_file, parse_dates=['AuthorDateUTC', 'CommitterDate'])

# Derive calendar columns from the author timestamp for later grouping and coloring
author_ts = df_commits['AuthorDateUTC']
df_commits = df_commits.assign(**{
    'date': author_ts.dt.date,
    'year': author_ts.dt.year,
    'month': author_ts.dt.month,
    # Casting to month precision truncates every timestamp to the start of its month
    'year-month': author_ts.to_numpy().astype('datetime64[M]'),
    'weekday': author_ts.dt.weekday,
    'weekday_name': author_ts.dt.strftime("%A"),
})

# Look up each author's display name by e-mail address. The merge relies on pandas'
# default inner join, so only commits whose AuthorEmail appears in df_authors are kept.
author_names = df_authors[['Name', 'Email']]
df_commits = (
    df_commits
    .merge(author_names, left_on='AuthorEmail', right_on='Email')
    .drop(columns=['Email'])
    .rename(columns={'Name': 'AuthorName'})
)

df_commits.head(5)

Unnamed: 0,CommitHash,AuthorEmail,AuthorDateUTC,CommitterEmail,CommitterDate,Message,NumFiles,AddedFiles,DeletedFiles,TotalFiles,TotalBytes,FileNames,date,year,month,year-month,weekday,weekday_name,AuthorName
0,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,94,94,0,94,1979490,"2d97c5 .gitignore @ fb9968 (Added), e69de2 .hu...",2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
1,170cc0d8dfff42cfe84f33ab0c4976fe7a9c6a0d,Matt.Eland@GMail.com,2021-11-05 02:37:51,Matt.Eland@GMail.com,2021-11-05 02:37:51,Some initial content.,6,2,0,95,82194,"dfe113 config.toml @ 170cc0 (Modified), ac5c6c...",2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
2,775ca7ac6df7d904dfcf429f5c317286175d6615,Matt.Eland@GMail.com,2021-11-05 02:41:00,Matt.Eland@GMail.com,2021-11-05 02:41:00,Fixed URL,1,0,0,95,851,6c6a5b config.toml @ 775ca7 (Modified),2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
3,b6f7a8d2299fa09affa1bd6ab56f22c5926ba8f0,Matt.Eland@GMail.com,2021-11-05 02:42:40,Matt.Eland@GMail.com,2021-11-05 02:42:40,Tweak for JS,1,0,0,95,533,4c21a7 themes/terminal/layouts/partials/footer...,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
4,03003f604e8289724c4cc637e8b123b8ac4b6ac4,Matt.Eland@GMail.com,2021-11-05 02:45:31,Matt.Eland@GMail.com,2021-11-05 02:45:31,Trying a footer tweak,1,0,0,95,494,1540a8 themes/terminal/layouts/partials/footer...,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland


In [5]:
# Read the GitStractor FileCommits CSV and preview its first rows
df_file_commits = pd.read_csv(filepath_or_buffer=file_commits_file)

df_file_commits.head(n=5)

Unnamed: 0,FilePath,FileHash,CommitHash,AuthorEmail,AuthorDateUTC,CommitterEmail,CommitterDate,Message,Bytes,Lines
0,.gitignore,2d97c55f7dd455f5ac604b337562bf6a148d7ca8,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Initial site contents,509,41
1,.hugo_build.lock,e69de29bb2d1d6434b8b29ae775ad8c2e48c5391,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Initial site contents,0,0
2,archetypes/default.md,00e77bd79be44872c0b29256b03799c2fb00c10d,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Initial site contents,84,6
3,config.toml,9e2bebcf50195939989aa578281ae34340056124,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Initial site contents,959,35
4,content/about.md,8944da150ed7125974f91ecd4d97c9aa2e97eea4,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Matt.Eland@GMail.com,11/5/2021 12:30:19 AM,Initial site contents,813,22


### Load File Structure

In [6]:
# Read the final repository structure. Every missing value is replaced with '.'
# (in the preview below this is what fills the unused PathN directory levels).
df_files = pd.read_csv(final_structure_file).fillna('.')

df_files.head(5)

Unnamed: 0,CommitHash,FileHash,Filename,Extension,FilePath,State,Lines,Bytes,CreatedDateUTC,Path1,Path2,Path3,Path4,Path5
0,cc46ac38229baf9b022920fb1b2713a889809b49,00e77bd79be44872c0b29256b03799c2fb00c10d,default.md,.md,archetypes/default.md,Final,6,84,3/16/2023 4:24:08 AM,archetypes,.,.,.,.
1,cc46ac38229baf9b022920fb1b2713a889809b49,32818c64f3d36d812fc69b14101be976f46a3f9f,about.md,.md,content/about.md,Final,32,1870,3/16/2023 4:24:08 AM,content,.,.,.,.
2,cc46ac38229baf9b022920fb1b2713a889809b49,6454ea6576efca7982ab78d3ad2e1520577c4140,bots.md,.md,content/bots.md,Final,45,3292,3/16/2023 4:24:08 AM,content,.,.,.,.
3,cc46ac38229baf9b022920fb1b2713a889809b49,7ae052f95abcdace9ae07e86cb2c3fd2ce47a2b2,chat.md,.md,content/chat.md,Final,14,577,3/16/2023 4:24:08 AM,content,.,.,.,.
4,cc46ac38229baf9b022920fb1b2713a889809b49,c5541482230075d4b4800ebbb7ac42fad29e5133,browserconfig.xml,.xml,content/img/FavIcon/browserconfig.xml,Final,2,281,3/16/2023 4:24:08 AM,content,img,FavIcon,.,.


## Data Visualization

In [7]:
# Shared styling used by the charts in this notebook

# Layout template applied by the format_and_show* helpers
template = 'plotly_dark'

# Palette for categorical (discrete) colors
theme_discrete = px.colors.qualitative.Prism

# Single-direction palettes
theme_sequential = px.colors.sequential.Agsunset
theme_hot = px.colors.sequential.Reds
theme_cold = px.colors.sequential.Blues

# Diverging palettes (Picnic is also exposed in reversed order)
theme_diverging_neutral = px.colors.diverging.RdYlBu
theme_diverging = px.colors.diverging.Picnic_r
theme_diverging_r = px.colors.diverging.Picnic
theme_continuous = px.colors.diverging.balance

In [8]:
# Utility Formatting functions
def _format_and_show(fig, height, width=None):
    """Apply the notebook template and a fixed size to ``fig``, then display it.

    Args:
        fig: Figure object exposing ``update_layout`` and ``show``.
        height: Layout height to apply.
        width: Optional layout width. When omitted, no width is set on the layout.
    """
    layout = {'template': template, 'height': height}
    if width is not None:
        layout['width'] = width
    fig.update_layout(**layout)
    fig.show()


def format_and_show_short(fig):
    """Show ``fig`` with the shared template at a height of 400."""
    _format_and_show(fig, height=400)


def format_and_show(fig):
    """Show ``fig`` with the shared template at a height of 550."""
    _format_and_show(fig, height=550)


def format_and_show_tall(fig):
    """Show ``fig`` with the shared template at a height of 800."""
    _format_and_show(fig, height=800)


def format_and_show_3d(fig):
    """Show ``fig`` with the shared template at 800 wide by 600 high."""
    _format_and_show(fig, height=600, width=800)


def format_and_show_sunburst(fig):
    """Show ``fig`` with the shared template at 1024 wide by 800 high."""
    _format_and_show(fig, height=800, width=1024)

### File Structure

Data visualizations exploring the static structure of the git repository's final state

In [9]:
# Friendly display names for df_files columns, passed to Plotly Express via `labels`
file_labels = dict(
    Path1='Project',
    Path2='Area',
    Lines='Lines of Code',
    Lines_sum='Total Lines of Code',
)

In [10]:
# Files by File Size
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Lines',
                 title=project_name + ' Largest Files (Lines)',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [11]:
# Files by File Size
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Bytes',
                 title=project_name + ' Largest Files (Bytes)',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [12]:
# Sunburst diagram. Same data as a treemap, but different presentation
fig = px.sunburst(df_files,
                 path=['Path1','Path2','Path3','Filename'],
                 color='Lines',
                 title=project_name + ' Size of Code Files by Project and Directory',
                 hover_data=['FilePath'],
                 color_continuous_scale='sunsetdark',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_sunburst(fig)

In [13]:
# Files by Directory Structure
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Path1',
                 title=project_name + ' Project Structure',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [14]:
# Files by Extension
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Extension',
                 title=project_name + ' File Types',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [15]:
# Distribution of file line counts, with bars colored by second-level directory (Path2)
fig = px.histogram(
    df_files,
    title=project_name + ' Frequency of File Sizes by Area',
    x="Lines",
    color='Path2',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
)
format_and_show(fig)

In [16]:
# Overall box plot for all code
fig = px.box(df_files,
             title=project_name + ' Lines of Code by Area',
             x='Lines',
             y='Path2',
             color='Path2',
             color_discrete_sequence=theme_discrete,
             labels=file_labels,
             hover_data=['FilePath'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)

## Commit Graphs

Data visualizations exploring commit patterns

In [83]:
# Replacement Values to make the graphs look nice
commit_labels = {
                     'TotalBytes': 'Bytes',
                     'NumFiles': '# Files',
                     'weekday_name': 'Weekday',
                     'AuthorEmail': 'Author E-Mail',
                     'AuthorDateUTC': 'Date',
                     'AuthorName': 'Author',

                     'net_lines':'Net Lines',
                     'num_deletes': 'Lines Deleted',
                     'num_inserts': 'Lines Added',
                     'num_files': 'Files Modified',
                     'date': 'Date',
                     'datetime': 'Date',
                     'filename': 'File',
                     'message': 'Commit Message',
                     'hash': 'Hash',
                     'author_name': 'Author',
                     'count': 'Count',
                     'avg_net': 'Avg. Net Lines',
                     'num_commits': 'Commits',
                     'num_authors': 'Authors',
                     'sum_net': 'Total Net Lines',
                     'lines': 'Lines of Code',
                     'project': 'Project',
                 }

In [85]:
# Commit size in bytes over time.
# 'TotalBytes' is numeric, so Plotly Express colors the markers on a continuous
# scale; color_continuous_scale is the argument that controls it
# (color_discrete_sequence only applies to categorical colors).
fig = px.scatter(df_commits,
                 title= project_name + ' bytes per commit',
                 x='AuthorDateUTC',
                 y='TotalBytes',
                 color='TotalBytes',
                 color_continuous_scale=theme_sequential,
                 hover_data=['AuthorName'],
                 labels=commit_labels,
                 hover_name='Message')
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [86]:
# Number of files touched per commit over time.
# 'NumFiles' is numeric, so Plotly Express colors the markers on a continuous
# scale; color_continuous_scale is the argument that controls it
# (color_discrete_sequence only applies to categorical colors).
fig = px.scatter(df_commits,
                 title= project_name + ' files per commit',
                 x='AuthorDateUTC',
                 y='NumFiles',
                 color='NumFiles',
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels,
                 hover_data=['AuthorName'],
                 hover_name='Message')
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [87]:
# Commit counts over calendar dates, with bars colored by weekday name
fig = px.histogram(
    df_commits,
    x='date',
    color='weekday_name',
    title=project_name + ' Commits by Day of Week',
    labels=commit_labels,
    color_discrete_sequence=theme_sequential,
)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [88]:
# Commit counts per year, with bars colored by weekday name
fig = px.histogram(
    df_commits,
    x='year',
    color='weekday_name',
    title=project_name + ' Commits by Year (Day of Week Colorized)',
    labels=commit_labels,
    color_discrete_sequence=theme_sequential,
)
fig.update_layout(xaxis_title='Year')
format_and_show_short(fig)

In [89]:
# Commit counts per month (the 'year-month' column), with bars colored by weekday name
fig = px.histogram(
    df_commits,
    x='year-month',
    color='weekday_name',
    title=project_name + ' Commits by Month (Day of Week Colorized)',
    labels=commit_labels,
    color_discrete_sequence=theme_sequential,
)
fig.update_layout(xaxis_title='Month')
format_and_show_short(fig)

In [90]:
# Determine the length of the project in days. This allows us to bin future graphs by the exact number of days in the project
num_days = (df_commits['date'].max() - df_commits['date'].min()).days
num_days

496

In [91]:
# Daily commit counts, binned using the project length in days and colored by weekday
fig = px.histogram(
    df_commits,
    x='date',
    nbins=num_days,
    color='weekday_name',
    title=project_name + ' Daily Commits (Weekdays Colorized)',
    hover_name='Message',
    labels=commit_labels,
    color_discrete_sequence=theme_sequential,
)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [60]:
# Let's now remove author from things and just look at aggregate daily totals
df_commits_daily = df_commits.groupby('date').agg(
        num_files=('NumFiles', 'sum'),
        num_authors=('AuthorEmail', pd.Series.nunique),
        num_commits=('CommitHash', pd.Series.nunique),
        sum_inserts=('DeletedFiles', 'sum'),
        sum_deletes=('DeletedFiles', 'sum'),
        sum_total_files=('TotalFiles', 'sum'),
        min_total_files = ('TotalFiles', 'min'), 
        max_total_files=('TotalFiles', 'max'),
        avg_total_files=('TotalFiles', 'mean'),
        min_files = ('NumFiles', 'min'), 
        max_files=('NumFiles', 'max'),
        avg_files=('NumFiles', 'mean'),
        min_deletes = ('DeletedFiles', 'min'), 
        max_deletes=('DeletedFiles', 'max'),
        avg_deletes=('DeletedFiles', 'mean'),
        min_inserts = ('AddedFiles', 'min'), 
        max_inserts=('AddedFiles', 'max'),
        avg_inserts=('AddedFiles', 'mean'))

df_commits_monthly = df_commits.groupby('year-month').agg(
        num_files=('NumFiles', 'sum'),
        num_authors=('AuthorEmail', pd.Series.nunique),
        num_commits=('CommitHash', pd.Series.nunique),
        sum_inserts=('DeletedFiles', 'sum'),
        sum_deletes=('DeletedFiles', 'sum'),
        sum_total_files=('TotalFiles', 'sum'),
        min_total_files = ('TotalFiles', 'min'), 
        max_total_files=('TotalFiles', 'max'),
        avg_total_files=('TotalFiles', 'mean'),
        min_files = ('NumFiles', 'min'), 
        max_files=('NumFiles', 'max'),
        avg_files=('NumFiles', 'mean'),
        min_deletes = ('DeletedFiles', 'min'), 
        max_deletes=('DeletedFiles', 'max'),
        avg_deletes=('DeletedFiles', 'mean'),
        min_inserts = ('AddedFiles', 'min'), 
        max_inserts=('AddedFiles', 'max'),
        avg_inserts=('AddedFiles', 'mean'))

agg_commit_hover_data = ['sum_inserts', 'sum_deletes', 'min_files', 'min_inserts', 'min_deletes', 'max_files', 'max_inserts', 'max_deletes', 'avg_files', 'avg_inserts','avg_deletes']

In [92]:
# Commits per day; marker color repeats the commit count on the sequential scale
daily_dates = df_commits_daily.index
fig = px.scatter(
    df_commits_daily,
    x=daily_dates,
    y='num_commits',
    color='num_commits',
    color_continuous_scale=theme_sequential,
    title=project_name + ' Daily Commit Counts',
    hover_name=daily_dates,
    hover_data=agg_commit_hover_data,
    labels=commit_labels,
)
fig.update_layout(xaxis_title='Date')
format_and_show(fig)

In [93]:
# Commits per day; marker color shows how many distinct authors committed that day
daily_dates = df_commits_daily.index
fig = px.scatter(
    df_commits_daily,
    x=daily_dates,
    y='num_commits',
    color='num_authors',
    color_continuous_scale=theme_sequential,
    title=project_name + ' Daily Commit and Author Counts',
    hover_name=daily_dates,
    hover_data=agg_commit_hover_data,
    labels=commit_labels,
)
fig.update_layout(xaxis_title='Date')
format_and_show(fig)

In [68]:
# Monthly commit counts as a line; each marker is colored by that month's author count
monthly_commits_trace = go.Scatter(
    x=df_commits_monthly.index,
    y=df_commits_monthly['num_commits'],
    name='Commits',
    mode='lines+markers',
    line={'color': 'Purple'},
    marker={
        'color': df_commits_monthly['num_authors'],
        'size': 8,
        'colorscale': theme_sequential,
        'colorbar': {'title': "Authors"},
    },
)

fig = go.Figure()
fig.add_trace(monthly_commits_trace)
fig.update_layout(title=project_name + " Monthly Commits and Authors",
                  xaxis_title='Date',
                  yaxis_title='Commits')

format_and_show(fig)

In [69]:
# Monthly author counts as a line; each marker is colored by that month's commit count
monthly_authors_trace = go.Scatter(
    x=df_commits_monthly.index,
    y=df_commits_monthly['num_authors'],
    name='Authors',
    mode='lines+markers',
    line={'color': 'Purple'},
    marker={
        'color': df_commits_monthly['num_commits'],
        'size': 8,
        'colorscale': theme_sequential,
        'colorbar': {'title': "Commits"},
    },
)

fig = go.Figure()
fig.add_trace(monthly_authors_trace)
fig.update_layout(title=project_name + " Monthly Authors and Commits",
                  xaxis_title='Date',
                  yaxis_title='Authors')

format_and_show(fig)

In [70]:
# 3D view of monthly activity: month on x, commit count on y, author count on z
fig = px.scatter_3d(df_commits_monthly,
                 title=project_name + ' Monthly Commit and Author Counts',
                 x=df_commits_monthly.index,
                 y='num_commits',
                 z='num_authors',
                 color='num_authors',
                 hover_name=df_commits_monthly.index,
                 hover_data=agg_commit_hover_data,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

# 3D figures keep their axes under layout.scene, so the x-axis title has to be
# set there; a top-level xaxis_title only affects 2D cartesian axes.
fig.update_layout(scene=dict(xaxis_title='Date'))

format_and_show_3d(fig)

In [59]:
# 3D view of daily activity: date on x, commit count on y, average files per commit on z
fig = px.scatter_3d(df_commits_daily,
                 title=project_name + ' Daily Commit and Average Files Modified',
                 x=df_commits_daily.index,
                 y='num_commits',
                 z='avg_files',
                 color='avg_files',
                 hover_data=agg_commit_hover_data,
                 hover_name=df_commits_daily.index,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

# 3D figures keep their axes under layout.scene, so the x-axis title has to be
# set there; a top-level xaxis_title only affects 2D cartesian axes.
fig.update_layout(scene=dict(xaxis_title='Date'))

format_and_show_3d(fig)

## Authors

Data visualizations exploring author behaviors and tendencies

In [94]:
# Bytes per commit over time, one color per author
fig = px.scatter(
    df_commits,
    x='AuthorDateUTC',
    y='TotalBytes',
    color='AuthorName',
    color_discrete_sequence=theme_discrete,
    title=project_name + ' Total Bytes per Commit by Author',
    hover_name='Message',
    labels=commit_labels,
)

# Shrink the markers on traces drawn in 'markers' mode
fig.update_traces(selector={'mode': 'markers'}, marker={'size': 5})

format_and_show(fig)

In [95]:
# Drop commits whose author name is one of the placeholder values
placeholder_authors = ['(no author)', 'unknown']
df_attributed = df_commits[~df_commits['AuthorName'].isin(placeholder_authors)]

# Per-month, per-author totals, sorted so the most recent month comes first
monthly_author_groups = df_attributed.groupby(['year-month', 'AuthorName'])
df_contributor_monthly = monthly_author_groups.agg(
    count=('CommitHash', pd.Series.nunique),
    sum_files=('NumFiles', 'sum'),
    sum_inserts=('AddedFiles', 'sum'),
    sum_deletes=('DeletedFiles', 'sum'),
)
df_contributor_monthly = df_contributor_monthly.sort_index(ascending=False)

In [96]:
# Files added per month for each author; marker size is that author's commit count.
# The y value is sum_inserts (a sum of AddedFiles), so the title says "Files added"
# to agree with the y-axis label instead of the old "Net lines of code" wording.
fig = px.scatter(df_contributor_monthly,
        x=df_contributor_monthly.index.get_level_values(0),
        y='sum_inserts',
        size='count',
        labels=commit_labels,
        color=df_contributor_monthly.index.get_level_values(1),
        color_discrete_sequence=theme_discrete)
fig.update_layout(title=project_name + " Files added by month by author",
                  xaxis_title='Year / Month',
                  yaxis_title='Files Added',
                  legend_title='Author')
format_and_show(fig)

In [97]:
# One row of bubbles per author across months; bubble size is the commit count
contribution_months = df_contributor_monthly.index.get_level_values(0)
contribution_authors = df_contributor_monthly.index.get_level_values(1)
fig = px.scatter(
    df_contributor_monthly,
    x=contribution_months,
    y=contribution_authors,
    color=contribution_authors,
    color_discrete_sequence=theme_discrete,
    size='count',
    labels=commit_labels,
)
fig.update_layout(
    title=project_name + " Monthly Contribution History",
    xaxis_title='Year / Month',
    yaxis_title='Author',
    legend_title='Author',
)
format_and_show(fig)

In [98]:
# Overall box plot for all authors
fig = px.box(df_commits,
             title=project_name + ' bytes per commit by Author',
             x='TotalBytes',
             y='AuthorName',
             color='AuthorName',
             labels=commit_labels,
             color_discrete_sequence=theme_discrete,
             hover_data=['CommitHash','AuthorDateUTC','Message'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)