# GitStractor Jupyter Notebook Data Visualization

This notebook serves as a set of examples for visualizing codebases using [GitStractor](https://github.com/integerman/gitstractor) and Jupyter Notebooks.

Contact [Matt Eland](https://MattEland.dev) ([@IntegerMan](https://twitter.com/IntegerMan)) with questions.

## Requirements

This application currently requires:

- CSV files generated by [GitStractor](https://github.com/integerman/gitstractor)
- Jupyter Notebooks running some version of Python (tested using Python 3.8.8)
- The following Python libraries:
  - pandas
  - plotly.express

In [532]:
# Load Dependencies
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

## Data Loading

In [533]:
# Project Name shows up in some visualizations
project_name = 'AccessibleAI Blog'

# This should point to the location containing the GitStractor CSV files
data_dir = 'C:\\tools\\gitstractor'

# These are the default GitStractor file names and shouldn't need to be customized
author_file = data_dir + '\\Authors.csv'
commits_file = data_dir + '\\Commits.csv'
file_commits_file = data_dir + '\\FileCommits.csv'
files_file = data_dir + '\\Files.csv'
final_structure_file = data_dir + '\\FinalStructure.csv'

### Load Authors

In [534]:
df_authors = pd.read_csv(author_file)

df_authors.head(5)

Unnamed: 0,Name,Email,NumCommits,TotalBytes
0,Matt Eland,Matt.Eland@GMail.com,531,6539823
1,repo-visualizer,repo-visualizer@users.noreply.github.com,304,0


### Load Commits

In [535]:
def add_date_columns(df, dateColName):
    df['date'] = df[dateColName].dt.date
    df['year'] = df[dateColName].dt.year
    df['month'] = df[dateColName].dt.month
    df['year-month'] = df[dateColName].to_numpy().astype('datetime64[M]')
    df['weekday'] = df[dateColName].dt.weekday
    df['weekday_name'] = df[dateColName].dt.strftime("%A")
    
    return df

In [536]:
df_commits = pd.read_csv(commits_file, parse_dates=['AuthorDateUTC','CommitterDateUTC'])

# Engineer Date Component Columns
df_commits = add_date_columns(df_commits, 'AuthorDateUTC')

# Grab the name of the author from our authors dataframe
df_commits = df_commits.merge(df_authors[['Name','Email']], right_on='Email', left_on='AuthorEmail')
df_commits.drop(columns=['Email'], inplace=True)
df_commits.rename(columns={'Name':'AuthorName'}, inplace=True)

df_commits.head(5)

Unnamed: 0,CommitHash,AuthorEmail,AuthorDateUTC,CommitterEmail,CommitterDateUTC,Message,NumFiles,AddedFiles,DeletedFiles,TotalFiles,...,FileNames,TotalLines,NetLines,date,year,month,year-month,weekday,weekday_name,AuthorName
0,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,66,66,0,66,...,"00e77b archetypes/default.md @ fb9968 (Added),...",3082,3082,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
1,170cc0d8dfff42cfe84f33ab0c4976fe7a9c6a0d,Matt.Eland@GMail.com,2021-11-05 02:37:51,Matt.Eland@GMail.com,2021-11-05 02:37:51,Some initial content.,4,1,0,67,...,"ac5c6c content/about.md @ 170cc0 (Modified), f...",69,6,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
2,775ca7ac6df7d904dfcf429f5c317286175d6615,Matt.Eland@GMail.com,2021-11-05 02:41:00,Matt.Eland@GMail.com,2021-11-05 02:41:00,Fixed URL,0,0,0,67,...,,0,0,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
3,b6f7a8d2299fa09affa1bd6ab56f22c5926ba8f0,Matt.Eland@GMail.com,2021-11-05 02:42:40,Matt.Eland@GMail.com,2021-11-05 02:42:40,Tweak for JS,1,0,0,67,...,4c21a7 themes/terminal/layouts/partials/footer...,19,0,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
4,03003f604e8289724c4cc637e8b123b8ac4b6ac4,Matt.Eland@GMail.com,2021-11-05 02:45:31,Matt.Eland@GMail.com,2021-11-05 02:45:31,Trying a footer tweak,1,0,0,67,...,1540a8 themes/terminal/layouts/partials/footer...,19,0,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland


In [537]:
df_file_commits = pd.read_csv(file_commits_file, parse_dates=['AuthorDateUTC','CommitterDate'])

# Add Date Columns
df_file_commits = add_date_columns(df_file_commits, 'AuthorDateUTC')

# Grab the name of the author from our authors dataframe
df_file_commits = df_file_commits.merge(df_authors[['Name','Email']], right_on='Email', left_on='AuthorEmail')
df_file_commits.drop(columns=['Email'], inplace=True)
df_file_commits.rename(columns={'Name':'AuthorName'}, inplace=True)

df_file_commits.head()

Unnamed: 0,FilePath,FileHash,CommitHash,AuthorEmail,AuthorDateUTC,CommitterEmail,CommitterDate,Message,Bytes,Lines,NetLines,date,year,month,year-month,weekday,weekday_name,AuthorName
0,archetypes/default.md,00e77bd79be44872c0b29256b03799c2fb00c10d,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,84,6,6,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
1,content/about.md,8944da150ed7125974f91ecd4d97c9aa2e97eea4,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,813,22,22,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
2,content/post/hello.md,6e3938bc332c395463db37cddff90660a7346f9d,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,1798,18,18,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
3,content/showcase.md,1ffc15029cf768a7ccc5649f3e22383598f4c844,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,3473,84,84,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland
4,themes/terminal/.postcssrc.js,e0ea4b5f42453e4fbbade0ed888291f53469783e,fb99683d67a0a16dbcf0b56995025602952a8b61,Matt.Eland@GMail.com,2021-11-05 00:30:19,Matt.Eland@GMail.com,2021-11-05 00:30:19,Initial site contents,592,27,27,2021-11-05,2021,11,2021-11-01,4,Friday,Matt Eland


### Load File Structure

In [538]:
df_files = pd.read_csv(final_structure_file, parse_dates=['CreatedDateUTC'])

df_files.fillna('.', inplace=True)
df_files.head(5)

Unnamed: 0,CommitHash,FileHash,Filename,Extension,FilePath,State,Lines,Bytes,CreatedDateUTC,Path1,Path2,Path3,Path4,Path5
0,fb99683d67a0a16dbcf0b56995025602952a8b61,00e77bd79be44872c0b29256b03799c2fb00c10d,default.md,.md,archetypes/default.md,Final,6,84,2021-11-05 00:30:19,archetypes,.,.,.,.
1,94ab28dda040468e9899933673f87f74d5a8c7b8,32818c64f3d36d812fc69b14101be976f46a3f9f,about.md,.md,content/about.md,Final,32,1870,2023-03-11 21:25:48,content,.,.,.,.
2,94ab28dda040468e9899933673f87f74d5a8c7b8,6454ea6576efca7982ab78d3ad2e1520577c4140,bots.md,.md,content/bots.md,Final,45,3292,2023-03-11 21:25:48,content,.,.,.,.
3,645b45fe6b54ee8cad29b805b2e9258301d9340a,7ae052f95abcdace9ae07e86cb2c3fd2ce47a2b2,chat.md,.md,content/chat.md,Final,14,577,2022-09-25 01:18:33,content,.,.,.,.
4,a57e02fb1497206f4e1f799f46fc01e92f7e9049,07a1b5745669ac5a7969d9eb8f6b2b523858b1ac,AIvsML.md,.md,content/post/AIvsML.md,Final,86,6581,2022-09-07 04:26:37,content,post,.,.,.


In [539]:
# Get Aggregate level data for each file
df_file_commits_agg = df_file_commits.groupby('FilePath').agg(
    num_commits=('FileHash',pd.Series.nunique),
    sum_bytes=('Bytes', 'sum'),
    avg_bytes=('Bytes', 'mean'),
    avg_lines=('Lines', 'mean'),
    min_date=('date', 'min'),
    max_date=('date', 'max'),
    first_author=('AuthorName', 'first'),
    last_author=('AuthorName', 'last'),
    modal_author=('AuthorName', pd.Series.mode),
)
df_file_commits_agg.head()

Unnamed: 0_level_0,num_commits,sum_bytes,avg_bytes,avg_lines,min_date,max_date,first_author,last_author,modal_author
FilePath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
archetypes/default.md,1,84,84.0,6.0,2021-11-05,2021-11-05,Matt Eland,Matt Eland,Matt Eland
content/about.md,11,16608,1277.538462,25.692308,2021-11-05,2023-03-11,Matt Eland,Matt Eland,Matt Eland
content/botproject.md,6,18865,3144.166667,44.666667,2022-09-19,2022-09-21,Matt Eland,Matt Eland,Matt Eland
content/bots.md,2,6572,3286.0,45.0,2022-09-21,2023-03-11,Matt Eland,Matt Eland,Matt Eland
content/chat.md,9,6362,578.363636,12.636364,2022-08-29,2022-09-25,Matt Eland,Matt Eland,Matt Eland


In [540]:
# Merge aggregate data into the file dataset
df_files = df_files.merge(df_file_commits_agg, left_on='FilePath', right_on='FilePath', suffixes=('', ''))
df_files.head(5)

Unnamed: 0,CommitHash,FileHash,Filename,Extension,FilePath,State,Lines,Bytes,CreatedDateUTC,Path1,...,Path5,num_commits,sum_bytes,avg_bytes,avg_lines,min_date,max_date,first_author,last_author,modal_author
0,fb99683d67a0a16dbcf0b56995025602952a8b61,00e77bd79be44872c0b29256b03799c2fb00c10d,default.md,.md,archetypes/default.md,Final,6,84,2021-11-05 00:30:19,archetypes,...,.,1,84,84.0,6.0,2021-11-05,2021-11-05,Matt Eland,Matt Eland,Matt Eland
1,94ab28dda040468e9899933673f87f74d5a8c7b8,32818c64f3d36d812fc69b14101be976f46a3f9f,about.md,.md,content/about.md,Final,32,1870,2023-03-11 21:25:48,content,...,.,11,16608,1277.538462,25.692308,2021-11-05,2023-03-11,Matt Eland,Matt Eland,Matt Eland
2,94ab28dda040468e9899933673f87f74d5a8c7b8,6454ea6576efca7982ab78d3ad2e1520577c4140,bots.md,.md,content/bots.md,Final,45,3292,2023-03-11 21:25:48,content,...,.,2,6572,3286.0,45.0,2022-09-21,2023-03-11,Matt Eland,Matt Eland,Matt Eland
3,645b45fe6b54ee8cad29b805b2e9258301d9340a,7ae052f95abcdace9ae07e86cb2c3fd2ce47a2b2,chat.md,.md,content/chat.md,Final,14,577,2022-09-25 01:18:33,content,...,.,9,6362,578.363636,12.636364,2022-08-29,2022-09-25,Matt Eland,Matt Eland,Matt Eland
4,a57e02fb1497206f4e1f799f46fc01e92f7e9049,07a1b5745669ac5a7969d9eb8f6b2b523858b1ac,AIvsML.md,.md,content/post/AIvsML.md,Final,86,6581,2022-09-07 04:26:37,content,...,.,12,77476,6456.333333,86.833333,2021-11-05,2022-09-07,Matt Eland,Matt Eland,Matt Eland


## Data Visualization

In [541]:
# Declare standard styles here

theme_discrete = px.colors.qualitative.Prism
theme_diverging_neutral = px.colors.diverging.RdYlBu
theme_diverging = px.colors.diverging.Picnic_r
theme_diverging_r = px.colors.diverging.Picnic
theme_sequential = px.colors.sequential.Agsunset
theme_continuous= px.colors.diverging.balance
theme_hot = px.colors.sequential.Reds
theme_cold = px.colors.sequential.Blues

template = 'plotly_dark'

In [542]:
# Utility Formatting functions
def format_and_show_short(fig):
    fig.update_layout(template=template,
                      height=400)
    fig.show()

def format_and_show(fig):
    fig.update_layout(template=template,
                      height=550)
    fig.show()

def format_and_show_tall(fig):
    fig.update_layout(template=template,
                      height=800)
    fig.show()

def format_and_show_3d(fig):
    fig.update_layout(template=template,
                      width=800,
                      height=600)
    fig.show()

def format_and_show_sunburst(fig):
    fig.update_layout(template=template,
                      width=1024,
                      height=800)
    fig.show()

### File Structure

Data visualizations exploring the static structure of the git repository's final state

In [543]:
file_labels = {
    'Path1': 'Project',
    'Path2': 'Area',
    'Lines': 'Lines of Code',
    'Lines_sum': 'Total Lines of Code',
    'num_commits': '# Commits',
}

In [544]:
# Files by File Size
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Lines',
                 title=project_name + ' Largest Files (Lines)',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [545]:
# Files by File Size
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Bytes',
                 title=project_name + ' Largest Files (Bytes)',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [546]:
# Sunburst diagram. Same data as a treemap, but different presentation
fig = px.sunburst(df_files,
                 path=['Path1','Path2','Path3','Filename'],
                 color='Lines',
                 title=project_name + ' Size of Code Files by Project and Directory',
                 hover_data=['FilePath'],
                 color_continuous_scale='sunsetdark',
                 labels=file_labels,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_sunburst(fig)

In [547]:
# Files by Directory Structure
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Path1',
                 title=project_name + ' Project Structure',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [548]:
# Files by Extension
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='Extension',
                 title=project_name + ' File Types',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [549]:
fig = px.histogram(df_files,
                   x="Lines",
                   title=project_name + ' Frequency of File Sizes by Area',
                   color='Path2',
                   labels=file_labels,
                   color_discrete_sequence=theme_discrete)
format_and_show(fig)

In [550]:
# Overall box plot for all code
fig = px.box(df_files,
             title=project_name + ' Lines of Code by Area',
             x='Lines',
             y='Path2',
             color='Path2',
             color_discrete_sequence=theme_discrete,
             labels=file_labels,
             hover_data=['FilePath'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)

In [551]:
# Files by Date Created
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='min_date',
                 title=project_name + ' Files by Creation Date',
                 labels=file_labels,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [552]:
# Files by Date Modified
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='max_date',
                 title=project_name + ' Files by Date Last Modified',
                 labels=file_labels,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [553]:
# Files by Date Modified
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='num_commits',
                 title=project_name + ' Files by # Commits',
                 labels=file_labels,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [585]:
# Sunburst diagram. Same data as a treemap, but different presentation
fig = px.sunburst(df_files,
                 path=['Path1','Path2','Path3','Filename'],
                 color='num_commits',
                 title=project_name + ' # Commits by Project Structure',
                 hover_data=['FilePath'],
                 color_continuous_scale='sunsetdark',
                 labels=file_labels,
                 values='num_commits')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_sunburst(fig)

In [554]:
# Files by Date Modified
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='first_author',
                 title=project_name + ' Files by Creator',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [555]:
# Files by Date Modified
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='last_author',
                 title=project_name + ' Files by Last Modified By',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [556]:
# Files by Date Modified
fig = px.treemap(df_files,
                 path=[px.Constant(project_name),'Path1','Path2','Path3','Filename'],
                 color='modal_author',
                 title=project_name + ' Files by Most Common Author',
                 labels=file_labels,
                 color_discrete_sequence=theme_discrete,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [557]:
df_file_commits_daily = df_file_commits.groupby('date').agg(
    NetLines=('NetLines', 'sum'),
    NumFiles=('FilePath', pd.Series.count),
    NumCommits=('CommitHash', pd.Series.count)
)

df_file_commits_daily.head()

Unnamed: 0_level_0,NetLines,NumFiles,NumCommits
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-11-05,3519,93,93
2021-11-06,4598,59,59
2021-11-07,172,26,26
2021-11-08,94,9,9
2021-11-13,121,30,30


In [558]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_file_commits_daily.index,
            mode='lines+markers',
            marker=dict(
                color=df_file_commits_daily['NumFiles'],
                size=8,
                colorscale=px.colors.diverging.balance
            ),
            y=df_file_commits_daily['NumFiles']))
fig.update_layout(title=project_name + " Files Changed",
                  yaxis_title="# Files Changed",
                  xaxis_title="Date")
format_and_show(fig)

## Commit Graphs

Data visualizations exploring commit patterns

In [559]:
# Replacement Values to make the graphs look nice
commit_labels = {
                     'TotalBytes': 'Bytes',
                     'NumFiles': '# Files',
                     'weekday_name': 'Weekday',
                     'AuthorEmail': 'Author E-Mail',
                     'AuthorDateUTC': 'Date',
                     'AuthorName': 'Author',

                     'net_lines':'Net Lines',
                     'num_deletes': 'Lines Deleted',
                     'num_inserts': 'Lines Added',
                     'num_files': 'Files Modified',
                     'date': 'Date',
                     'datetime': 'Date',
                     'filename': 'File',
                     'message': 'Commit Message',
                     'hash': 'Hash',
                     'author_name': 'Author',
                     'count': 'Count',
                     'avg_net': 'Avg. Net Lines',
                     'num_commits': 'Commits',
                     'num_authors': 'Authors',
                     'sum_net': 'Total Net Lines',
                     'lines': 'Lines of Code',
                     'project': 'Project',
                 }

In [560]:
fig = px.scatter(df_commits, 
                 title= project_name + ' bytes per commit',
                 x='AuthorDateUTC', 
                 y='TotalBytes',
                 color='TotalBytes',
                 color_discrete_sequence=theme_sequential,
                 hover_data=['AuthorName'],
                 labels=commit_labels,
                 hover_name='Message')
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [561]:
fig = px.scatter(df_commits, 
                 title= project_name + ' files per commit',
                 x='AuthorDateUTC', 
                 y='NumFiles',
                 color='NumFiles',
                 color_discrete_sequence=theme_sequential,
                 labels=commit_labels,
                 hover_data=['AuthorName'],
                 hover_name='Message')
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [562]:
fig = px.histogram(df_commits, 
                 title=project_name + ' Commits by Day of Week',
                 x='date', 
                 color='weekday_name',
                 color_discrete_sequence=theme_sequential,
                 labels=commit_labels)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [563]:
fig = px.histogram(df_commits, 
                 title=project_name + ' Commits by Year (Day of Week Colorized)',
                 x='year', 
                 color='weekday_name',
                 color_discrete_sequence=theme_sequential,
                 labels=commit_labels)
fig.update_layout(xaxis_title='Year')
format_and_show_short(fig)

In [564]:
fig = px.histogram(df_commits, 
                 title=project_name + ' Commits by Month (Day of Week Colorized)',
                 x='year-month', 
                 color='weekday_name',
                 color_discrete_sequence=theme_sequential,
                 labels=commit_labels)
fig.update_layout(xaxis_title='Month')
format_and_show_short(fig)

In [565]:
# Determine the length of the project in days. This allows us to bin future graphs by the exact number of days in the project
num_days = (df_commits['date'].max() - df_commits['date'].min()).days
num_days

496

In [566]:
fig = px.histogram(df_commits, 
                 title=project_name + ' Daily Commits (Weekdays Colorized)',
                 x='date', 
                 color='weekday_name',
                 nbins=num_days,
                 color_discrete_sequence=theme_sequential,
                 hover_name='Message',
                 labels=commit_labels)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)

In [567]:
# Let's now remove author from things and just look at aggregate daily totals
df_commits_daily = df_commits.groupby('date').agg(
        num_files=('NumFiles', 'sum'),
        num_authors=('AuthorEmail', pd.Series.nunique),
        num_commits=('CommitHash', pd.Series.nunique),
        sum_inserts=('DeletedFiles', 'sum'),
        sum_deletes=('DeletedFiles', 'sum'),
        sum_total_files=('TotalFiles', 'sum'),
        min_total_files = ('TotalFiles', 'min'), 
        max_total_files=('TotalFiles', 'max'),
        avg_total_files=('TotalFiles', 'mean'),
        min_files = ('NumFiles', 'min'), 
        max_files=('NumFiles', 'max'),
        avg_files=('NumFiles', 'mean'),
        sum_net=('NetLines','sum'),
        avg_net=('NetLines','mean'),
        min_net=('NetLines','min'),
        max_net=('NetLines','max'),
        total_lines=('TotalLines', 'max'),
        min_deletes = ('DeletedFiles', 'min'), 
        max_deletes=('DeletedFiles', 'max'),
        avg_deletes=('DeletedFiles', 'mean'),
        min_inserts = ('AddedFiles', 'min'), 
        max_inserts=('AddedFiles', 'max'),
        avg_inserts=('AddedFiles', 'mean'))

df_commits_monthly = df_commits.groupby('year-month').agg(
        num_files=('NumFiles', 'sum'),
        num_authors=('AuthorEmail', pd.Series.nunique),
        num_commits=('CommitHash', pd.Series.nunique),
        sum_inserts=('DeletedFiles', 'sum'),
        sum_deletes=('DeletedFiles', 'sum'),
        sum_total_files=('TotalFiles', 'sum'),
        min_total_files = ('TotalFiles', 'min'), 
        max_total_files=('TotalFiles', 'max'),
        avg_total_files=('TotalFiles', 'mean'),
        min_files = ('NumFiles', 'min'), 
        max_files=('NumFiles', 'max'),
        avg_files=('NumFiles', 'mean'),
        sum_net=('NetLines','sum'),
        avg_net=('NetLines','mean'),
        min_net=('NetLines','min'),
        max_net=('NetLines','max'),
        total_lines=('TotalLines', 'max'),
        min_deletes = ('DeletedFiles', 'min'), 
        max_deletes=('DeletedFiles', 'max'),
        avg_deletes=('DeletedFiles', 'mean'),
        min_inserts = ('AddedFiles', 'min'), 
        max_inserts=('AddedFiles', 'max'),
        avg_inserts=('AddedFiles', 'mean'))

agg_commit_hover_data = ['sum_inserts', 'sum_deletes', 'min_files', 'min_inserts', 'min_deletes', 'max_files', 'max_inserts', 'max_deletes', 'avg_files', 'avg_inserts','avg_deletes']

In [568]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_commits_daily.index,
            mode='lines+markers',
            marker=dict(
                color=df_commits_daily['sum_net'],
                size=8,
                colorscale=px.colors.diverging.balance
            ),
            y=df_commits_daily['sum_net']))
fig.update_layout(title=project_name + " Daily Net Changes",
                  yaxis_title="Net Change (Lines of Code)",
                  xaxis_title="Date")
format_and_show(fig)

In [569]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_commits_monthly.index,
            mode='lines+markers',
            marker=dict(
                color=df_commits_monthly['sum_net'],
                size=8,
                colorscale=px.colors.diverging.balance
            ),
            y=df_commits_monthly['sum_net']))
fig.update_layout(title=project_name + " Monthly Net Changes",
                  yaxis_title="Net Change (Lines of Code)",
                  xaxis_title="Date")
format_and_show(fig)

In [570]:
fig = px.scatter(df_commits_daily, 
                 title=project_name + ' Daily Commit Counts',
                 x=df_commits_daily.index,
                 y='num_commits', 
                 color='num_commits',
                 hover_data=agg_commit_hover_data,
                 hover_name=df_commits_daily.index,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

fig.update_layout(xaxis_title='Date')

format_and_show(fig)

In [571]:
fig = px.scatter(df_commits_daily, 
                 title=project_name + ' Daily Commit and Author Counts',
                 x=df_commits_daily.index,
                 y='num_commits', 
                 color='num_authors',
                 hover_name=df_commits_daily.index,
                 hover_data=agg_commit_hover_data,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

fig.update_layout(xaxis_title='Date')

format_and_show(fig)

In [572]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_commits_monthly.index,
            mode='lines+markers',
            name='Commits',
            line=dict(
                color='Purple'
            ),
            marker=dict(
                color=df_commits_monthly['num_authors'],
                size=8,
                colorscale=theme_sequential,
                colorbar=dict(
                    title="Authors"
                ),
            ),
            y=df_commits_monthly['num_commits']))

fig.update_layout(xaxis_title='Date',yaxis_title='Commits', title=project_name + " Monthly Commits and Authors")

format_and_show(fig)

In [573]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_commits_monthly.index,
            mode='lines+markers',
            name='Authors',
            line=dict(
                color='Purple'
            ),
            marker=dict(
                color=df_commits_monthly['num_commits'],
                size=8,
                colorscale=theme_sequential,
                colorbar=dict(
                    title="Commits"
                ),
            ),
            y=df_commits_monthly['num_authors']))

fig.update_layout(xaxis_title='Date',yaxis_title='Authors', title=project_name +" Monthly Authors and Commits")

format_and_show(fig)

In [574]:
fig = px.scatter_3d(df_commits_monthly, 
                 title=project_name + ' Monthly Commit and Author Counts',
                 x=df_commits_monthly.index,
                 y='num_commits', 
                 z='num_authors',
                 color='num_authors',
                 hover_name=df_commits_monthly.index,
                 hover_data=agg_commit_hover_data,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

fig.update_layout(xaxis_title='Date')

format_and_show_3d(fig)

In [575]:
fig = px.scatter_3d(df_commits_daily, 
                 title=project_name + ' Daily Commit and Average Files Modified',
                 x=df_commits_daily.index,
                 y='num_commits', 
                 z='avg_files',
                 color='avg_files',
                 hover_data=agg_commit_hover_data,
                 hover_name=df_commits_daily.index,
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)

fig.update_layout(xaxis_title='Date')

format_and_show_3d(fig)

## Authors

Data visualizations exploring author behaviors and tendencies

In [576]:
fig = px.scatter(df_commits, 
                 title=project_name + ' Total Bytes per Commit by Author',
                 x='AuthorDateUTC', 
                 y='TotalBytes',
                 color='AuthorName',
                 color_discrete_sequence=theme_discrete,
                 hover_name='Message',
                 labels=commit_labels)

fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))

format_and_show(fig)

In [577]:
df_attributed = df_commits[df_commits['AuthorName'] != '(no author)']
df_attributed = df_attributed[df_attributed['AuthorName'] != 'unknown']

df_contributor_monthly = df_attributed.groupby(['year-month','AuthorName']).agg(
        count=('CommitHash', pd.Series.nunique),
        sum_files=('NumFiles', 'sum'),
        sum_inserts=('AddedFiles', 'sum'),
        sum_deletes=('DeletedFiles', 'sum')).sort_index(ascending=False)

In [578]:
fig = px.scatter(df_contributor_monthly,
        x=df_contributor_monthly.index.get_level_values(0),
        y='sum_inserts',
        size='count',
        labels=commit_labels,
        color=df_contributor_monthly.index.get_level_values(1),
        color_discrete_sequence=theme_discrete)
fig.update_layout(title=project_name + " Net lines of code by month by author",
                  xaxis_title='Year / Month',
                  yaxis_title='Files Added',
                  legend_title='Author')
format_and_show(fig)

In [579]:
fig = px.scatter(df_contributor_monthly,
                 x=df_contributor_monthly.index.get_level_values(0), 
                 y=df_contributor_monthly.index.get_level_values(1),
                 color=df_contributor_monthly.index.get_level_values(1),
                 size='count',
                 color_discrete_sequence=theme_discrete,
                 labels=commit_labels)
fig.update_layout(title=project_name + " Monthly Contribution History",
                  xaxis_title='Year / Month',
                  yaxis_title='Author',
                  legend_title='Author')
format_and_show(fig)

In [580]:
# Overall box plot for all authors
fig = px.box(df_commits,
             title=project_name + ' bytes per commit by Author',
             x='TotalBytes',
             y='AuthorName',
             color='AuthorName',
             labels=commit_labels,
             color_discrete_sequence=theme_discrete,
             hover_data=['CommitHash','AuthorDateUTC','Message'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)

In [581]:
# Overall box plot for all authors
fig = px.box(df_commits,
             title=project_name + ' net lines of code per commit by Author',
             x='NetLines',
             y='AuthorName',
             color='AuthorName',
             labels=commit_labels,
             color_discrete_sequence=theme_discrete,
             hover_data=['CommitHash','AuthorDateUTC','Message'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)

In [582]:
# Overall box plot for all authors
fig = px.box(df_commits,
             title=project_name + ' # files modified per commit by Author',
             x='NumFiles',
             y='AuthorName',
             color='AuthorName',
             labels=commit_labels,
             color_discrete_sequence=theme_discrete,
             hover_data=['CommitHash','AuthorDateUTC','Message'],
             points='outliers') # Acceptable values: 'all', 'outliers', 'suspectedoutliers', or False
fig.update_traces(quartilemethod='linear', jitter=1)
format_and_show(fig)