# GitStractor Jupyter Notebook Data Visualization

This notebook serves as a set of examples for visualizing codebases using [GitStractor](https://github.com/integerman/gitstractor) and Jupyter Notebooks.

Contact [Matt Eland](https://MattEland.dev) ([@IntegerMan](https://twitter.com/IntegerMan)) with questions.

## Disclaimer

This is a **Technical Preview** of GitStractor. Its data collection and visualizations have not been formally tested against a wide variety of sources and likely contain inaccuracies.

Additionally, the intent of GitStractor is to help you understand a project at a high level. GitStractor is not intended to aid in any personnel evaluations, hiring or firing, or the performance review process.

The author assumes no liability or responsibility for issues arising from the use of GitStractor, including inaccurate data, bugs, or decisions made based on its output.

## Requirements

This application currently requires:

- CSV files generated by [GitStractor](https://github.com/integerman/gitstractor)
- Jupyter Notebooks running some version of Python (tested using Python 3.8.8)
- The following Python libraries:
  - pandas
  - plotly (provides `plotly.express` and `plotly.graph_objects`)

## Data Loading

In [227]:
# Local import: os is only needed here, to build the CSV paths below
import os

# Project Name shows up in some visualizations
project_name = 'Wherewolf'

# This should point to the location containing the GitStractor CSV files.
# A trailing slash is optional: the file paths below are built with os.path.join.
data_dir = '../wherewolf/'

# These are the default GitStractor file names and shouldn't need to be customized
author_file = os.path.join(data_dir, 'Authors.csv')
commits_file = os.path.join(data_dir, 'Commits.csv')
file_commits_file = os.path.join(data_dir, 'FileCommits.csv')
files_file = os.path.join(data_dir, 'Files.csv')

In [228]:
# Load Dependencies
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Author table: read the raw CSV, rename the 'Id' / 'Name' columns to
# 'AuthorId' / 'Author', and order the rows by author name
df_authors = (
    pd.read_csv(author_file)
    .rename(columns={'Id': 'AuthorId', 'Name': 'Author'})
    .sort_values(by=['Author'])
)

df_authors.head()

Unnamed: 0,AuthorId,Email,Author,Is Bot?,Num Commits,Lines Added,Lines Deleted,Files Added,Files Deleted,Files Modified,First Commit Date UTC,Last Commit Date UTC
0,1,matt.eland@gmail.com,Matt Eland,False,28,5405,1256,77,9,173,04/19/2024 02:07:44,07/22/2024 04:01:25


In [229]:
def add_date_columns(df, dateColName):
    """Add calendar breakdown columns derived from a datetime column.

    The columns 'date', 'year', 'month', 'year-month', 'weekday' and
    'weekday_name' are written onto ``df`` itself, in that order, each one
    computed from ``df[dateColName]``.

    Args:
        df: DataFrame to extend; it is modified in place.
        dateColName: Name of the datetime column in ``df`` to derive from.

    Returns:
        The same ``df`` object that was passed in.
    """
    derivations = (
        ('date', lambda col: col.dt.date),
        ('year', lambda col: col.dt.year),
        ('month', lambda col: col.dt.month),
        # Truncate each timestamp to month resolution
        ('year-month', lambda col: col.to_numpy().astype('datetime64[M]')),
        ('weekday', lambda col: col.dt.weekday),
        ('weekday_name', lambda col: col.dt.strftime("%A")),
    )
    for new_column, derive in derivations:
        df[new_column] = derive(df[dateColName])

    return df

# Commit-level data: parse both timestamp columns, derive the calendar fields
# and the hour from the author date, then replace the numeric author key with
# the author's name and bot flag
df_commits = add_date_columns(
    pd.read_csv(commits_file, parse_dates=['AuthorDateUtc','CommitterDateUtc']),
    'AuthorDateUtc',
)
df_commits['hour'] = df_commits['AuthorDateUtc'].dt.hour
df_commits = (
    df_commits
    .merge(df_authors[['Author','AuthorId','Is Bot?']], on='AuthorId')
    .drop(columns=['AuthorId'])
)

# One row per (commit, file): rename the author key so it can be merged against
# the author table, derive the calendar fields from the commit date, then
# replace the numeric author key with the author's name and bot flag
df_file_commits = add_date_columns(
    pd.read_csv(file_commits_file, parse_dates=['Commit Date Utc'])
    .rename(columns={'Commit Author': 'AuthorId'}),
    'Commit Date Utc',
)
df_file_commits = (
    df_file_commits
    .merge(df_authors[['Author','AuthorId', 'Is Bot?']], on='AuthorId')
    .drop(columns=['AuthorId'])
)

# Keep only commits attributable to a person: drop the placeholder author
# names and every commit whose author is flagged as a bot
df_attributed = df_commits[
    ~df_commits['Author'].isin(['(no author)', 'unknown'])
    & (df_commits['Is Bot?'] == False)
]

# Per author and month: distinct commits plus summed line/file counts,
# ordered newest month first
df_contributor_monthly = (
    df_attributed
    .groupby(['year-month','Author'])
    .agg(
        count=('Sha', pd.Series.nunique),
        sum_net=('Net Lines','sum'),
        sum_files=('Total Files', 'sum'),
        sum_inserts=('Added Files', 'sum'),
        sum_deletes=('Deleted Files', 'sum'),
    )
    .sort_index(ascending=False)
)

# Aggregate commits by unique path.
# Rows are put in commit-date order first so that 'first' / 'last' below pick
# the earliest and latest author of each path.
df_file_commits_agg = (
    df_file_commits
    .sort_values(by='Commit Date Utc')
    .groupby('Path')
    .agg(
        num_commits=('File Sha',pd.Series.nunique),
        sum_work_items=('Work Items', 'sum'),
        avg_lines=('Current Lines', 'mean'),
        avg_lines_added=('Lines Added', 'mean'),
        avg_lines_deleted=('Lines Deleted', 'mean'),
        min_date=('date', 'min'),
        max_date=('date', 'max'),
        bugfixes=('IsBugfix', 'sum'),
        bugfix_probability=('BugfixProbability', 'mean'),
        first_author=('Author', 'first'),
        last_author=('Author', 'last'),
        # Most frequent author for the path (value_counts sorts by frequency)
        modal_author=('Author', lambda authors : authors.value_counts().index[0]),
    )
)

# Aggregate file-level commit rows by date.
#   NetLines   - lines added minus lines deleted that day (previously only the
#                added lines were summed, so it was not a net figure)
#   NumFiles   - distinct file paths touched that day
#   NumCommits - distinct commits that day (previously a plain row count,
#                which always equalled NumFiles)
df_file_commits_daily = (
    df_file_commits
    # Temporary helper column; it is not written back onto df_file_commits
    .assign(_net_lines=df_file_commits['Lines Added'] - df_file_commits['Lines Deleted'])
    .groupby('date')
    .agg(
        NetLines=('_net_lines', 'sum'),
        NumFiles=('Path', 'nunique'),
        NumCommits=('Commit', 'nunique'),
    )
)

# File Data
# Load the file list and attach the per-path commit aggregates (joined on 'Path')
df_files = pd.read_csv(files_file)
df_files = df_files.merge(df_file_commits_agg, left_on='Path', right_on='Path', suffixes=('', ''))
# Replace every missing value, in every column, with '.'
# NOTE(review): presumably this exists for the empty Path2/Path3 levels used as
# chart hierarchy levels later on - confirm; it would also turn a numeric column
# containing NaN into a mixed-type column
df_files = df_files.fillna('.')

# Hide files with 0 lines of code (prevents errors later)
df_files = df_files[df_files['Lines'] > 0]

# Aggregating commits by date ranges
def aggregate_commits(df):
    """Summarise grouped commit rows into one row of statistics per group.

    Args:
        df: Grouped commit data (for example ``df_commits.groupby('date')``)
            exposing the columns 'Author', 'Sha', 'Work Items', 'Total Files',
            'Modified Files', 'Net Lines', 'Total Lines', 'Deleted Files' and
            'Added Files'.

    Returns:
        The result of ``df.agg`` with one named output column per entry of the
        specification below, in the order listed.
    """
    count_distinct = pd.Series.nunique
    summary_spec = {
        'num_authors': ('Author', count_distinct),
        'num_commits': ('Sha', count_distinct),
        'sum_work_items': ('Work Items', 'sum'),
        'total_files': ('Total Files', 'sum'),
        # Modified files per commit
        'num_files': ('Modified Files', 'sum'),
        'min_files': ('Modified Files', 'min'),
        'max_files': ('Modified Files', 'max'),
        'avg_files': ('Modified Files', 'mean'),
        # Net line change per commit
        'sum_net': ('Net Lines', 'sum'),
        'avg_net': ('Net Lines', 'mean'),
        'min_net': ('Net Lines', 'min'),
        'max_net': ('Net Lines', 'max'),
        'total_lines': ('Total Lines', 'max'),
        # Deleted files per commit
        'sum_deletes': ('Deleted Files', 'sum'),
        'min_deletes': ('Deleted Files', 'min'),
        'max_deletes': ('Deleted Files', 'max'),
        'avg_deletes': ('Deleted Files', 'mean'),
        # Added files per commit
        'sum_inserts': ('Added Files', 'sum'),
        'min_inserts': ('Added Files', 'min'),
        'max_inserts': ('Added Files', 'max'),
        'avg_inserts': ('Added Files', 'mean'),
    }
    return df.agg(**summary_spec)

# Commit statistics per calendar day and per calendar month
df_commits_daily = aggregate_commits(df_commits.groupby('date'))
df_commits_monthly = aggregate_commits(df_commits.groupby('year-month'))

# Determine the length of the project in days. This allows us to bin future graphs by the exact number of days in the project
num_days = (df_commits['date'].max() - df_commits['date'].min()).days

# Get top X% of contributors by number of commits.
# Always keep at least one: with fewer than 20 authors, 5% truncates to zero,
# which previously left every "top contributor" frame below empty.
num_top_contributors = max(1, int(len(df_commits['Author'].unique()) * 0.05))
top_contributors = df_commits.value_counts('Author').head(num_top_contributors).index.tolist()

# Filter df_contributor_monthly to only include the top contributors and store it in df_top_contributors_monthly
# (level 1 of its index is the author name)
df_top_contributors_monthly = df_contributor_monthly[df_contributor_monthly.index.get_level_values(1).isin(top_contributors)]
df_top_contributors = df_attributed[df_attributed['Author'].isin(top_contributors)]

### Loaded Data Shape and Metrics

In [230]:
# Preview the first rows of the commit table
df_commits.head()

Unnamed: 0,Sha,ParentSha,Parent2Sha,IsMerge,IsBugfix,BugfixProbability,AuthorDateUtc,CommitterId,CommitterDateUtc,Message,...,Deleted Lines,date,year,month,year-month,weekday,weekday_name,hour,Author,Is Bot?
0,53914f0a9cba77256188630e6870f98494a94e85,,,False,False,2.4e-05,2024-04-19 02:07:44,1,2024-04-19 02:07:44,Added project,...,0,2024-04-19,2024,4,2024-04-01,4,Friday,2,Matt Eland,False
1,a7918ed4426845719f580c4a0c222efa898e69a7,53914f0a9cba77256188630e6870f98494a94e85,,False,True,0.881014,2024-04-19 02:12:08,1,2024-04-19 02:12:08,Fixed path to Wherewolf,...,0,2024-04-19,2024,4,2024-04-01,4,Friday,2,Matt Eland,False
2,b6c5e208d066d227269c70517b096b99ce9634c3,a7918ed4426845719f580c4a0c222efa898e69a7,,False,False,0.000341,2024-04-19 02:25:47,1,2024-04-19 02:25:47,Structuring early application.,...,6,2024-04-19,2024,4,2024-04-01,4,Friday,2,Matt Eland,False
3,ba4f3acb4bc1111109f5f8cd1a0813c794a22fd7,b6c5e208d066d227269c70517b096b99ce9634c3,,False,False,0.001583,2024-04-19 03:24:09,1,2024-04-19 03:24:09,Early game creation and summarization,...,2,2024-04-19,2024,4,2024-04-01,4,Friday,3,Matt Eland,False
4,b5cdfe154be7342f2785d0450efd047e2c1b6295,ba4f3acb4bc1111109f5f8cd1a0813c794a22fd7,,False,False,0.036773,2024-04-19 03:29:43,1,2024-04-19 03:29:43,Good early game display,...,32,2024-04-19,2024,4,2024-04-01,4,Friday,3,Matt Eland,False


In [231]:
# Number of commits for each IsBugfix value
df_commits['IsBugfix'].value_counts()

IsBugfix
False    26
True      2
Name: count, dtype: int64

In [232]:
# Summary statistics for the BugfixProbability column
df_commits['BugfixProbability'].describe()

count    28.000000
mean      0.124351
std       0.242661
min       0.000024
25%       0.002448
50%       0.027466
75%       0.136335
max       0.998457
Name: BugfixProbability, dtype: float64

In [233]:
# Summary statistics for the commit table
df_commits.describe()

Unnamed: 0,BugfixProbability,AuthorDateUtc,CommitterId,CommitterDateUtc,Work Items,Total Files,Modified Files,Added Files,Deleted Files,Total Lines,Net Lines,Added Lines,Deleted Lines,year,month,year-month,weekday,hour
count,28.0,28,28.0,28,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28.0,28,28.0,28.0
mean,0.124351,2024-05-15 22:06:12.964285952,1.0,2024-05-15 22:06:12.964285952,0.0,9.25,6.178571,2.75,0.321429,566.714286,148.178571,193.035714,44.857143,2024.0,4.857143,2024-04-27 00:00:00,4.107143,7.071429
min,2.4e-05,2024-04-19 02:07:44,1.0,2024-04-19 02:07:44,0.0,1.0,0.0,0.0,0.0,37.0,-26.0,6.0,0.0,2024.0,4.0,2024-04-01 00:00:00,0.0,0.0
25%,0.002448,2024-04-20 21:37:25.750000128,1.0,2024-04-20 21:37:25.750000128,0.0,2.0,1.75,0.0,0.0,181.25,48.5,61.25,1.5,2024.0,4.0,2024-04-01 00:00:00,2.0,2.0
50%,0.027466,2024-04-21 18:30:43.500000,1.0,2024-04-21 18:30:43.500000,0.0,6.5,4.0,1.5,0.0,429.0,96.5,127.0,10.0,2024.0,4.0,2024-04-01 00:00:00,4.5,3.0
75%,0.136335,2024-07-11 16:59:38.750000128,1.0,2024-07-11 16:59:38.750000128,0.0,13.25,7.25,4.25,0.0,668.0,180.25,184.75,33.25,2024.0,7.0,2024-07-01 00:00:00,6.0,8.25
max,0.998457,2024-07-22 04:01:25,1.0,2024-07-22 04:01:25,0.0,39.0,29.0,18.0,5.0,2395.0,659.0,837.0,433.0,2024.0,7.0,2024-07-01 00:00:00,6.0,22.0
std,0.242661,,0.0,,0.0,9.379351,7.013498,3.835748,1.020297,552.0858,167.366764,221.066855,97.73271,0.0,1.380131,,2.02465,7.793295


In [234]:
# Preview the first rows of the file table (file list merged with per-path aggregates)
df_files.head()

Unnamed: 0,Commit Sha,File Sha,Lines,Path,Path1,Path2,Path3,Extension,num_commits,sum_work_items,avg_lines,avg_lines_added,avg_lines_deleted,min_date,max_date,bugfixes,bugfix_probability,first_author,last_author,modal_author
0,6ddd16fc77d596215925abd5dce7348f0ec6a3c1,8dd4607a4b3c4f581d2728ef1cd2d15ad42e2ee2,398,.gitignore,.gitignore,.,.,.gitignore,1,0,398.0,398.0,0.0,2024-04-19,2024-04-19,0,0.013007,Matt Eland,Matt Eland,Matt Eland
1,6ddd16fc77d596215925abd5dce7348f0ec6a3c1,f77ef21bdc1694e6af664e73a717beb5804944be,34,MattEland.Wherewolf.sln,MattEland.Wherewolf.sln,.,.,.sln,1,0,34.0,0.0,0.0,2024-04-19,2024-04-19,1,0.881014,Matt Eland,Matt Eland,Matt Eland
2,6ddd16fc77d596215925abd5dce7348f0ec6a3c1,8ef0769eb0ea8f8f0dc2cc9b873580e7775e6de2,19,MattEland.Wherewolf.Benchmarking/MattEland.Whe...,MattEland.Wherewolf.Benchmarking,MattEland.Wherewolf.Benchmarking.csproj,.,.csproj,1,0,19.0,19.0,0.0,2024-04-19,2024-04-19,0,0.002071,Matt Eland,Matt Eland,Matt Eland
3,6ddd16fc77d596215925abd5dce7348f0ec6a3c1,e5dff12bc410be0bac59bc70865a8e21827ec7f3,3,MattEland.Wherewolf.Benchmarking/Program.cs,MattEland.Wherewolf.Benchmarking,Program.cs,.,.cs,1,0,3.0,3.0,0.0,2024-04-19,2024-04-19,0,0.001424,Matt Eland,Matt Eland,Matt Eland
4,6ddd16fc77d596215925abd5dce7348f0ec6a3c1,18ff3f4588d2a150645c85d62ff698dc21853c94,76,MattEland.Wherewolf.Console/DisplayHelpers.cs,MattEland.Wherewolf.Console,DisplayHelpers.cs,.,.cs,3,0,67.333333,30.0,4.666667,2024-04-20,2024-07-10,0,0.025136,Matt Eland,Matt Eland,Matt Eland


In [235]:
# Preview the first rows of the per-file commit table
df_file_commits.head()

Unnamed: 0,Commit,File Sha,Type,Path,Lines Added,Lines Deleted,Current Lines,Commit Date Utc,Commit Message,Work Items,IsBugfix,BugfixProbability,date,year,month,year-month,weekday,weekday_name,Author,Is Bot?
0,53914f0a9cba77256188630e6870f98494a94e85,8dd4607a4b3c4f581d2728ef1cd2d15ad42e2ee2,Added,.gitignore,398,0,398,2024-04-19 02:07:44,Added project,0,False,0.013007,2024-04-19,2024,4,2024-04-01,4,Friday,Matt Eland,False
1,53914f0a9cba77256188630e6870f98494a94e85,791cb45db90ec48323ba2b29def2b63ca73b5574,Added,.idea/.idea.MattEland.Werewolf/.idea/.gitignore,13,0,13,2024-04-19 02:07:44,Added project,0,False,0.009008,2024-04-19,2024,4,2024-04-01,4,Friday,Matt Eland,False
2,53914f0a9cba77256188630e6870f98494a94e85,7b08163cebc50fb3e777eea4881b68fcebc10590,Added,.idea/.idea.MattEland.Werewolf/.idea/indexLayo...,8,0,8,2024-04-19 02:07:44,Added project,0,False,0.00622,2024-04-19,2024,4,2024-04-01,4,Friday,Matt Eland,False
3,53914f0a9cba77256188630e6870f98494a94e85,febde0f8f9c3c8bc3d6bd35ecfd0d21ae1a7f9f2,Added,.idea/.idea.MattEland.Werewolf/.idea/material_...,17,0,17,2024-04-19 02:07:44,Added project,0,False,0.004304,2024-04-19,2024,4,2024-04-01,4,Friday,Matt Eland,False
4,53914f0a9cba77256188630e6870f98494a94e85,f77ef21bdc1694e6af664e73a717beb5804944be,Added,MattEland.Werewolf.sln,34,0,34,2024-04-19 02:07:44,Added project,0,False,0.002993,2024-04-19,2024,4,2024-04-01,4,Friday,Matt Eland,False


In [236]:
# Summary statistics for the per-file commit table
df_file_commits.describe()

Unnamed: 0,Lines Added,Lines Deleted,Current Lines,Commit Date Utc,Work Items,BugfixProbability,year,month,year-month,weekday
count,259.0,259.0,259.0,259,259.0,259.0,259.0,259.0,259,259.0
mean,20.868726,4.849421,61.266409,2024-05-14 04:27:14.212355328,0.0,0.072751,2024.0,4.810811,2024-04-25 14:16:12,4.015444
min,0.0,0.0,0.0,2024-04-19 02:07:44,0.0,2.4e-05,2024.0,4.0,2024-04-01 00:00:00,0.0
25%,2.0,0.0,15.0,2024-04-20 21:26:58,0.0,0.003086,2024.0,4.0,2024-04-01 00:00:00,2.0
50%,8.0,1.0,33.0,2024-04-21 18:42:01,0.0,0.010005,2024.0,4.0,2024-04-01 00:00:00,4.0
75%,25.5,5.0,77.5,2024-07-10 05:53:54,0.0,0.046569,2024.0,7.0,2024-07-01 00:00:00,6.0
max,398.0,55.0,417.0,2024-07-22 04:01:25,0.0,0.998473,2024.0,7.0,2024-07-01 00:00:00,6.0
std,36.995471,9.436677,73.514978,,0.0,0.18468,0.0,1.334878,,1.791386


In [237]:
# Preview the first rows of the author table
df_authors.head()

Unnamed: 0,AuthorId,Email,Author,Is Bot?,Num Commits,Lines Added,Lines Deleted,Files Added,Files Deleted,Files Modified,First Commit Date UTC,Last Commit Date UTC
0,1,matt.eland@gmail.com,Matt Eland,False,28,5405,1256,77,9,173,04/19/2024 02:07:44,07/22/2024 04:01:25


## Data Visualization

In [238]:
# Declare standard styles here

# Qualitative palette for categorical colouring
theme_discrete = px.colors.qualitative.Prism

# Sequential palettes
theme_sequential = px.colors.sequential.Agsunset
theme_hot = px.colors.sequential.Reds
theme_cold = px.colors.sequential.Blues

# Diverging palettes (Picnic is kept in both its reversed and plain form)
theme_diverging_neutral = px.colors.diverging.RdYlBu
theme_diverging = px.colors.diverging.Picnic_r
theme_diverging_r = px.colors.diverging.Picnic
theme_continuous= px.colors.diverging.balance

# Layout template name
template = 'presentation'

# Register the choices above as the Plotly Express defaults
px.defaults.template = template
px.defaults.color_discrete_sequence = theme_discrete
px.defaults.color_continuous_scale = theme_sequential

# Utility Formatting functions
def _resize_and_show(fig, **dimensions):
    """Apply the given layout dimensions to ``fig``, then display it."""
    fig.update_layout(**dimensions)
    fig.show()


def format_and_show_short(fig):
    """Display ``fig`` at a height of 400."""
    _resize_and_show(fig, height=400)


def format_and_show(fig):
    """Display ``fig`` at a height of 550."""
    _resize_and_show(fig, height=550)


def format_and_show_tall(fig):
    """Display ``fig`` at a height of 800."""
    _resize_and_show(fig, height=800)


def format_and_show_3d(fig):
    """Display ``fig`` at a width of 1024 and a height of 800."""
    _resize_and_show(fig, width=1024, height=800)


def format_and_show_sunburst(fig):
    """Display ``fig`` at a width of 1024 and a height of 800."""
    _resize_and_show(fig, width=1024, height=800)

# Display names for file-table columns, passed to charts as labels=
file_labels = {
    # Directory hierarchy levels
    'Path1': 'Project',
    'Path2': 'Area',
    # Size
    'Lines': 'Lines of Code',
    'Lines_sum': 'Total Lines of Code',
    # Commit history
    'num_commits': '# Commits',
    'sum_work_items': '# Work Items in Commits',
    # Bugfix metrics
    'bugfixes': '# Bugfix Commits',
    'bugfix_probability': 'Bugfix Probability',
}

# Replacement Values to make the graphs look nice.
# Keys are column names; values are the text shown on axes, legends and hovers.
commit_labels = {
    'TotalBytes': 'Bytes',
    'NumFiles': '# Files',
    'weekday_name': 'Weekday',
    'AuthorEmail': 'Author E-Mail',
    'AuthorDateUTC': 'Date',
    # The commit table's column is spelled 'AuthorDateUtc'; the key above
    # never matched it, so map the real column name as well
    'AuthorDateUtc': 'Date',
    'AuthorName': 'Author',
    'avg_files': 'Average Files Modified',

    'net_lines': 'Net Lines',
    'num_deletes': 'Lines Deleted',
    'num_inserts': 'Lines Added',
    'num_files': 'Files Modified',
    'date': 'Date',
    'datetime': 'Date',
    'filename': 'File',
    'message': 'Commit Message',
    'hash': 'Hash',
    'author_name': 'Author',
    'count': 'Count',
    'avg_net': 'Avg. Net Lines',
    'num_commits': 'Commits',
    'num_authors': 'Authors',
    'sum_net': 'Total Net Lines',
    'lines': 'Lines of Code',
    'project': 'Project',

    # Aggregate columns shown as hover data. The *_inserts / *_deletes /
    # *_files aggregates are built from the 'Added Files', 'Deleted Files'
    # and 'Modified Files' columns respectively.
    'sum_files': 'Total Files',
    'sum_inserts': 'Total Files Added',
    'sum_deletes': 'Total Files Deleted',
    'min_files': 'Min Files Modified',
    'max_files': 'Max Files Modified',
    'min_inserts': 'Min Files Added',
    'max_inserts': 'Max Files Added',
    'avg_inserts': 'Average Files Added',
    'min_deletes': 'Min Files Deleted',
    'max_deletes': 'Max Files Deleted',
    'avg_deletes': 'Average Files Deleted',
}

# Extra aggregate columns to show when hovering over a daily / monthly data point
agg_commit_hover_data = ['sum_inserts', 'sum_deletes', 'min_files', 'min_inserts', 'min_deletes', 'max_files', 'max_inserts', 'max_deletes', 'avg_files', 'avg_inserts','avg_deletes']

### What?
Exploring the commit trends of the project

In [239]:
# Commits per day; the marker colour repeats the daily commit count
fig = px.scatter(
    df_commits_daily,
    x=df_commits_daily.index,
    y='num_commits',
    color='num_commits',
    hover_name=df_commits_daily.index,
    hover_data=agg_commit_hover_data,
    labels=commit_labels,
    title=f'{project_name} Daily Commit Counts',
)
fig.update_layout(xaxis_title='Date')
format_and_show(fig)

In [240]:
# Commits per day, coloured by how many distinct authors committed that day
fig = px.scatter(
    df_commits_daily,
    x=df_commits_daily.index,
    y='num_commits',
    color='num_authors',
    hover_name=df_commits_daily.index,
    hover_data=agg_commit_hover_data,
    labels=commit_labels,
    title=f'{project_name} Daily Commit and Author Counts',
)
fig.update_layout(xaxis_title='Date')
format_and_show(fig)

In [241]:
# Monthly commit counts as a line-and-marker trace; each marker is additionally
# coloured by that month's commit count
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_commits_monthly.index,
            mode='lines+markers',
            name='Commits',
            line=dict(
                color='Purple'
            ),
            marker=dict(
                color=df_commits_monthly['num_commits'],
                size=8,
                colorscale=theme_sequential,
                colorbar=dict(
                    # Marker colour encodes num_commits, so the bar is titled
                    # "Commits" (it was mislabelled "Authors")
                    title="Commits"
                ),
            ),
            y=df_commits_monthly['num_commits']))

fig.update_layout(xaxis_title='Date',yaxis_title='Commits', title=project_name + " Monthly Commits")

format_and_show(fig)

In [242]:
# One point per commit: author date against the commit's 'Total Files' value,
# which also drives the colour
fig = px.scatter(
    df_commits,
    x='AuthorDateUtc',
    y='Total Files',
    color='Total Files',
    hover_name='Message',
    hover_data=['Author'],
    labels=commit_labels,
    title=f'{project_name} Daily Files per Commit',
)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [243]:
# One point per commit: author date against files modified, coloured by the
# commit's IsBugfix flag
fig = px.scatter(
    df_commits,
    x='AuthorDateUtc',
    y='Modified Files',
    color='IsBugfix',
    color_discrete_sequence=px.colors.qualitative.Bold,
    hover_name='Message',
    hover_data=['Author'],
    labels=commit_labels,
    title=f'{project_name} Commits by Bugfix Status',
)
fig.update_layout(xaxis_title='Date')
format_and_show_short(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [244]:
# Net line change per month; the markers are coloured by the same value on a
# diverging scale
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_commits_monthly.index,
        y=df_commits_monthly['sum_net'],
        mode='lines+markers',
        marker={
            'color': df_commits_monthly['sum_net'],
            'size': 8,
            'colorscale': px.colors.diverging.balance,
        },
    )
)
fig.update_layout(
    title=f"{project_name} Monthly Net Changes",
    xaxis_title="Date",
    yaxis_title="Net Change (Lines of Code)",
)
format_and_show(fig)

In [245]:
# Bar chart of how many files exist per extension, largest count first
fig = px.bar(
    df_files
    .groupby('Extension')
    .agg({'Path': 'count'})
    .sort_values(by='Path', ascending=False)
    .reset_index(),
    x='Extension',
    y='Path',
    color='Path',
    labels=file_labels,
    title=f'{project_name} File Count by Extension',
)
fig.update_layout(
    xaxis_title='File Extension',
    yaxis_title='# Files',
    legend_title_text='# Files',
    margin={'b': 200},
)
fig.layout.coloraxis.colorbar.title = '# Files'

format_and_show(fig)

In [246]:
# Generate a bar chart of the average lines of code by extension, sorted by that average
# (each file's avg_lines value is averaged again across the files of the extension)
fig = px.bar(df_files.groupby('Extension').agg({'avg_lines': 'mean'}).sort_values(by='avg_lines', ascending=False).reset_index(),
                title=project_name + ' Average Lines of Code by Extension',
                x='Extension',
                y='avg_lines',
                color='avg_lines',
                labels=file_labels)

# Colour encodes avg_lines, so the legend and colour bar are titled to match
# (they were copy-pasted as '# Files')
fig.update_layout(xaxis_title='File Extension', yaxis_title='Average Lines', margin=dict(b=200), legend_title_text='Average Lines')
fig.layout.coloraxis.colorbar.title = 'Average Lines'

format_and_show(fig)

### When?
What trends are there for when commits occur?

In [247]:
# Commits per month, stacked and coloured by day of week. Rows are sorted by
# the numeric weekday first so the weekday colours appear in calendar order.
fig = px.histogram(
    df_commits.sort_values(by='weekday'),
    x='year-month',
    color='weekday_name',
    color_discrete_sequence=theme_sequential,
    labels=commit_labels,
    title=f'{project_name} Commits by Month (Day of Week Colorized)',
)
fig.update_layout(xaxis_title='Month')
format_and_show_short(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [248]:
# Heatmap of commit density by weekday (x) and hour of day (y). Rows are sorted
# by the numeric weekday first so the weekday axis is in calendar order.
# The figure is assigned and shown through format_and_show, like every other
# chart in this notebook, so it gets the same standard height.
fig = px.density_heatmap(df_commits.sort_values(by='weekday'),
                 title=project_name + ' Commits by Hour of Day and Day of Week',
                 x='weekday_name',
                 y='hour',
                 color_continuous_scale=theme_sequential,
                 labels=commit_labels)
format_and_show(fig)

### Who?
Who is working on this project? When have they entered and departed the project?

In [249]:
# Distinct authors per month as a line-and-marker trace; the marker colour
# shows how many commits were made that month
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_commits_monthly.index,
        y=df_commits_monthly['num_authors'],
        mode='lines+markers',
        name='Authors',
        line={'color': 'Purple'},
        marker={
            'color': df_commits_monthly['num_commits'],
            'size': 8,
            'colorscale': theme_sequential,
            'colorbar': {'title': "Commits"},
        },
    )
)

fig.update_layout(
    title=f"{project_name} Monthly Authors and Commits",
    xaxis_title='Date',
    yaxis_title='Authors',
)

format_and_show(fig)

In [250]:
# 3D scatter of month (x) against commit count (y) and author count (z),
# coloured by author count
fig = px.scatter_3d(df_commits_monthly,
                 title=project_name + ' Monthly Commit and Author Counts',
                 x=df_commits_monthly.index,
                 y='num_commits',
                 z='num_authors',
                 color='num_authors',
                 hover_name=df_commits_monthly.index,
                 hover_data=agg_commit_hover_data,
                 labels=commit_labels)

# In a 3D figure the axes belong to layout.scene; a top-level xaxis_title is
# not applied to the 3D x axis
fig.update_layout(scene=dict(xaxis=dict(title='Date')))

format_and_show_3d(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [251]:
# Commit histogram over time, coloured by author. The bin count is the project
# length in whole weeks, so each bar covers roughly one week.
fig = px.histogram(
    df_commits,
    x='date',
    nbins=num_days//7,
    color='Author',
    color_discrete_sequence=theme_discrete,
    hover_name='Message',
    labels=commit_labels,
    title=f'{project_name} Weekly Commits (Author Colorized)',
)
fig.update_layout(xaxis_title='Date')
format_and_show(fig)

In [252]:
# Files added per author per month; bubble size is the author's commit count
# for that month. Index level 0 is the month, level 1 the author.
fig = px.scatter(
    df_contributor_monthly,
    x=df_contributor_monthly.index.get_level_values(0),
    y='sum_inserts',
    size='count',
    color=df_contributor_monthly.index.get_level_values(1),
    color_discrete_sequence=theme_discrete,
    hover_data=['sum_net', 'sum_files', 'sum_inserts', 'sum_deletes', 'count'],
    labels=commit_labels,
)
fig.update_layout(
    title=f"{project_name} Files created by month by author, sized by # commits",
    xaxis_title='Year / Month',
    yaxis_title='Files Created',
    legend_title='Author',
)
format_and_show(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [253]:
# Net line change per author per month; bubble size is the author's commit
# count for that month. Index level 0 is the month, level 1 the author.
fig = px.scatter(
    df_contributor_monthly,
    x=df_contributor_monthly.index.get_level_values(0),
    y='sum_net',
    size='count',
    color=df_contributor_monthly.index.get_level_values(1),
    color_discrete_sequence=theme_discrete,
    hover_data=['sum_net', 'sum_files', 'sum_inserts', 'sum_deletes', 'count'],
    labels=commit_labels,
)
fig.update_layout(
    title=f"{project_name} Net lines of code by month by author, sized by # commits",
    xaxis_title='Year / Month',
    yaxis_title='Net Lines',
    legend_title='Author',
)
format_and_show(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [254]:
# One row per author, one bubble per month the author was active; bubble size
# is the commit count. Index level 0 is the month, level 1 the author.
fig = px.scatter(
    df_contributor_monthly,
    x=df_contributor_monthly.index.get_level_values(0),
    y=df_contributor_monthly.index.get_level_values(1),
    size='count',
    color=df_contributor_monthly.index.get_level_values(1),
    color_discrete_sequence=theme_discrete,
    labels=commit_labels,
)
fig.update_layout(
    title=f"{project_name} Monthly Contribution History",
    xaxis_title='Year / Month',
    yaxis_title='',
    legend_title='Author',
    margin={'l': 200},  # Reserve more horizontal space for the y axis labels
)

format_and_show_tall(fig)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



### Where?
How is the code organized? What can we tell about each area?

In [255]:
# Hierarchy used by the treemap / sunburst charts: a constant root named after
# the project, then the three path levels, then the full file path
hierarchy = [px.Constant(project_name)] + ['Path1', 'Path2', 'Path3', 'Path']

# Files by File Size: tile area and colour both come from the line count
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='Lines',
    labels=file_labels,
    title=f'{project_name} Largest Files (Lines)',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [256]:
# Sunburst view of the same file-size data as the treemap: wedge size and
# colour both come from the line count
fig = px.sunburst(
    df_files,
    path=hierarchy,
    values='Lines',
    color='Lines',
    color_continuous_scale='sunsetdark',
    hover_data=['Path'],
    labels=file_labels,
    title=f'{project_name} Size of Code Files by Project and Directory',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_sunburst(fig)

In [257]:
# Files by Extension: tiles sized by line count, coloured by file extension
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='Extension',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
    title=f'{project_name} File Types',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [258]:
# Distribution of file sizes (lines of code), coloured by file extension
fig = px.histogram(
    df_files,
    x="Lines",
    color='Extension',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
    title=f'{project_name} Frequency of File Sizes by Extension',
)
format_and_show(fig)

In [259]:
# Files by Bugfix count: tiles sized by line count, coloured by the number of
# bugfix commits that touched the file
fig = px.treemap(df_files,
                 path=hierarchy,
                 color='bugfixes',
                 title=project_name + ' Files by Bugfix Count',
                 labels=file_labels,
                 # 'bugfixes' is a numeric column, so the colour scale is the
                 # continuous one; the discrete sequence is kept only as a
                 # fallback in case the column is ever treated as categorical
                 color_continuous_scale=theme_sequential,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [260]:
# Files by Bugfix probability: tiles sized by line count, coloured by the mean
# bugfix probability of the commits that touched the file
fig = px.treemap(df_files,
                 path=hierarchy,
                 color='bugfix_probability',
                 title=project_name + ' Files by Commit Bugfix Probability',
                 labels=file_labels,
                 # 'bugfix_probability' is a numeric column, so the colour scale
                 # is the continuous one; the discrete sequence is kept only as
                 # a fallback in case the column is ever treated as categorical
                 color_continuous_scale=theme_sequential,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [261]:
# Files by Date Created: tiles sized by line count, coloured by the date of
# the earliest commit that touched the file
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='min_date',
    color_discrete_sequence=theme_sequential,
    labels=file_labels,
    title=f'{project_name} Files by Creation Date',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [262]:
# Files by Date Modified: tiles sized by line count, coloured by the date of
# the latest commit that touched the file
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='max_date',
    color_discrete_sequence=theme_sequential,
    labels=file_labels,
    title=f'{project_name} Files by Date Last Modified',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [263]:
# Files by number of commits: tiles sized by line count, coloured by the
# file's num_commits value
fig = px.treemap(df_files,
                 path=hierarchy,
                 color='num_commits',
                 title=project_name + ' Files by # Commits',
                 labels=file_labels,
                 # 'num_commits' is a numeric column, so the colour scale is the
                 # continuous one; the discrete sequence is kept only as a
                 # fallback in case the column is ever treated as categorical
                 color_continuous_scale=theme_sequential,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [264]:
# Sunburst of the project structure where both wedge size and colour come from
# the file's num_commits value
fig = px.sunburst(
    df_files,
    path=hierarchy,
    values='num_commits',
    color='num_commits',
    color_continuous_scale='sunsetdark',
    hover_data=['Path'],
    labels=file_labels,
    title=f'{project_name} # Commits by Project Structure',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_sunburst(fig)

In [265]:
# Files by work items: tiles sized by line count, coloured by the summed
# 'Work Items' value of the commits that touched the file
fig = px.treemap(df_files,
                 path=hierarchy,
                 color='sum_work_items',
                 title=project_name + ' Files by # Work Item Commits',
                 labels=file_labels,
                 # 'sum_work_items' is a numeric column, so the colour scale is
                 # the continuous one; the discrete sequence is kept only as a
                 # fallback in case the column is ever treated as categorical
                 color_continuous_scale=theme_sequential,
                 color_discrete_sequence=theme_sequential,
                 values='Lines')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
format_and_show_tall(fig)

In [266]:
# Files by Creator: tiles sized by line count, coloured by the author of the
# earliest commit that touched the file
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='first_author',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
    title=f'{project_name} Files by Creator',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [267]:
# Files by Last Modified By: tiles sized by line count, coloured by the author
# of the latest commit that touched the file
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='last_author',
    color_discrete_sequence=theme_discrete,
    hover_data=['Commit Sha'],
    labels=file_labels,
    title=f'{project_name} Files by Last Modified By',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [268]:
# Files by Most Common Author: tiles sized by line count, coloured by the
# author who appears most often in the file's commits
fig = px.treemap(
    df_files,
    path=hierarchy,
    values='Lines',
    color='modal_author',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
    title=f'{project_name} Files by Most Common Author',
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
format_and_show_tall(fig)

In [269]:
# Sunburst rooted at the most common author of each file, then drilling down
# through the path levels; wedges are sized by line count
fig = px.sunburst(
    df_files,
    path=['modal_author', 'Path1', 'Path2', 'Path3', 'Path'],
    values='Lines',
    color='modal_author',
    color_discrete_sequence=theme_discrete,
    labels=file_labels,
    title=f'{project_name} Files by Most Common Author',
)
format_and_show_sunburst(fig)