# Structural File Analysis
This notebook visualizes the static state of a folder of source files, including its subdirectories. It does not analyze the contents of those files, and it does not perform any historical analysis.

For best results, mark this notebook as trusted so HTML content on graphs can execute.

In [1]:
# Standard Imports and path.
import pandas as pd
import plotly.express as px

# Root directory of the application being analyzed; edit this to point at your own checkout
path = 'C:/dev/OpenRA/'
print(f'Working with files at {path}')

Working with files at C:/dev/OpenRA/


In [2]:
from FileUtilities import get_source_file_metrics

# Build a data frame from the metrics gathered for the source files under `path`
df = pd.DataFrame(data=get_source_file_metrics(path))

# Export the frame so other analysis processes can read it back from disk
df.to_csv('filesizes.csv')

# Sorting is deliberately done AFTER the export, so the CSV keeps the original row order
df = df.sort_values(by='lines', ascending=False)

# Preview the five rows with the highest line counts
df.head(5)

Unnamed: 0,fullpath,project,path,filename,ext,lines
158,OpenRA.Game/Server/Server.cs,OpenRA.Game,Server,Server.cs,.cs,1411
82,OpenRA.Game/Map/Map.cs,OpenRA.Game,Map,Map.cs,.cs,1378
581,OpenRA.Mods.Common/Traits/Air/Aircraft.cs,OpenRA.Mods.Common,Traits/Air,Aircraft.cs,.cs,1314
564,OpenRA.Mods.Common/ServerTraits/LobbyCommands.cs,OpenRA.Mods.Common,ServerTraits,LobbyCommands.cs,.cs,1149
756,OpenRA.Mods.Common/Traits/Mobile.cs,OpenRA.Mods.Common,Traits,Mobile.cs,.cs,1027


In [3]:
# Statistical Analysis
# Summary statistics (count, mean, std, min, quartiles, max) for the line counts in df
df.describe()

Unnamed: 0,lines
count,1362.0
mean,136.984581
std,146.008808
min,18.0
25%,56.25
50%,88.0
75%,160.0
max,1411.0


In [5]:
# Define common variables for plotly
from Theming import theme_discrete, template

In [6]:
# Histogram showing how many files fall into each lines-of-code bucket
fig = px.histogram(
    data_frame=df,
    x="lines",
    labels={'lines': 'Lines of Code'},
    title='Frequency of File Sizes',
    color_discrete_sequence=theme_discrete,
    template=template,
)
fig.show()

In [12]:
# Overall box plot for all code.
# This chart summarizes every file in a single box, so no `y` grouping is applied
# (grouping by project is what the separate by-project chart is for, and a 275px
# tall figure has no room for one box per project). The project of each plotted
# point is still shown in the tooltip through hover_data.
fig = px.box(df,
             title='Distribution of Lines of Code',
             x='lines',
             height=275,
             template=template,
             color_discrete_sequence=theme_discrete,
             hover_data=['project','path','filename'],
             labels={
                 'project': 'Project',
                 'lines': 'Lines of Code',
             },
             points='suspectedoutliers')
# jitter=1 spreads the plotted points across the width of the box so they overlap less
fig.update_traces(quartilemethod='linear', jitter=1)
fig.show()

In [13]:
# Per-project box plot of file sizes, drawing only the suspected outliers as points
fig = px.box(
    data_frame=df,
    x='lines',
    y='project',
    color='project',
    points='suspectedoutliers',
    title='Distribution of Lines of Code by Project',
    labels=dict(project='Project', lines='Lines of Code'),
    hover_data=['path', 'filename'],
    color_discrete_sequence=theme_discrete,
    template=template,
)
fig.update_traces(jitter=0.5, quartilemethod='linear')
# The y axis already names each project, so the legend is turned off
fig.update_layout(width=1200, height=600, showlegend=False)
# Tilt and shrink the project tick labels
fig.update_yaxes(tickfont=dict(size=10), tickangle=30)
fig.show()

In [22]:
# Per-project box plot of file sizes with individual points hidden (boxes only)
fig = px.box(
    data_frame=df,
    x='lines',
    y='project',
    color='project',
    points=False,
    title='Distribution of Lines of Code by Project',
    labels=dict(project='Project', lines='Lines of Code'),
    hover_data=['path', 'filename'],
    color_discrete_sequence=theme_discrete,
    template=template,
)
fig.update_traces(jitter=0.5, quartilemethod='linear')
# The y axis already names each project, so the legend is turned off
fig.update_layout(width=1200, height=600, showlegend=False)
fig.show()

In [26]:
# Per-project box plot of file sizes, drawing every file as an individual point
fig = px.box(
    data_frame=df,
    x='lines',
    y='project',
    color='project',
    points='all',
    title='Distribution of Lines of Code by Project',
    labels=dict(project='Project', lines='Lines of Code'),
    hover_data=['path', 'filename'],
    color_discrete_sequence=theme_discrete,
    template=template,
)
fig.update_traces(jitter=0.5, quartilemethod='linear')
# The y axis already names each project, so the legend is turned off
fig.update_layout(width=1200, height=600, showlegend=False)
fig.show()

In [27]:
# Treemap of file sizes nested as: All Code > project > directory > file.
# Both the tile area and the tile color are driven by the line count.
# Color scales that look good here: matter, icefire, darkmint, picnic, temps, balance, tempo

# TODO: Using Graph Objects here would allow for a more dynamic hierarchy

fig = px.treemap(
    data_frame=df,
    path=[px.Constant('All Code'), 'project', 'path', 'filename'],
    values='lines',
    color='lines',
    color_continuous_scale='balance',
    hover_name='filename',
    hover_data=['fullpath'],
    title='Size of Code Files by Project and Directory',
    template=template,
    height=600,
    width=1200,
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()

In [29]:
# Sunburst diagram: the same project > directory > file hierarchy as the treemap,
# drawn as concentric rings, with line count driving both wedge size and color
fig = px.sunburst(
    data_frame=df,
    path=['project', 'path', 'filename'],
    values='lines',
    color='lines',
    color_continuous_scale='sunsetdark',
    hover_data=['fullpath'],
    title='Size of Code Files by Project and Directory',
    template=template,
    height=800,
    width=1024,
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()