
# File Data Extraction
This notebook builds a CSV file describing the source files found in the raw file structure on disk under one or more given directories.

In [95]:
# Import required libraries
import pandas as pd
import os

In [96]:
# Each entry should point to a root directory of the application you're analyzing
paths = [
    'C:/Dev/MachineLearning/src',
    'C:/Dev/MachineLearning/test',
]
paths

['C:\\Dev\\MachineLearning\\src', 'C:\\Dev\\MachineLearning\\test']

In [97]:
# Extensions (lower case, leading dot included) that are treated as source code.
# The list is deliberately partial; JSON / XML are intentionally left out.
source_extensions = [
    '.cs', '.vb', '.java', '.r', '.agc', '.fs',
    '.js', '.cpp', '.go', '.aspx', '.jsp', '.do',
    '.php', '.ipynb', '.sh', '.html', '.lua', '.css',
]

def is_source_file(file_label):
    """
    Tells whether a (file path, folders) pair refers to a source file.

    Only the first element of the pair is inspected: its extension is compared,
    case-insensitively, against the entries of source_extensions.
    """
    filename, _folders = file_label
    extension = os.path.splitext(filename)[1]

    return extension.lower() in source_extensions

In [98]:
def count_lines(path):
    """
    Reads the file at the specified path and returns the number of lines in that file.

    The file is read in binary mode in fixed-size chunks so large files are never
    loaded into memory at once. A line is anything terminated by a newline byte;
    a final line that is not terminated by a newline is counted as well, so a
    file containing b"a\\nb" has 2 lines and an empty file has 0.

    Chunked counting approach taken from
    https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
    """
    chunk_size = 2 ** 16
    count = 0
    last_byte = b""

    with open(path, "rb") as f:
        # iter(callable, sentinel) keeps reading until read() returns b"" at end of file
        for buf in iter(lambda: f.read(chunk_size), b""):
            count += buf.count(b"\n")
            last_byte = buf[-1:]

    # Counting newline bytes alone misses a trailing line without a terminator
    if last_byte and last_byte != b"\n":
        count += 1

    return count

In [99]:
def get_file_metrics(files, root):
    """
    Gets all metrics for the given files and returns them as a list of file detail dictionaries.

    Parameters:
        files: iterable of (file path, folders) pairs. folders is the list of
            directory names leading to the file; its first entry is reported as
            the project and the remaining entries form the path. None or an
            empty list yields an empty project.
        root: value copied into every record and used as the prefix of the
            'fullpath' identifier.

    Returns:
        A list with one dictionary per file, holding the keys 'fullpath', 'root',
        'project', 'path', 'filename', 'ext' and 'lines' (in that order).
    """
    results = []

    for file, folders in files:
        lines = count_lines(file) # Slow as it actually reads the file
        filename = os.path.basename(file)
        _, ext = os.path.splitext(filename)

        if folders:
            # First folder is the project, everything below it is the relative path
            project = folders[0]
            relative_path = '/'.join(folders[1:])
        else:
            project = ''
            relative_path = ''

        # Files sitting directly in the project (or root) folder get '.' as their path
        if not relative_path:
            relative_path = '.'

        # NOTE: with an empty project this produces '<root>//./<filename>'
        file_id = root + '/' + project + '/' + relative_path + '/' + filename

        results.append({
            'fullpath': file_id,
            'root': root,
            'project': project,
            'path': relative_path,
            'filename': filename,
            'ext': ext,
            'lines': lines,
        })

    return results

In [100]:
# Directory names (matched in lower case) that are skipped because their output is not helpful
ignored_directories = ['.git', '.vs', 'obj', 'ndependout', 'bin', 'debug']

def get_file_list(dir_path, paths=None):
    """
    Gets a list of the files in this directory and in all contained directories.

    Every result is a (file path, folders) pair, where folders is the list of
    directory names descended through to reach the file, or the value of
    paths (None by default) for files located directly in dir_path.
    """
    found = []

    for name in os.listdir(dir_path):
        full_path = os.path.join(dir_path, name)

        # Plain files are recorded together with the hierarchy of their directory
        if not os.path.isdir(full_path):
            found.append((full_path, paths))
            continue

        # Build and reporting directories are not descended into
        if name.lower() in ignored_directories:
            continue

        # Each sub-directory gets its own copy of the hierarchy, extended by its name
        hierarchy = [name] if paths is None else paths + [name]
        found.extend(get_file_list(full_path, hierarchy))

    return found

In [101]:
def get_source_file_metrics(path):
    """
    Finds every source file below the given path and returns the metrics gathered for them.

    The path is also handed to get_file_metrics as the root of the scan.
    """
    candidates = get_file_list(path)
    source_files = [entry for entry in candidates if is_source_file(entry)]

    return get_file_metrics(source_files, path)

In [102]:
# Collect the source file metrics of every configured root into one flat list
files = []
for path in paths:
    files += get_source_file_metrics(path)

print(str(len(files)) + " files read")

1476 files read


In [103]:
# Load the collected file metrics into a data frame, largest files first
df = pd.DataFrame(files).sort_values(by='lines', ascending=False)

# Persist the metrics so that other analysis processes can pick them up
df.to_csv('filesizes.csv')

# Show the first rows as a quick preview
df.head()

Unnamed: 0,fullpath,root,project,path,filename,ext,lines
32,C:\Dev\MachineLearning\src\Microsoft.Data.Anal...,C:\Dev\MachineLearning\src,Microsoft.Data.Analysis,.,PrimitiveDataFrameColumn.BinaryOperationAPIs.E...,.cs,15886
35,C:\Dev\MachineLearning\src\Microsoft.Data.Anal...,C:\Dev\MachineLearning\src,Microsoft.Data.Analysis,.,PrimitiveDataFrameColumn.BinaryOperators.cs,.cs,7397
1249,C:\Dev\MachineLearning\test\Microsoft.ML.Core....,C:\Dev\MachineLearning\test,Microsoft.ML.Core.Tests,UnitTests,TestEntryPoints.cs,.cs,6904
40,C:\Dev\MachineLearning\src\Microsoft.Data.Anal...,C:\Dev\MachineLearning\src,Microsoft.Data.Analysis,.,PrimitiveDataFrameColumnArithmetic.cs,.cs,6863
830,C:\Dev\MachineLearning\src\Microsoft.ML.OnnxCo...,C:\Dev\MachineLearning\src,Microsoft.ML.OnnxConverter,.,OnnxMl.cs,.cs,5945
