
# File Data Extraction
This notebook focuses on building a CSV file out of the raw file structure on disk in a given directory

In [203]:
# Import required libraries
import pandas as pd
import os

In [204]:
# Common source file extensions. This list is very incomplete. Intentionally not including JSON / XML
source_extensions = [
    '.cs', 
    '.vb', 
    '.java', 
    '.r', 
    '.agc', 
    '.fs', 
    '.js', 
    '.cpp', 
    '.go', 
    '.aspx', 
    '.jsp', 
    '.do', 
    '.php', 
    '.ipynb', 
    '.sh', 
    '.html', 
    '.lua', 
    '.css',
    '.md',
    '.sln',
    '.txt'
    ]

def is_source_file(file_label):
    """
    Defines what a source file is.
    """
    file, _, _ = file_label
    _, ext = os.path.splitext(file)

    return ext.lower() in source_extensions

In [205]:
def count_lines(path):
    """
    Reads the file at the specified path and returns the number of lines in that file
    Code taken from https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
    """

    def _make_gen(reader):
        b = reader(2 ** 16)
        while b:
            yield b
            b = reader(2 ** 16)

    with open(path, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))

    return count

In [206]:
def get_file_metrics(files, root):
    """
    This function gets all metrics for the files and returns them as a list of file detail objects
    """
    results = []

    for file, folders, relative_path in files:
        lines = count_lines(file) # Slow as it actually reads the file
        _, filename = os.path.split(file)
        _, ext = os.path.splitext(filename)

        fullpath = ''

        if folders != None and len(folders) > 0:
            project = folders[0]
            for folder in folders[1:]:
                if len(fullpath) > 0:
                    fullpath += '/'
                fullpath += folder
        else:
            project = ''

        if len(fullpath) <= 0:
            fullpath = '.'

        id = root + project + '/' + fullpath + '/' + filename

        if relative_path == '':
            rel_path = filename
        else:
            rel_path = relative_path + filename

        file_details = {
                        'fullpath': id,
                        'root': root,
                        'project': project,
                        'path': fullpath,
                        'relative_path': rel_path,
                        'filename': filename,
                        'ext': ext,
                        'lines': lines,
                        }
        results.append(file_details)

    return results

In [207]:
# Some directories should be ignored as their output is not helpful
ignored_directories = ['.git', '.vs', 'obj', 'ndependout', 'bin', 'debug']

def get_file_list(dir_path, relative_path, paths=None):
    """
    Gets a list of files in this this directory and all contained directories
    """

    files = list()
    contents = os.listdir(dir_path)

    for entry in contents:
        path = os.path.join(dir_path, entry)
        if os.path.isdir(path):
            # Ignore build and reporting directories
            if entry.lower() in ignored_directories:
                continue

            if relative_path == '':
                new_path = entry
            else:
                new_path = relative_path + '\\' + entry

            # Maintain an accurate, but separate, hierarchy array
            if paths is None:
                p = [entry]
            else:
                p = paths[:]

            files = files + get_file_list(path, new_path, p)
        else:
            files.append((path, paths, relative_path))
    return files

In [208]:
def get_source_file_metrics(root):
    """
    This function gets all source files and metrics associated with them from a given path
    """
    source_files = filter(is_source_file, get_file_list(root, ''))
    return get_file_metrics(list(source_files), root)

In [209]:
# Pull Source file metrics
files = []
root = 'C:/Dev/MachineLearning/'

files = get_source_file_metrics(root)

print(str(len(files)) + " files read")

2906 files read


In [210]:
# Grab the file metrics for source files and put them into a data frame
df = pd.DataFrame(files)
df = df.sort_values('lines', ascending=False)

# Write to a file for other analyses processes
df.to_csv('FileSizes.csv')

# Preview the data
df.head()

Unnamed: 0,fullpath,root,project,path,relative_path,filename,ext,lines
1405,C:/Dev/MachineLearning/src/./dict.txt,C:/Dev/MachineLearning/,src,.,src\Microsoft.ML.TorchSharp\Resourcesdict.txt,dict.txt,.txt,50260
2033,C:/Dev/MachineLearning/test/./Microsoft.ML.Tra...,C:/Dev/MachineLearning/,test,.,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastForestRegre...,.txt,39544
2035,C:/Dev/MachineLearning/test/./Microsoft.ML.Tra...,C:/Dev/MachineLearning/,test,.,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastTreeTweedie...,.txt,39534
2034,C:/Dev/MachineLearning/test/./Microsoft.ML.Tra...,C:/Dev/MachineLearning/,test,.,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastTreeRegress...,.txt,39524
2037,C:/Dev/MachineLearning/test/./Microsoft.ML.Tra...,C:/Dev/MachineLearning/,test,.,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.LightGbm.LightGbmRegress...,.txt,38484
