In [1]:
import os
from collections import defaultdict, Counter
from itertools import combinations
from difflib import SequenceMatcher

import json
import pandas as pd
from tqdm import tqdm_notebook as tqdm


from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
PROJECT_NAME = 'linux_ckl'
PROJECT_PATH = os.getcwd().rsplit(f'/{PROJECT_NAME}', 1)[0] + f'/{PROJECT_NAME}'

LINUX_REPO_BASE = os.path.join(PROJECT_PATH, 'repo')
SNAPSHOTS_BASE = os.path.join(PROJECT_PATH, 'snapshots')
DATA_IN = os.path.join(PROJECT_PATH, 'data_in')
os.makedirs(DATA_IN, exist_ok=True)

In [3]:
VERSIONS = {
#     'v2.6.12-rc2':'2005-04-16 15:20:36 -0700',
#     'v2.6.21' : '2007-04-25 20:08:32 -0700',
    'v2.6.31':'2009-09-09 15:13:59 -0700',
    'v3.0':'2011-07-21 19:17:23 -0700',    
#     'v3.11':'2013-09-02 13:46:10 -0700',        
    'v4.0':'2015-04-12 15:12:50 -0700',            
#     'v4.13':'2017-09-03 13:56:17 -0700',
    'v5.0':'2019-03-03 15:21:29 -0800'
}

In [4]:
def generate_diff_snapshots():
    return [(v1, v2) for v1, v2 in  combinations(VERSIONS.keys(), 2)]

        
DIFFS = generate_diff_snapshots()
DIFFS

[('v2.6.31', 'v3.0'),
 ('v2.6.31', 'v4.0'),
 ('v2.6.31', 'v5.0'),
 ('v3.0', 'v4.0'),
 ('v3.0', 'v5.0'),
 ('v4.0', 'v5.0')]

In [5]:
def tool_file_path(version, tool):
    return os.path.join(DATA_IN, f'{version}.{tool}')

def snap_folder_path(version):
    return os.path.join(SNAPSHOTS_BASE, version)    

In [6]:
def log_collect_msg(msg):
    print(msg)

In [7]:
def profile_decorator(function):
    def wrapper(input_folder, output_file, overwrite = False):
        assert os.path.exists(input_folder) 
        assert os.path.isdir(input_folder) 

        fname = function.__name__
        
        if os.path.exists(output_file) and not overwrite:
            log_collect_msg(f'{fname} SKIP existing file: {output_file}')
            return 

        log_collect_msg(f'{fname} is about to create {output_file} for folder {input_folder}')
        function(input_folder, output_file)
        
        if os.path.exists(output_file):
            log_collect_msg(f'{fname} has created an output file. Size: {os.stat(output_file).st_size} bytes.')
        else:
            log_collect_msg(f'{fname} has FAILED to create output file')        
            
    return wrapper

In [8]:
def enumerate_folder_tidy_paths(input_folder, extensions_list = None):
    for root, subdirs, files in os.walk(input_folder):
        for f in files:
            _, ext = os.path.splitext(f)
            if extensions_list and ext.lower() not in extensions_list: 
                continue
            rel_path = os.path.relpath(root, input_folder)
            yield os.path.join(rel_path, f).lower(), os.path.join(rel_path, f)

In [9]:
def get_file_lines(f):
    with open(f, encoding='latin-1', errors='ignore') as fin:
        lines = fin.readlines()
        empty_lines_count = sum([1 for l in lines if 0 == len(l.strip())])
        return len(lines), empty_lines_count
    

@profile_decorator
def collect_lines(input_folder, output_file, overwrite = False):
    def _collect(input_folder):
        for tidy_path, raw_path in enumerate_folder_tidy_paths(input_folder):
            lines, empty_lines = get_file_lines(os.path.join(input_folder, raw_path))
            yield (tidy_path, lines, empty_lines)

    df = pd.DataFrame(data=_collect(input_folder), 
                      columns=['tidy_path', 'lines_count', 'empty_lines_count'])

    df.to_csv(output_file, sep=';')

In [10]:
def collect_diff_pairs(left_folder, right_folder, output_file,
                       exclude_exact_match, extensions_list, similar_exts, 
                       func_compare_paths, overwrite = False):
    assert os.path.exists(left_folder) and os.path.isdir(left_folder) 
    assert os.path.exists(right_folder) and os.path.isdir(right_folder) 

    if os.path.exists(output_file) and not overwrite:
        log_collect_msg('collect_diff_pairs SKIP existing file ' + output_file)
        return 
    
    log_collect_msg(f'collect_diff_pairs is about to create {output_file}')
    
    left_files = dict(enumerate_folder_tidy_paths(left_folder, extensions_list))
    right_files = dict(enumerate_folder_tidy_paths(right_folder, extensions_list))
    
    def is_similar_ext(ext1, ext2): return any([ext1 in sim_set and ext2 in sim_set for sim_set in similar_exts])    

    def choose_left_candidate(right_file, left_files_names):
        right_fname, right_ext = os.path.splitext(os.path.basename(right_file))
        
        best_ratio, best_left = 0, None
        for left_file in sorted(left_files_names[right_fname]):
            if left_file == right_file:
                if exclude_exact_match:
                    return 0, None
                else:
                    return 1, left_file

            left_fname, left_ext = os.path.splitext(os.path.basename(left_file))
            if not is_similar_ext(left_ext, right_ext):
                continue

            candidate_ratio = func_compare_paths(left_file, left_fname, left_ext, right_file, right_fname, right_ext)
            if candidate_ratio > best_ratio: 
                best_ratio, best_left = candidate_ratio, left_file
        
        return best_ratio, best_left

    def match_right_files(left_files, right_files):
        left_files_names = defaultdict(list)
        for f in left_files:
            fname, _ = os.path.splitext(os.path.basename(f))
            left_files_names[fname].append(f)

        for right_file in tqdm(right_files, total=len(right_files)):

            best_ratio, best_left = choose_left_candidate(right_file, left_files_names)
            if best_left is None:
                continue

            yield best_left, left_files[best_left], right_file, right_files[right_file], best_ratio
    
    df = (pd.DataFrame(data=match_right_files(left_files, right_files), 
                      columns=['File from', 'Raw File from', 'File to', 'Raw File to', 'Ratio'])
              .set_index(['File from', 'File to']))
    
    df.to_csv(output_file, sep=";")
    
    if os.path.exists(output_file):
        log_collect_msg(f'collect_diff_pairs has created an output file. Size: {os.stat(output_file).st_size} bytes.')
    else:
        log_collect_msg(f'collect_diff_pairs has FAILED to create output file')        
    
    return df

def compare_paths_longest_folder_seq(left_file, left_fname, left_ext, right_file, right_fname, right_ext):
    if left_file == right_file:
        return 1
    
    if left_fname != right_fname:
        return 0
    
    # Remove all empty elements in list after split to handle edge cases
    fp1 = list(filter(lambda x: x!= "", left_file.split("/"))) 
    fp2 = list(filter(lambda x: x!= "", right_file.split("/"))) 
    sm = SequenceMatcher(None, fp1, fp2)
    return sm.ratio()
    

In [11]:
def collect_tool_diff(left_folder, right_folder, 
                      diff_input_file, output_file, 
                      tool_diff_names, func_tool_diff, overwrite = False):
    
    assert os.path.exists(left_folder) and os.path.isdir(left_folder) 
    assert os.path.exists(right_folder) and os.path.isdir(right_folder) 

    assert os.path.exists(diff_input_file) and os.path.isfile(diff_input_file) 
    
    fname = func_tool_diff.__name__
    
    if os.path.exists(output_file) and not overwrite:
        log_collect_msg(f'{fname} SKIP existing file {output_file}')
        return 
    
    log_collect_msg(f'{fname} is about to create {output_file}')

    matches_df = (pd.read_csv(diff_input_file, sep=';')
                  .reset_index(drop=True))
    
    def tool_diff(matches_df, left_folder, right_folder, func_tool_diff):

        it = tqdm(matches_df[['File from', 'File to']].iterrows(), total=len(matches_df))
        for idx, values  in it:
            left_fname, right_fname = values[0:2] 
            left_file = os.path.join(left_folder, left_fname)
            right_file = os.path.join(right_folder, right_fname)
            try:
                counters = func_tool_diff(left_file, right_file)
                assert len(tool_diff_names) == len(counters), 'wrong set of counters:' + counters
                yield [left_fname, right_fname] + counters
            except:
                print('Failed to diff', left_fname, right_fname)

    df = (pd.DataFrame(data=tool_diff(matches_df, left_folder, right_folder, func_tool_diff),
                        columns=['File from', 'File to'] + tool_diff_names)
            .set_index(['File from', 'File to']))

    df.to_csv(output_file, sep=";")
    
    if os.path.exists(output_file):
        log_collect_msg(f'{fname} has created an output file. Size: {os.stat(output_file).st_size} bytes.')
    else:
        log_collect_msg(f'{fname} has FAILED to create output file')        
    
    return df
                                        
    

In [12]:
def collect_gnu_diff_au0bi(left_folder, right_folder, diff_input_file, output_file, overwrite = False):
    GNU_DIFF_NAMES = ['Added', 'Deleted', 'Blank_Added', 'Blank_Deleted']

    def gnu_diff_files(left_file, right_file):

        cmd = f'diff -au0bi "{left_file}" "{right_file}"'
        result = !{cmd}

        added, deleted, blank_added, blank_deleted = 0, 0, 0, 0
        for line in result[2:]:
            line = line.strip()
            if line.startswith("+"):
                if len(line) == 1:
                    blank_added += 1
                else: 
                    added += 1
            elif line.startswith("-"):
                if len(line) == 1:
                    blank_deleted += 1
                else: 
                    deleted += 1

        return [added, deleted, blank_added, blank_deleted]    
    
    return collect_tool_diff(left_folder, right_folder, diff_input_file, output_file, GNU_DIFF_NAMES, gnu_diff_files, overwrite)

In [13]:
for version in tqdm(VERSIONS):
    print('generate profile data for version', version)
    input_folder = snap_folder_path(version)
    collect_blanks(input_folder, tool_file_path(version, 'lines'))


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

generate profile data for version v2.6.31



NameError: name 'collect_white_space' is not defined

In [None]:
SIMILAR_SETS = [set([".cpp", ".c", ".cxx", ".cc"]), set([".hpp", ".h", ".hxx"]), set([".php", ".phpt"])]
for diff in tqdm(DIFFS):
    v1, v2 = diff
    left_folder = snap_folder_path(v1)
    right_folder = snap_folder_path(v2)
    match_file = tool_file_path(f'{v1}-{v2}', 'match')
    diff_file_au0bi = tool_file_path(f'{v1}-{v2}', 'diff-gnu-au0bi')

    collect_diff_pairs(left_folder, right_folder, match_file, 
                            False, None, SIMILAR_SETS, 
                            compare_paths_longest_folder_seq)
    
    collect_gnu_diff_au0bi(left_folder, right_folder, 
                     match_file, diff_file_au0bi)

In [None]:
def clean_paths(df, col_in='path', col_out='tidy_path'):
    paths = df[col_in].str.lower().str.strip()
    prefix = os.path.commonpath([f for f in paths])
    if prefix:
        paths = paths.str[len(prefix) + 1:]
    paths = paths.apply(lambda p: p[2:] if p.startswith('./') else p)
    df[col_out] = paths
    return df

def parse_diff_gnu_df(f, v1, v2):
    return (pd.read_csv(f, sep=';')
               .rename(columns={'File from': 'path_from', 'File to' : 'path_to'})
               .pipe(clean_paths, 'path_from', 'tidy_path_from')
               .pipe(clean_paths, 'path_to', 'tidy_path_to')
               .assign(v1=v1, v2=v2)
           )
def parse_blanks_df(f, v):
    return (pd.read_csv(f, sep=';')
               .pipe(clean_paths, 'tidy_path', 'tidy_path')
               .assign(version=v)
           )

In [None]:
profile_df = pd.concat([parse_blanks_df(tool_file_path(v, 'ws'), v) for v in VERSIONS])

profile_df.lines = profile_df.lines.astype(int)

# DIFF - collect empty lines diff and use WS to solve common lines

In [None]:
gnu_diff_df = pd.concat([parse_diff_gnu_df(tool_file_path(f'{v1}-{v2}', 'diff-gnu-au0bi_BLANK'), v1, v2) for v1, v2 in DIFFS])

In [None]:
merged_df = (pd.merge(gnu_diff_df, merged_profile_df, 'inner', left_on=['v1', 'tidy_path_from'], right_on=['version', 'tidy_path'])
            .rename(columns={'ws_lines':'ws_lines_from', 'wc_lines':'wc_lines_from', 'empty_lines_count':'empty_lines_count_from'}))
merged_df = (pd.merge(merged_df, merged_profile_df, 'inner', left_on=['v2', 'tidy_path_to'], right_on=['version', 'tidy_path'])
            .rename(columns={'ws_lines':'ws_lines_to', 'wc_lines':'wc_lines_to', 'empty_lines_count':'empty_lines_count_to'})
            .drop(['path_from', 'path_to'], axis=1)
            .assign(ext=merged_df['tidy_path_from'].str.rsplit('.', 1, expand=True)[1]))

merged_df = merged_df[merged_df['ext'].isin(['h', 'c', 'cpp', 'hpp', 'hxx', 'cxx', 'py', 'cc', 'asm'])]
merged_df

In [None]:
merged_df = merged_df.assign(ws_common_from = merged_df.ws_lines_from - (merged_df.Deleted + merged_df.Blank_Deleted),
                             ws_common_to = merged_df.ws_lines_to - (merged_df.Added + merged_df.Blank_Added),
                             ws_common_empty_from = (merged_df.ws_lines_from - merged_df.empty_lines_count_from) - (merged_df.Deleted),
                             ws_common_empty_to = (merged_df.ws_lines_to - merged_df.empty_lines_count_to) - (merged_df.Added))

merged_df

In [None]:
(merged_df.ws_common_from != merged_df.ws_common_to).sum()

In [None]:
(merged_df.ws_common_empty_from != merged_df.ws_common_empty_to).sum()

In [None]:
((merged_df.ws_lines_from + merged_df.Added - merged_df.Deleted - merged_df.empty_lines_count_from)-(merged_df.ws_lines_to - merged_df.empty_lines_count_to)).unique()

In [None]:
((merged_df.ws_lines_from + merged_df.Added - merged_df.Deleted - merged_df.empty_lines_count_from)-(merged_df.ws_lines_to - merged_df.empty_lines_count_to)).unique()

In [None]:
((merged_df.ws_lines_from + merged_df.Added + merged_df.Blank_Added - merged_df.Deleted - merged_df.Blank_Deleted) - merged_df.ws_lines_to).unique()