# Can we use Tufano Code Changes Dataset for commit message generation task?

### At least we must have commit messages. We have them in the csv file describing PRs. Some PRs are large, but some of them are small (we need the small ones). Tufano also extracted method pairs. Here we can go two ways: attach commit messages to the existing datapoints, or collect only those changes that have a single changed method pair. The first way is worse because some changes have multiple method pairs, so a single method pair may not be enough to generate the commit message.

### If we go with first method we have same dataset size: 21774

### Let's check how many datapoints we could have if we go with second method

In [97]:
# Per-project registries, keyed by project name and filled by caclulate_for_project().
# project -> names of all entries found in the project's data directory
LIST_OF_ALL_IDS = {}
# project -> subset of those ids that pass the single-change directory-layout check
LIST_OF_SINGLE_CHANGED_FILES = {}
# project -> {id: whitespace-token count averaged over before.java and after.java}
METHOD_LENGTHS = {}

In [98]:
def caclulate_for_project(project):
    """Collect single-change statistics for one project.

    Fills the module-level registries for ``project``:

    * ``LIST_OF_ALL_IDS[project]`` -- every entry name found in
      ``../Tufano_data/{project}``.
    * ``LIST_OF_SINGLE_CHANGED_FILES[project]`` -- the ids whose directory
      holds exactly one entry, whose ``0`` sub-directory holds exactly one
      entry, and whose ``0/0`` sub-directory holds exactly four entries.
    * ``METHOD_LENGTHS[project]`` -- for each such id, the number of
      whitespace-separated tokens averaged over ``before.java`` and
      ``after.java``.

    Args:
        project: name of the project sub-directory under ``../Tufano_data``.
    """
    import os

    project_root = f'../Tufano_data/{project}'
    method_files = ('before.java', 'after.java')

    def is_single_changed(i):
        # The checks rely on short-circuiting: deeper directories are only
        # listed once the shallower level is known to hold a single entry.
        pr_dir = f'{project_root}/{i}'
        return (len(os.listdir(pr_dir)) == 1
                and len(os.listdir(f'{pr_dir}/0/')) == 1
                and len(os.listdir(f'{pr_dir}/0/0')) == 4)

    def calculate_method_length(i):
        token_count = 0
        for filename in method_files:
            with open(f'{project_root}/{i}/0/0/{filename}', 'r') as f:
                token_count += sum(len(line.split()) for line in f)
        # Average over the "before" and "after" versions of the method.
        METHOD_LENGTHS[project][i] = token_count / len(method_files)

    LIST_OF_ALL_IDS[project] = os.listdir(project_root)
    LIST_OF_SINGLE_CHANGED_FILES[project] = [
        i for i in LIST_OF_ALL_IDS[project] if is_single_changed(i)
    ]
    METHOD_LENGTHS[project] = {}
    for i in LIST_OF_SINGLE_CHANGED_FILES[project]:
        calculate_method_length(i)

In [99]:
caclulate_for_project('android')

In [100]:
caclulate_for_project('google')

In [101]:
caclulate_for_project('ovirt')

In [102]:
def print_statistics_on_single_datapoints():
    """Print the share of single-change ids among all ids, per project and in total.

    Reads ``LIST_OF_ALL_IDS`` and ``LIST_OF_SINGLE_CHANGED_FILES``; each line
    has the form ``name: single / all = ratio`` with the ratio rounded to two
    decimals.
    """
    single_sum = 0
    all_sum = 0
    for project, all_ids in LIST_OF_ALL_IDS.items():
        n_single = len(LIST_OF_SINGLE_CHANGED_FILES[project])
        n_all = len(all_ids)
        single_sum += n_single
        all_sum += n_all
        ratio = round(n_single / n_all, 2)
        print(f'{project}: {n_single} / {n_all} = {ratio}')
    total_ratio = round(single_sum / all_sum, 2)
    print(f'total: {single_sum} / {all_sum} = {total_ratio}')

In [103]:
print_statistics_on_single_datapoints()

android: 8515 / 22746 = 0.37
google: 5087 / 14099 = 0.36
ovirt: 7591 / 22800 = 0.33
total: 21193 / 59645 = 0.36


In [104]:
def print_methods_lengths_statistics(upper_bound=100):
    """Print mean, std and a bounded count of method lengths.

    One line is printed per project in ``LIST_OF_ALL_IDS`` (using the values
    stored in ``METHOD_LENGTHS``), followed by a ``total`` line over all
    projects together. Every line, including the total, labels the count of
    lengths that are ``<= upper_bound``.

    Args:
        upper_bound: inclusive length threshold used for the count.
    """
    import numpy as np

    def describe(lengths):
        # Same summary format for per-project and total lines so the last
        # number is always labelled.
        values = np.array(lengths)
        return (f'mean = {round(np.mean(lengths))} std = {round(np.std(lengths))} '
                f'len(<={upper_bound}) = {len(values[values <= upper_bound])}')

    total_single = []
    for project in LIST_OF_ALL_IDS.keys():
        lengths = list(METHOD_LENGTHS[project].values())
        total_single += lengths
        print(f'{project}: {describe(lengths)}')
    print(f'total: {describe(total_single)}')

In [105]:
print_methods_lengths_statistics()

android: mean = 146.0 std = 278.0 len(<=100) = 5140
google: mean = 94.0 std = 91.0 len(<=100) = 3403
ovirt: mean = 75.0 std = 109.0 len(<=100) = 5951
total: mean = 108.0 std = 196.0 14494


In [106]:
import pandas as pd
CSV_DATA = pd.read_csv('../Tufano_data/PullRequests.csv')

In [107]:
CSV_DATA.keys()

Index(['Pull Request ID', 'URL', 'Project Name', 'Title',
       'Creation Timestamp'],
      dtype='object')

In [108]:
# project -> {PR id: value of the CSV 'Title' column}; filled by extract_commit_messages()
COMMIT_MESSAGES = {}

In [125]:
def generate_ind_to_row():
    """Map each project's PR ids (as strings) to their row index in ``CSV_DATA``.

    A row is assigned to a project when its ``URL`` value starts with that
    project's URL prefix.

    Returns:
        dict of the form ``{project: {str(pull_request_id): row_index}}``.
    """
    url_prefix_by_project = {'android': 'android', 'google': 'gerrit-review', 'ovirt': 'gerrit.ovirt'}
    rows_by_project = {'android': {}, 'google': {}, 'ovirt': {}}
    for row, url in enumerate(CSV_DATA['URL']):
        for project, prefix in url_prefix_by_project.items():
            if not url.startswith(prefix):
                continue
            pr_id = str(CSV_DATA['Pull Request ID'][row])
            rows_by_project[project][pr_id] = row
    return rows_by_project

In [126]:
IND_TO_ROW = generate_ind_to_row()

In [141]:
def extract_commit_messages(project):
    """Fill ``COMMIT_MESSAGES[project]`` with the CSV title of each single-change PR.

    For every id in ``LIST_OF_SINGLE_CHANGED_FILES[project]`` the row index is
    looked up in ``IND_TO_ROW[project]`` and the ``Title`` value of that row in
    ``CSV_DATA`` is stored. Ids without a CSV row are reported on stdout and
    skipped.

    Args:
        project: project name used as key in the module-level registries.
    """
    from tqdm.auto import tqdm

    rows = IND_TO_ROW[project]
    messages = {}
    COMMIT_MESSAGES[project] = messages
    for i in tqdm(LIST_OF_SINGLE_CHANGED_FILES[project]):
        row = rows.get(i)
        if row is None:
            print(f'For PR with ID = {i} in project {project} record not found in csv file.')
            continue
        # Look the title up once and store it directly.
        messages[i] = CSV_DATA['Title'][row]

In [128]:
extract_commit_messages('google')

HBox(children=(IntProgress(value=0, max=5087), HTML(value='')))




In [129]:
COMMIT_MESSAGES['google']['64710'] # should be 'Do cheaper checks first in GroupCountrol.isVisible'

'Do cheaper checks first in GroupCountrol.isVisible'

In [132]:
import random

In [134]:
random.sample(list(COMMIT_MESSAGES['google'].values()), 20)

['Ignore errors when current row no longer exists in a table',
 'Fixing jdbc code: correct PreparedStatement closing',
 'Allow publishing old non-current patch sets',
 'Fix possible NPE in LsUserRefs',
 'Use context user for change message on submit',
 'Fixed regression caused by the defaultValue feature',
 'ChangeScreen2: Add title tooltip to reviewed checkbox',
 'SetReadyForReview: Change action label to "Start Review"',
 'Remove unused members',
 'Move a lot of the state for unifiedPatchDetailAction into the base',
 'BugFix on latest GitHub API: list repos from private repos',
 'ReceiveCommits.ReplaceRequest: Parse prior commit body earlier',
 'Mergeable: Reindex change asynchronously',
 'CloneWithCommitMsgHook: Fix HTTP clone command inconsistency',
 'Open RevWalk in try-with-resource',
 'Fix wrong date/time for commits in refs/users/default branch',
 'init: ensure that tmp dir exists before extracting plugins',
 'RelativeDateFormatter: Simplify rounding of years and months',
 "Don

In [136]:
extract_commit_messages('ovirt')

HBox(children=(IntProgress(value=0, max=7591), HTML(value='')))




In [137]:
random.sample(list(COMMIT_MESSAGES['ovirt'].values()), 20)

["webadmin: Sync file size and uploaded image's actual size.",
 'engine : Async tasks should handle exception in creating task',
 'core: Replacing StringUtils for StringHelper in LdapBrokerCommandBase.',
 'tools: Fixes password prompt for admin user',
 'gluster: fix releasing lock in task sync-job',
 'webadmin: display SD status in Storage sub-tab',
 'core: DirectorySearcherTest: reduce timeouts',
 'restapi: Fix backend vm resource test failure',
 'core: @Inject GetVnicProfilesByDataCenterIdQuery Daos',
 'restapi: Remove "DnsServers" and "DnsServer" complex types',
 'core: Use container id for memory lock in import process.',
 'core: [ExternalTasks] Cannot end existing job',
 'restapi: Do not cast memory.used to int',
 'core: AuditLogableBase: @Inject VdsKdumpStatusDao',
 'core: do not set run-on-vds name in monitoring',
 'tools: fix error logging',
 'core: ProcessOvfUpdate - skip vms/templates without ovf',
 'core: Fix MathUtils.greatestCommonDivisor()',
 'core: fix reattempt to go to

In [142]:
extract_commit_messages('android')

HBox(children=(IntProgress(value=0, max=8515), HTML(value='')))

For PR with ID = 664242 in project android record not found in csv file.
For PR with ID = 688460 in project android record not found in csv file.



In [144]:
random.sample(list(COMMIT_MESSAGES['android'].values()), 20)

['Fix suspend crash issue on no GPU platform.',
 'Fix renderscript compilation from Ant.',
 'Remove an incorrect assert',
 'DO NOT MERGE fix failing test testWifiInfoProperties for non-telephony devices',
 'NullPointerException invoking Field.getModifiers',
 'Renamed getNetworkId and getSystemId',
 'Adds CTS test for Script.Closure.getGlobal()',
 'Move the "huge method" test into its own test case.',
 'Scan for methods in extra interface hierarchy',
 'Match language-specific flags before default locale country matches',
 'Fix crash on ending call.',
 'Allow to share classloader for non-inline dexmaker',
 'payment: Do not reset the default payment is non-null.',
 'Fix merging HLoadClass with HNewInstance.',
 'Remove obsolete getIsimChallengeResponse',
 'HFP-Client: Set current device to null after firing broadcast',
 '75700: Do not flag appcompat method suggestions in non-appcompat activities',
 'Fix NotificationManagerTest.checkNotificationExistence',
 'DO NOT MERGE - [ActivityManager]

### We can see some patterns: "Seq1: Seq2". Maybe it is a good idea to keep only "Seq2". Also, we should probably delete trailing punctuation ('.' at the end of messages). Also, we should remove those which contain MERGE if there are a lot of such samples.

In [151]:
def print_number_of_prefixes(prefix_bounds):
    """Print how many commit messages contain ':' per project and in total.

    Args:
        prefix_bounds: ``None`` to count messages containing at least one
            ':', or a pair ``(low, high)`` to count messages whose number of
            ':' characters satisfies ``low <= count < high``.
    """
    print(f'Prefix ":" statistics with bounds: {prefix_bounds}')

    def has_prefix(msg):
        if prefix_bounds is None:
            return ':' in msg
        return prefix_bounds[0] <= msg.count(':') < prefix_bounds[1]

    matched_sum = 0
    msgs_sum = 0
    for project, msgs in COMMIT_MESSAGES.items():
        matched = sum(1 for msg in msgs.values() if has_prefix(msg))
        matched_sum += matched
        msgs_sum += len(msgs)
        print(f'{project}: {matched} / {len(msgs)} = {matched / len(msgs)}')
    print(f'total: {matched_sum} / {msgs_sum} = {matched_sum / msgs_sum}')

def print_commit_msg_statistics(prefix_bounds):
    """Print commit-message statistics; currently only the ':' prefix counts."""
    print_number_of_prefixes(prefix_bounds=prefix_bounds)

In [152]:
print_commit_msg_statistics(prefix_bounds=None)

Prefix ":" statistics with bounds: None
google: 1387 / 5087 = 0.27265578926675843
android: 1797 / 8513 = 0.21108892282391636
ovirt: 7305 / 7591 = 0.9623238045053353
total: 10489 / 21191 = 0.49497428153461376


In [153]:
print_commit_msg_statistics(prefix_bounds=(1, 2))

Prefix ":" statistics with bounds: (1, 2)
google: 1381 / 5087 = 0.27147631216827206
android: 1633 / 8513 = 0.1918242687654176
ovirt: 7050 / 7591 = 0.928731392438414
total: 10064 / 21191 = 0.4749185975178142


### As we can see half of samples are "seq1: seq2".

In [313]:
def print_after_filtering(predicate, label, n_sample):
    """Print a random sample and the share of messages accepted by ``predicate``.

    For every project in ``COMMIT_MESSAGES`` this prints up to ``n_sample``
    randomly chosen accepted messages and the ratio accepted / all, then a
    total line over all projects.

    Args:
        predicate: callable taking a message and returning a truthy value
            for messages to keep.
        label: text printed in the header line.
        n_sample: maximum number of accepted messages to show per project.
    """
    print(f'Filtering: {label}')
    kept_sum = 0
    msgs_sum = 0
    for project, msgs in COMMIT_MESSAGES.items():
        kept = [msg for msg in msgs.values() if predicate(msg)]
        kept_sum += len(kept)
        msgs_sum += len(msgs)
        # Never ask for more samples than there are accepted messages.
        shown = min(n_sample, len(kept))
        print(random.sample(kept, shown))
        print(f'{project}: {len(kept)} / {len(msgs)} = {len(kept) / len(msgs)}')
    print(f'total: {kept_sum} / {msgs_sum} = {kept_sum / msgs_sum}')

In [258]:
print_after_filtering(lambda msg: 1 <= msg.count(':') < 2 and ': ' in msg, '": " in msg once', n_sample=3)

Filtering: ": " in msg once
['SiteLibraryLoaderUtil: catch NoSuchFileException when scanning for JARs', 'OAuth extension point: Allow to authenticate without email', 'ChangeInserter: fix comparison of Change.Id to Change.Key']
google: 1358 / 5087 = 0.26695498329074113
['PBAP: remove dead code', 'Telephony: Fix call forward info logging.', 'Test: skip video test for watch']
android: 1594 / 8513 = 0.18724304005638434
['frontend: Fix NPE in ImportVmFromExternalSourceModel', 'core: Fix RandomUtils#nextPropertyString', "webadmin: Don't trigger event when assigning label"]
ovirt: 6998 / 7591 = 0.9218811750757476
total: 9950 / 21191 = 0.46953895521683736


In [179]:
print_after_filtering(lambda msg: 1 <= msg.count(':') < 2 and len(msg.split(':')[0].split()) == 1, 'only one word before ":"', n_sample=3)

Filtering: only one word before ":"
['HostPageServlet: Avoid no-op Cookie#setHttpOnly call', 'MergeSuperSet: Inline empty set', 'ReviewerRecommender: Make array of weights a constant']
google: 1251 / 5087 = 0.2459209750344014
['LocationManagerService: Fix bug removing proximity alerts.', 'CTS: clear notification when testPerformGlobalActionQuickSettings is done', 'sun.security.pkcs: type tyding. Port from jdk8u60']
android: 1243 / 8513 = 0.14601198167508517
ovirt: 6940 / 7591 = 0.914240548017389
total: 9434 / 21191 = 0.4451889953282054


In [175]:
print_after_filtering(lambda msg: 2 <= msg.count(':') < 100000, 'more than two ":"', n_sample=3)

Filtering: more than two ":"
['Close DirContext context in LdapRealm::isActive', 'Make bug: an alias for tr: in search', 'ChangeNoteUtil: Fix parsing of "0:0-0:0" range']
google: 6 / 5087 = 0.0011794770984863377
['53995: Lint: UnusedIds false positive on defined integer', 'Lint: Warn that @android:string/yes returns OK, not Yes', 'integrate https://android-review.googlesource.com/#/c/145848/: add API allowing one RunConfigurationProducer to replace another one (cherry picked from commit 6a3ed85)']
android: 164 / 8513 = 0.019264654058498767
['aaa: Cannot get administration portal after logging to IPA domain, WFLYEJB0442: Unexpected Error', 'core:refactor: Add IStorageHelper.prepareConnectHostToStoragePoolServers', 'vdsbroker: refactoring: Inject ResourceManager into SpmStopVDSCommand']
ovirt: 255 / 7591 = 0.03359241206692135
total: 425 / 21191 = 0.020055684016799583


In [176]:
print_after_filtering(lambda msg: 1 <= msg.count(':') < 2 and len(msg.split(':')[0].split()) > 1, 'several words before ":"', n_sample=3)

Filtering: several words before ":"
['Cosmetic change: chain setters', 'Change screen: Make change owner votes removable', 'Fix jdbc code: Both PreparedStatements must be closed in any case']
google: 130 / 5087 = 0.02555533713387065
['Possible fix for https://code.google.com/p/android/issues/detail?id=152085', 'Align with main: two ways of parsing repeated packable fields.', 'Late binding: track differences in RI behavior']
android: 390 / 8513 = 0.045812287090332436
['Revert "engine: Obtain old network name from correct NIC (#849971)"', 'userportal, webadmin: moved ApplicationResourcesWithLookup', 'Revert "core: Prevent StopVm to interleave with other VM locked actions"']
ovirt: 110 / 7591 = 0.014490844421024898
total: 630 / 21191 = 0.029729602189608795


In [255]:
print_after_filtering(lambda msg: '::' in msg, '"::" in msg', n_sample=3)

Filtering: "::" in msg


In [180]:
print_after_filtering(lambda msg: 1 <= msg.lower().split().count('revert') < 2, 'revert words', n_sample=3)

Filtering: revert words
['Revert "Workaround Guice bug "getPathInfo not decoded""', 'Revert "Fix GWT UI AddFileBox to provide path suggestions continuously"', 'Revert "Make VisibleRefFilter.Filter reuse the refs passed from JGit."']
google: 38 / 5087 = 0.007470021623746805
['Revert "Reverted to Gradle 1.6 due to RMI problem when importing projects."', 'Revert "Revert "ExpatParser LP64 fixes.""', 'Revert "HIDL Java getService now the same as C++."']
android: 210 / 8513 = 0.02466815458710208
['Revert "core: Kernel cmdline - host deploy"', 'Revert "engine: Create default network QoS for ovirtmgmt upon DC creation"', 'core: Revert Delete jobs that their steps have no async-tasks']
ovirt: 63 / 7591 = 0.008299301804768806
total: 311 / 21191 = 0.014676041715822754


In [186]:
print_after_filtering(lambda msg: 'engine: Create default network QoS' in msg, 'revert has always unrevert', n_sample=4)

Filtering: revert has always unrevert
[]
google: 0 / 5087 = 0.0
[]
android: 0 / 8513 = 0.0
['Revert "engine: Create default network QoS for ovirtmgmt upon DC creation"', 'engine: Create default network QoS for ovirtmgmt upon DC creation', 'Revert "engine: Create default network QoS for ovirtmgmt upon DC creation"', 'Revert "engine: Create default network QoS for ovirtmgmt upon DC creation"']
ovirt: 4 / 7591 = 0.0005269397971281781
total: 4 / 21191 = 0.00018875937898164315


In [166]:
def print_merge_statistics():
    """Print how many messages contain the word 'merge' (case-insensitive).

    A message counts when 'merge' is one of its whitespace-separated,
    lower-cased tokens. One line is printed per project in
    ``COMMIT_MESSAGES`` plus a total line.
    """
    print('Merge ":" statistics')

    def is_merge(msg):
        return 'merge' in msg.lower().split()

    merge_sum = 0
    msgs_sum = 0
    for project, msgs in COMMIT_MESSAGES.items():
        n_merge = sum(1 for msg in msgs.values() if is_merge(msg))
        merge_sum += n_merge
        msgs_sum += len(msgs)
        print(f'{project}: {n_merge} / {len(msgs)} = {n_merge / len(msgs)}')
    print(f'total: {merge_sum} / {msgs_sum} = {merge_sum / msgs_sum}')

In [167]:
print_merge_statistics()

Merge ":" statistics
google: 168 / 5087 = 0.03302535875761746
android: 239 / 8513 = 0.02807470926817808
ovirt: 14 / 7591 = 0.0018442892899486233
total: 421 / 21191 = 0.019866924637817942


In [160]:
def print_do_not_merge_statistics():
    """Print how many messages start with the literal prefix 'DO NOT MERGE'.

    One line is printed per project in ``COMMIT_MESSAGES`` plus a total line.
    The header names the prefix being counted so the output can be told apart
    from the plain 'merge' word statistics.
    """
    print('"DO NOT MERGE" prefix statistics')

    def get_merge_num(commit_msgs_dict):
        return sum(1 for msg in commit_msgs_dict.values() if msg.startswith('DO NOT MERGE'))

    total_merge_num = 0
    total = 0
    for project, commit_msgs in COMMIT_MESSAGES.items():
        merge_num = get_merge_num(commit_msgs)
        total_merge_num += merge_num
        total += len(commit_msgs)
        print(f'{project}: {merge_num} / {len(commit_msgs)} = {merge_num / len(commit_msgs)}')
    print(f'total: {total_merge_num} / {total} = {total_merge_num / total}')

In [161]:
print_do_not_merge_statistics()

Merge ":" statistics
google: 0 / 5087 = 0.0
android: 152 / 8513 = 0.017855045224950076
ovirt: 0 / 7591 = 0.0
total: 152 / 21191 = 0.00717285640130244


### Looks like 'merge' word in commit messages is not really a problem.

### Let's look at punctuation

In [189]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [190]:
print_after_filtering(lambda msg: len([p for p in string.punctuation if p in msg]) == 0, 'do no contain punctuation at all', n_sample=3)

Filtering: do no contain punctuation at all
['Remove index defaultMaxClauseCount config setting while reusing maxTerms', 'Set hasChange on index queries depending on schema version', 'Add test for search of an empty topic']
google: 2090 / 5087 = 0.4108511893060743
['Let voicemail broadcast test pass if not applicable', 'Add property to be able to import malformed jar files', 'Remove verification for DownloadReceiver when failure']
android: 2562 / 8513 = 0.30095148596264537
['Introduce template diskattachments resources and remove old API', 'Add import template from configuration', 'Report constraint matches when using TestOptimizer']
ovirt: 179 / 7591 = 0.02358055592148597
total: 4831 / 21191 = 0.2279741399650795


In [191]:
# Ovirt is very low because of ':'

In [223]:
MY_PUNCTUATION = set(string.punctuation) - {':', '.', ','}
print(MY_PUNCTUATION)

{'\\', '@', '`', '>', ')', '&', '~', '$', '^', ']', '<', '+', '/', '_', '?', '{', '!', '|', '=', '#', ';', '"', '}', '[', '-', '*', '%', '(', "'"}


In [224]:
print_after_filtering(lambda msg: len([p for p in MY_PUNCTUATION if p in msg]) == 0, 'do no contain punctuation at all', n_sample=3)

Filtering: do no contain punctuation at all
['ChangeScreen2: Respect user preference for unified diff view', 'Extract fetching commit messages into injectable class', 'Reindex account on push to user branch']
google: 3237 / 5087 = 0.6363278946333792
['Launch new Home app when selecting Home app in Settings', 'SELinuxTest:  Add a few more cases to testFileContexts.', 'Unregister data enabled changed listener on default phone']
android: 5650 / 8513 = 0.6636908257958416
['webadmin: Hotfix of icon validation for IE', 'core: Make ConfigValuesTest more informative', 'webadmin: display Disk Snapshot ID on cinder domains']
ovirt: 5231 / 7591 = 0.689105519694375
total: 14118 / 21191 = 0.6662262281157095


In [241]:
print_after_filtering(lambda msg: msg[-1] == '.', '"." is last character', n_sample=3)

Filtering: "." is last character
['FIX: deep-link into individual commit for non-root URLs.', "Add ',' to be encoded in email headers.", 'Allow forcing mergeability check through the REST API.']
google: 192 / 5087 = 0.03774326715156281
android: 2943 / 8513 = 0.3457065664278163
['core:  QueryData2 generates slow SQL for...', 'engine: Query getdisksvmguid caused postmaster processes to consume constantly 100%cpu.', 'core,webadmin: change switch type default value.']
ovirt: 396 / 7591 = 0.052167039915689634
total: 3531 / 21191 = 0.1666273417960455


In [242]:
# Looks like we should remove a trailing period (the last character) too.

In [243]:
print_after_filtering(lambda msg: '.' in msg and msg[-1] != '.', '"." is not last character', n_sample=3)

Filtering: "." is not last character
["Merge branch 'stable-2.14' into stable-2.15", 'PatchLineCommentsUtil#setCommentRevId(...): Remove unused return value', 'Adapt to the simplified GitReferenceUpdatedListener.Event API']
google: 548 / 5087 = 0.10772557499508552
['Update ECJ to 4.6M4', 'Bug #201268 android.telecom.cts.RemoteConnectionTest -- testRemoteConnectionVideoCallbacks_CallDataUsage fail', "41782: Graphical Layout Editor can't handle TabWidget. DO NOT MERGE"]
android: 724 / 8513 = 0.0850463996241043
['core: CompatibilityVersionUtils.getEffective() with Supplier<Version>', 'restapi: Do not cast memory.used to int', 'host-deploy: upgrade to apache-sshd 0.11.0']
ovirt: 349 / 7591 = 0.04597549729943354
total: 1621 / 21191 = 0.07649473833231088


In [253]:
print_after_filtering(lambda msg: '. ' in msg or msg[-1] == '.', '"." as end of sentence', n_sample=3)

Filtering: "." as end of sentence
['web.otherUrls auto-config based on Gerrit SSH/HTTPD URLs.', 'Not displaying the "Sign Out" for CLIENT_SSL_CERT_LDAP.', 'Fix flaky test.']
google: 195 / 5087 = 0.03833300570080598
['Fix a possible NPE when reading bad prop files.', 'Fix ADT build: use new AndroidTargetHash.', 'logcat: Validate regex patterns before creating filters.']
android: 3006 / 8513 = 0.3531070128039469
['webadmin: Modify minimum threshold of Quota.', 'core+webadmin: Audit log a network error while uploading disk.', 'webadmin: Add storage attributes on import SD.']
ovirt: 408 / 7591 = 0.05374785930707417
total: 3609 / 21191 = 0.17030814968618754


In [250]:
print_after_filtering(lambda msg: '...' in msg, '"..." in sentence', n_sample=3)

Filtering: "..." in sentence


In [247]:
def print_punc_frequency():
    """Print filtering statistics for every character in ``string.punctuation``.

    For each punctuation character a blank line is printed, followed by the
    ``print_after_filtering`` report for messages containing that character.
    """
    for p in string.punctuation:
        print()
        # Bind the current character as a default so the predicate does not
        # depend on the loop variable.
        print_after_filtering(lambda msg, p=p: p in msg, f'frequency for {p}', n_sample=3)

In [222]:
print_punc_frequency()


Filtering: frequency for !

Filtering: frequency for "
['Not displaying the "Sign Out" for CLIENT_SSL_CERT_LDAP.', 'Fix z-index of "session expired" dialog', 'Fix RpcStatus to display "Working..." when header is hidden']
google: 178 / 5087 = 0.03499115392176135
['Merge "DO NOT MERGE: Test for bug 33137046" into nougat-cts-dev am: 38465d5435  -s ours', 'Add dalvik subitems for -d in dumpsys meminfo "Total PSS by category"', 'Revert "java.time.Instant.toEpochMilli() fixes"']
android: 429 / 8513 = 0.05039351579936568
['restapi: Always use "general" value of VdcFenceOptionTypes', 'core: Clear current SPM upon "Not SPM" error', 'core: Use "Integer.parseInt" in "VmHandler"']
ovirt: 204 / 7591 = 0.026873929653537083
total: 811 / 21191 = 0.03827096408852815

Filtering: frequency for #
['VersionedMetaData#load: Use provided RevWalk instead of creating a new one', 'Avoid long overflow in AccessToken#isExpired()', 'VersionedMetaData#updateRef: Use passed newId value']
google: 115 / 5087 = 0.0226

In [231]:
from nltk.tokenize import word_tokenize

In [232]:
word_tokenize('Added CLOSED_INBOUND and CLOSED_OUTBOUND states to OpenSSLEngineImpl#getHandshakeStatus()')

['Added',
 'CLOSED_INBOUND',
 'and',
 'CLOSED_OUTBOUND',
 'states',
 'to',
 'OpenSSLEngineImpl',
 '#',
 'getHandshakeStatus',
 '(',
 ')']

In [233]:
# Let's check that these changes have only one change in method for real.

In [236]:
def sample_urls_for_project(project, n_sample):
    """Print the CSV URLs of randomly sampled single-change PRs of ``project``.

    Only ids that have a row in ``IND_TO_ROW[project]`` are eligible, so ids
    missing from the CSV cannot trigger a ``KeyError``. At most ``n_sample``
    URLs are printed; fewer when fewer eligible ids exist.

    Args:
        project: project name used as key in the module-level registries.
        n_sample: maximum number of URLs to print.
    """
    rows = IND_TO_ROW[project]
    known_ids = [i for i in LIST_OF_SINGLE_CHANGED_FILES[project] if i in rows]
    picked = random.sample(known_ids, min(n_sample, len(known_ids)))
    print([CSV_DATA['URL'][rows[i]] for i in picked])

In [237]:
sample_urls_for_project('android', 10)

['android-review.googlesource.com/c/123111', 'android-review.googlesource.com/c/400852', 'android-review.googlesource.com/c/231511', 'android-review.googlesource.com/c/166181', 'android-review.googlesource.com/c/681962', 'android-review.googlesource.com/c/15460', 'android-review.googlesource.com/c/114946', 'android-review.googlesource.com/c/14917', 'android-review.googlesource.com/c/115814', 'android-review.googlesource.com/c/623828']


In [239]:
sample_urls_for_project('google', 10)

['gerrit-review.googlesource.com/c/121070', 'gerrit-review.googlesource.com/c/47913', 'gerrit-review.googlesource.com/c/148150', 'gerrit-review.googlesource.com/c/72787', 'gerrit-review.googlesource.com/c/74469', 'gerrit-review.googlesource.com/c/71697', 'gerrit-review.googlesource.com/c/112210', 'gerrit-review.googlesource.com/c/30016', 'gerrit-review.googlesource.com/c/79981', 'gerrit-review.googlesource.com/c/6723']


In [240]:
sample_urls_for_project('ovirt', 10)

['gerrit.ovirt.org/c/60884', 'gerrit.ovirt.org/c/69607', 'gerrit.ovirt.org/c/10990', 'gerrit.ovirt.org/c/6318', 'gerrit.ovirt.org/c/40423', 'gerrit.ovirt.org/c/9126', 'gerrit.ovirt.org/c/34955', 'gerrit.ovirt.org/c/30998', 'gerrit.ovirt.org/c/18664', 'gerrit.ovirt.org/c/63158']


### I took a look at those PRs. Most of the changed methods are quite big. Therefore I have big doubts about the final size of the dataset. Also, approximately half of the PRs contained more than one changed file.

In [263]:
print_after_filtering(lambda msg: len([w for w in msg.split() if len([str(i) for i in range(10) if str(i) in w]) != 0]) != 0, 'msgs with numbers', n_sample=3)

Filtering: msgs with numbers
['Upgrade gerrit plugin api to 2.11.1', 'Update H2 to 1.2.134', 'Improve LDAP login times, transfer 40x less data.']
google: 486 / 5087 = 0.09553764497739335
['resolve merge conflicts of 1cf3c1dd2a to nougat-mr1-cts-dev', 'Revert "Add regression tests for 32-bit x86 struct layout bug fixes."', 'Merge "DO NOT MERGE: Fix VrDisplayTest" into oreo-cts-dev am: 92d3cbf146  -s ours']
android: 1315 / 8513 = 0.1544696346763773
["core: Improve VmDevice's toString (#851991)", 'core: report 0 on negative values in maxSchedulingMemory', 'engine: Unit tests fail in non-English locale (#1171139)']
ovirt: 437 / 7591 = 0.057568172836253456
total: 2238 / 21191 = 0.10561087254022934


## Final decisions: we should remove the whole prefix before ":", we should remove "." if it is the last character

In [254]:
word_tokenize('Close DirContext context in LdapRealm::isActive')

['Close', 'DirContext', 'context', 'in', 'LdapRealm', ':', ':isActive']

In [259]:
word_tokenize('integrate https://android-review.googlesource.com/#/c/145848/: add API allowing one RunConfigurationProducer to replace another one (cherry picked from commit 6a3ed85)')

['integrate',
 'https',
 ':',
 '//android-review.googlesource.com/',
 '#',
 '/c/145848/',
 ':',
 'add',
 'API',
 'allowing',
 'one',
 'RunConfigurationProducer',
 'to',
 'replace',
 'another',
 'one',
 '(',
 'cherry',
 'picked',
 'from',
 'commit',
 '6a3ed85',
 ')']

In [325]:
def generate_unabstracted_cmg_dataset():
    """Write the unabstracted commit-message-generation dataset to disk.

    For every id in ``LIST_OF_SINGLE_CHANGED_FILES`` that has a row in
    ``IND_TO_ROW`` this copies ``before.java`` / ``after.java`` to
    ``prev/{n}.java`` / ``updated/{n}.java`` under
    ``../Tufano_data/datasets/cmg_unabstracted/`` and records the CSV row
    index in ``ids.txt`` and the commit message in ``msgs.txt`` (one per
    line). Ids without a CSV row are reported on stdout and skipped. The
    output directories are created when missing.
    """
    import os
    from shutil import copyfile

    root = '../Tufano_data/'
    out_dir = root + 'datasets/cmg_unabstracted/'
    # Create the target directories so a fresh run does not fail on open()/copyfile().
    for sub_dir in ('prev', 'updated'):
        os.makedirs(out_dir + sub_dir, exist_ok=True)

    ids_to_write = []
    msgs_to_write = []
    with open(out_dir + 'ids.txt', 'w') as ids_file, \
         open(out_dir + 'msgs.txt', 'w') as msgs_file:
        for project, pr_ids in LIST_OF_SINGLE_CHANGED_FILES.items():
            rows = IND_TO_ROW[project]
            for pr_id in pr_ids:
                if pr_id not in rows:
                    print(f'{pr_id} not found in csv for project {project}, it will be skipped')
                    continue
                msg = COMMIT_MESSAGES[project][pr_id]
                # msgs.txt holds one message per line; a multi-line message
                # would shift every following line relative to ids.txt.
                assert len(msg.splitlines()) == 1
                ids_to_write.append(str(rows[pr_id]))
                msgs_to_write.append(msg)
                # 1-based position of this datapoint in ids.txt / msgs.txt.
                cur_id = len(ids_to_write)
                src_dir = root + f'{project}/{pr_id}/0/0/'
                copyfile(src_dir + 'before.java', out_dir + f'prev/{cur_id}.java')
                copyfile(src_dir + 'after.java', out_dir + f'updated/{cur_id}.java')
        ids_file.write('\n'.join(ids_to_write))
        msgs_file.write('\n'.join(msgs_to_write))

In [326]:
generate_unabstracted_cmg_dataset()

664242 not found in csv for project android, it will be skipped
688460 not found in csv for project android, it will be skipped


In [300]:
print_after_filtering(lambda msg: 'merge' in msg.lower().split(), 'merge', n_sample=3)

Filtering: merge
['Display hash of the cherry-pick merge in comment', 'Fix: Gerrit cannot display Change with non-resolvable Merge Commit', "Merge branch 'stable-2.11'"]
google: 168 / 5087 = 0.03302535875761746
['resolve merge conflicts of f15f0ef to gradle-dev.', 'DO NOT MERGE Remove docked stack size assertion', 'resolve merge conflicts of 4b2496ba197fb161dd35cb3956211d3dd3ef5e3f to oreo-mr1-vts-dev']
android: 239 / 8513 = 0.02807470926817808
total: 421 / 21191 = 0.019866924637817942


### Let's remove merge commits

In [327]:
def generate_unabstracted_cmg_dataset_with_removed_merge_commits():
    """Write the unabstracted dataset, leaving out messages that mention 'merge'.

    For every id in ``LIST_OF_SINGLE_CHANGED_FILES`` that has a row in
    ``IND_TO_ROW`` and whose message does not contain the word 'merge'
    (case-insensitive, whitespace-tokenised), this copies ``before.java`` /
    ``after.java`` to ``prev/{n}.java`` / ``updated/{n}.java`` under
    ``../Tufano_data/datasets/cmg_unabstracted_without_merge_commits/`` and
    records the CSV row index in ``ids.txt`` and the message in ``msgs.txt``
    (one per line). Ids without a CSV row are reported on stdout and skipped;
    the number of skipped merge messages is printed at the end. The output
    directories are created when missing.
    """
    import os
    from shutil import copyfile

    root = '../Tufano_data/'
    out_dir = root + 'datasets/cmg_unabstracted_without_merge_commits/'
    # Create the target directories so a fresh run does not fail on open()/copyfile().
    for sub_dir in ('prev', 'updated'):
        os.makedirs(out_dir + sub_dir, exist_ok=True)

    ids_to_write = []
    msgs_to_write = []
    skipped_merge = 0
    with open(out_dir + 'ids.txt', 'w') as ids_file, \
         open(out_dir + 'msgs.txt', 'w') as msgs_file:
        for project, pr_ids in LIST_OF_SINGLE_CHANGED_FILES.items():
            rows = IND_TO_ROW[project]
            for pr_id in pr_ids:
                if pr_id not in rows:
                    print(f'{pr_id} not found in csv for project {project}, it will be skipped')
                    continue
                msg = COMMIT_MESSAGES[project][pr_id]
                if 'merge' in msg.lower().split():
                    skipped_merge += 1
                    continue
                # msgs.txt holds one message per line; a multi-line message
                # would shift every following line relative to ids.txt.
                assert len(msg.splitlines()) == 1
                ids_to_write.append(str(rows[pr_id]))
                msgs_to_write.append(msg)
                # 1-based position of this datapoint in ids.txt / msgs.txt.
                cur_id = len(ids_to_write)
                src_dir = root + f'{project}/{pr_id}/0/0/'
                copyfile(src_dir + 'before.java', out_dir + f'prev/{cur_id}.java')
                copyfile(src_dir + 'after.java', out_dir + f'updated/{cur_id}.java')
        ids_file.write('\n'.join(ids_to_write))
        msgs_file.write('\n'.join(msgs_to_write))
        print(f'Skipped merge commits: {skipped_merge}')

In [328]:
generate_unabstracted_cmg_dataset_with_removed_merge_commits()

664242 not found in csv for project android, it will be skipped
688460 not found in csv for project android, it will be skipped
Skipped merge commits: 421


In [329]:
def generate_unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons():
    """Export single-method-pair changes, dropping merges and multi-colon messages.

    Same export as the merge-only variant, with one extra filter: messages
    that contain ': ' more than once are skipped. Kept samples are copied to
    prev/<n>.java and updated/<n>.java (1-based <n>), and ids.txt / msgs.txt
    are written with one line per kept sample. Both skip counters are printed.
    """
    from shutil import copyfile

    root = '../Tufano_data/'
    out_dir = root + 'datasets/unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons/'
    with open(out_dir + 'ids.txt', 'w') as ids_file, \
         open(out_dir + 'msgs.txt', 'w') as msgs_file:
        row_ids = []
        messages = []
        merge_count = 0
        colon_count = 0
        for project, pr_ids in LIST_OF_SINGLE_CHANGED_FILES.items():
            for pr_id in pr_ids:
                if pr_id not in IND_TO_ROW[project].keys():
                    print(f'{pr_id} not found in csv for project {project}, it will be skipped')
                    continue
                msg = COMMIT_MESSAGES[project][pr_id]
                words = msg.lower().split()
                if 'merge' in words:
                    merge_count += 1
                    continue
                # More than one ': ' separator means a nested prefix; those are dropped.
                if msg.count(': ') > 1:
                    colon_count += 1
                    continue
                # Messages are stored one per line, so they must be single-line.
                assert len(msg.splitlines()) == 1
                row_ids.append(str(IND_TO_ROW[project][pr_id]))
                messages.append(msg)
                # File names are 1-based and match the line number in ids.txt / msgs.txt.
                sample_no = len(row_ids)
                pair_dir = root + f'{project}/{pr_id}/0/0/'
                copyfile(pair_dir + 'before.java', out_dir + f'prev/{sample_no}.java')
                copyfile(pair_dir + 'after.java', out_dir + f'updated/{sample_no}.java')
        ids_file.write('\n'.join(row_ids))
        msgs_file.write('\n'.join(messages))
        print(f'Skipped merge commits: {merge_count}')
        print(f'Skipped colons: {colon_count}')

In [330]:
generate_unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons()

664242 not found in csv for project android, it will be skipped
688460 not found in csv for project android, it will be skipped
Skipped merge commits: 421
Skipped colons: 334


In [318]:
# NOTE(review): the label 'merge' looks copy-pasted from the merge-commit filter;
# the predicate here selects messages that contain ': ' more than once.
print_after_filtering(lambda msg: msg.count(': ') > 1, 'merge', n_sample=3)

Filtering: merge
['Use Tested-by: instead of Verified-by: during cherry-pick', 'Make bug: an alias for tr: in search']
google: 2 / 5087 = 0.00039315903282877927
['Android patch: CLDR ticket #7969: Percent formatting in RTL text.', 'Merge "DO NOT MERGE: Crop the real height of status bar when taking preference screenshot" into oreo-cts-dev am: a6be3b7154  -s ours', 'DO NOT MERGE: CTS test for fix of b/65717533 am: 50d956befd  -s ours']
android: 132 / 8513 = 0.015505697169035593
['core: framework: add domain processor pre-processor', 'utils: LocalConfig: support empty file names', 'sercon: servlet: db: send correct login name']
ovirt: 234 / 7591 = 0.03082597813199842
total: 368 / 21191 = 0.01736586286631117


In [352]:
def preprocess_prefix_colon(msg):
    """Strip a leading 'prefix:' / 'prefix: ' component tag from a commit message.

    Two prefix shapes are recognised:
      * 'prefix: text' - everything after the ': ' separator is returned;
      * 'prefix:text'  - only when the part before the first ':' is a single
        word; everything after that FIRST colon is returned, so later colons
        in the message body (e.g. 'webadmin:Fix a:b') are preserved.
    Messages without a recognised prefix are returned unchanged.

    The message must contain ': ' at most once (messages with several
    separators are expected to have been filtered out earlier).
    """
    assert msg.count(': ') <= 1, 'message must contain at most one ": " separator'
    if ': ' in msg:
        return msg.split(': ')[-1]
    prefix, colon, rest = msg.partition(':')
    # Only treat 'word:rest' as a prefix; a multi-word (or empty) head means
    # the colon belongs to the message text itself.
    if colon and len(prefix.split()) == 1:
        return rest
    return msg

def preprocess_suffix_punctuation(msg):
    """Remove a trailing ellipsis or a single trailing period from a message.

    The ellipsis is tested first: every string ending with '...' also ends
    with '.', so checking the single period first would strip only one dot
    and leave 'text..' behind.
    """
    if msg.endswith('...'):
        return msg[:-3]
    if msg.endswith('.'):
        return msg[:-1]
    return msg

def preprocess_msg(msg):
    """Normalise one commit message: drop the colon prefix, then trailing punctuation."""
    without_prefix = preprocess_prefix_colon(msg)
    return preprocess_suffix_punctuation(without_prefix)
    
def preprocess_msgs(dataset_folder_name):
    """Write msgs_preprocessed.txt next to msgs.txt for the given dataset folder.

    Every line of msgs.txt is stripped and passed through preprocess_msg;
    the results are written newline-separated, without a trailing newline.
    """
    dataset_dir = '../Tufano_data/' + f'datasets/{dataset_folder_name}/'
    with open(dataset_dir + 'msgs.txt', 'r') as msgs_file, \
         open(dataset_dir + 'msgs_preprocessed.txt', 'w') as msgs_preprocessed_file:
        cleaned = []
        for raw_line in msgs_file:
            cleaned.append(preprocess_msg(raw_line.strip()))
        msgs_preprocessed_file.write('\n'.join(cleaned))

In [353]:
preprocess_msg('webadmin:Removed console tab from new/edit cluster popup in gluster only mode')

'Removed console tab from new/edit cluster popup in gluster only mode'

In [338]:
# NOTE(review): stale cell - its execution counter (338) predates the current definition
# (cell 352), which requires dataset_folder_name; re-running this argument-less call
# raises TypeError.
preprocess_msgs()

In [357]:
def generate_unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction():
    """Export the filtered dataset with one folder per sample.

    Applies the same filters as the flat export (id present in IND_TO_ROW,
    no 'merge' word, at most one ': ' in the message). Each kept sample gets
    its own directory <n>/ (1-based, created with os.mkdir) holding
    prev_method.java and updated_method.java. ids.txt and msgs.txt are
    written with one line per kept sample, and both skip counters are printed.
    """
    from shutil import copyfile
    import os

    root = '../Tufano_data/'
    out_dir = root + 'datasets/unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction/'
    with open(out_dir + 'ids.txt', 'w') as ids_file, \
         open(out_dir + 'msgs.txt', 'w') as msgs_file:
        row_ids = []
        messages = []
        merge_count = 0
        colon_count = 0
        for project, pr_ids in LIST_OF_SINGLE_CHANGED_FILES.items():
            for pr_id in pr_ids:
                if pr_id not in IND_TO_ROW[project].keys():
                    print(f'{pr_id} not found in csv for project {project}, it will be skipped')
                    continue
                msg = COMMIT_MESSAGES[project][pr_id]
                words = msg.lower().split()
                if 'merge' in words:
                    merge_count += 1
                    continue
                # More than one ': ' separator means a nested prefix; those are dropped.
                if msg.count(': ') > 1:
                    colon_count += 1
                    continue
                # Messages are stored one per line, so they must be single-line.
                assert len(msg.splitlines()) == 1
                row_ids.append(str(IND_TO_ROW[project][pr_id]))
                messages.append(msg)
                # Folder names are 1-based and match the line number in ids.txt / msgs.txt.
                sample_no = len(row_ids)
                root_of_sample = out_dir + f'{sample_no}/'
                os.mkdir(root_of_sample)
                pair_dir = root + f'{project}/{pr_id}/0/0/'
                copyfile(pair_dir + 'before.java', root_of_sample + 'prev_method.java')
                copyfile(pair_dir + 'after.java', root_of_sample + 'updated_method.java')
        ids_file.write('\n'.join(row_ids))
        msgs_file.write('\n'.join(messages))
        print(f'Skipped merge commits: {merge_count}')
        print(f'Skipped colons: {colon_count}')

In [358]:
generate_unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction()

664242 not found in csv for project android, it will be skipped
688460 not found in csv for project android, it will be skipped
Skipped merge commits: 421
Skipped colons: 334


In [359]:
preprocess_msgs('unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction')

In [412]:
def collect_abstracted_code(dataset_folder_name, new_dataset_folder_name):
    """Merge per-sample abstraction output into line-aligned dataset files.

    Args:
        dataset_folder_name: folder under ../Tufano_data/datasets/ that holds
            one numbered sub-folder per sample, each containing
            abstracted/prev.txt and abstracted/updated.txt. Entries ending in
            '.txt' are ignored when listing it.
        new_dataset_folder_name: folder under ../Tufano_data/datasets/ that
            already holds msgs.txt and ids.txt (line k belongs to sample
            folder k) and receives the output files.

    Samples whose abstracted prev.txt is exactly '<ERROR>' are skipped. For
    the remaining samples prev.txt, updated.txt, msgs_new.txt and ids_new.txt
    are written with one aligned line per sample, and folder_names.txt
    records the originating folder number. A summary of skipped samples is
    printed relative to the number of sample folders actually found.
    """
    import os
    from pathlib import Path

    root = '../Tufano_data/'
    with open(root + f'datasets/{new_dataset_folder_name}/prev.txt', 'w') as prev_file, \
         open(root + f'datasets/{new_dataset_folder_name}/updated.txt', 'w') as updated_file, \
         open(root + f'datasets/{new_dataset_folder_name}/msgs.txt', 'r') as msgs_file, \
         open(root + f'datasets/{new_dataset_folder_name}/ids.txt', 'r') as ids_file, \
         open(root + f'datasets/{new_dataset_folder_name}/msgs_new.txt', 'w') as msgs_file_new, \
         open(root + f'datasets/{new_dataset_folder_name}/ids_new.txt', 'w') as ids_file_new, \
         open(root + f'datasets/{new_dataset_folder_name}/folder_names.txt', 'w') as folder_names:
        lines_prev = []
        lines_updated = []
        errored_skipped = 0
        msgs_lines = msgs_file.readlines()
        ids_lines = ids_file.readlines()
        new_msgs = []
        new_ids = []
        # Sort numerically so the output order follows the sample numbering.
        idx = sorted(
            int(name)
            for name in os.listdir(root + f'datasets/{dataset_folder_name}')
            if not name.endswith('.txt')
        )
        for i in idx:
            sample_dir = root + f'datasets/{dataset_folder_name}/{i}/abstracted/'
            prev = Path(sample_dir + 'prev.txt').read_text().strip()
            updated = Path(sample_dir + 'updated.txt').read_text().strip()
            if prev == '<ERROR>':
                print(f'Skipped {i} because of error')
                errored_skipped += 1
                continue
            lines_prev.append(prev)
            lines_updated.append(updated)
            # Sample folders are 1-based while the message/id lines are 0-based.
            new_msgs.append(msgs_lines[i - 1].strip())
            new_ids.append(ids_lines[i - 1].strip())
            folder_names.write(f'{i}\n')
        prev_file.write('\n'.join(lines_prev))
        updated_file.write('\n'.join(lines_updated))
        msgs_file_new.write('\n'.join(new_msgs))
        ids_file_new.write('\n'.join(new_ids))
        # Report against the real number of samples instead of a hard-coded total.
        total = len(idx)
        skipped_ratio = errored_skipped / total if total else 0.0
        print(f'Total skipped: {errored_skipped} / {total} = {skipped_ratio}')

In [413]:
collect_abstracted_code('unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction', 'final_dataset')

Skipped 6792 because of error
Skipped 8230 because of error
Skipped 8577 because of error
Skipped 8675 because of error
Skipped 8690 because of error
Skipped 8899 because of error
Skipped 8982 because of error
Skipped 9065 because of error
Skipped 9669 because of error
Skipped 9708 because of error
Skipped 9932 because of error
Skipped 9982 because of error
Skipped 10163 because of error
Skipped 10682 because of error
Skipped 11029 because of error
Skipped 11057 because of error
Skipped 11325 because of error
Skipped 11372 because of error
Skipped 11386 because of error
Skipped 11530 because of error
Skipped 11688 because of error
Skipped 11738 because of error
Skipped 11788 because of error
Skipped 12288 because of error
Skipped 12586 because of error
Skipped 13062 because of error
Total skipped: 26 / 20436 = 0.001272264631043257


In [414]:
def tokenize_messages(dataset_folder_name):
    """Tokenize msgs_new.txt with nltk and write msg_tokenized.txt.

    Each input line is stripped, split with nltk's word_tokenize and re-joined
    with single spaces. Output lines are newline-separated without a trailing
    newline.
    """
    from nltk.tokenize import word_tokenize

    dataset_dir = '../Tufano_data/' + f'datasets/{dataset_folder_name}/'
    with open(dataset_dir + 'msgs_new.txt', 'r') as msg_file, \
         open(dataset_dir + 'msg_tokenized.txt', 'w') as msg_new_file:
        tokenized_msgs = []
        for raw_line in msg_file:
            tokens = word_tokenize(raw_line.strip())
            tokenized_msgs.append(' '.join(tokens))
        msg_new_file.write('\n'.join(tokenized_msgs))

In [415]:
tokenize_messages('final_dataset')

In [429]:
def abstract_messages(dataset_folder_name, dataset_folder_name_mapping):
    """Replace message tokens using each sample's abstraction mapping.

    Args:
        dataset_folder_name: dataset folder holding msg_tokenized.txt and
            folder_names.txt; msg_abstract.txt is written there.
        dataset_folder_name_mapping: dataset folder whose per-sample
            sub-folders contain abstracted/mapping.txt.

    A token is replaced only when it is a key of the sample's mapping and
    contains at least one upper-case character (token != token.lower()).
    Prints how many messages were changed and their 1-based line numbers.
    """
    root = '../Tufano_data/'

    def read_mapping(folder_name):
        # mapping.txt is read as consecutive line pairs: a comma-terminated
        # list of keys followed by a comma-terminated list of values. The
        # first empty line in key position ends the mapping.
        mapping_path = root + f'datasets/{dataset_folder_name_mapping}/{folder_name}/abstracted/mapping.txt'
        with open(mapping_path, 'r') as mapping_file:
            rows = [row.strip() for row in mapping_file]
            table = {}
            for pos in range(0, len(rows), 2):
                if rows[pos] == '':
                    break
                key_row = rows[pos].split(',')[:-1]
                value_row = rows[pos + 1].split(',')[:-1]
                table.update(zip(key_row, value_row))
        return table

    dataset_dir = root + f'datasets/{dataset_folder_name}/'
    with open(dataset_dir + 'msg_tokenized.txt', 'r') as msg_file, \
         open(dataset_dir + 'msg_abstract.txt', 'w') as msg_new_file, \
         open(dataset_dir + 'folder_names.txt', 'r') as folder_names:
        msg_lines = [row.strip() for row in msg_file]
        folder_names_lines = [row.strip() for row in folder_names]
        abstracted_msgs = []
        changed_lines = []
        pairs = zip(msg_lines, folder_names_lines)
        for line_no, (msg, folder_name) in enumerate(pairs, start=1):
            mapping = read_mapping(folder_name)
            tokens = msg.split(' ')
            # Lower-case-only tokens are left alone even if the mapping has them.
            replaceable = [tok in mapping and tok.lower() != tok for tok in tokens]
            rewritten = [mapping[tok] if hit else tok for tok, hit in zip(tokens, replaceable)]
            if any(replaceable):
                changed_lines.append(line_no)
            abstracted_msgs.append(' '.join(rewritten))
        msg_new_file.write('\n'.join(abstracted_msgs))
        print(f'Msg changed: {len(changed_lines)} / {len(msg_lines)} = {len(changed_lines) / len(msg_lines)}')
        print(changed_lines)

In [430]:
abstract_messages('final_dataset', 'unabstracted_cmg_dataset_with_removed_merge_commits_with_removed_not_single_colons_for_abstraction')

Msg changed: 1223 / 20410 = 0.05992160705536502
[20, 22, 25, 57, 58, 67, 78, 80, 89, 99, 114, 162, 175, 201, 218, 244, 256, 267, 305, 310, 321, 333, 345, 351, 360, 371, 377, 390, 393, 423, 427, 429, 434, 438, 442, 459, 488, 508, 560, 562, 575, 579, 605, 622, 624, 625, 629, 637, 640, 666, 669, 691, 695, 719, 740, 742, 749, 751, 799, 809, 819, 821, 847, 864, 891, 908, 915, 936, 942, 966, 983, 990, 993, 1010, 1018, 1027, 1048, 1052, 1072, 1081, 1114, 1131, 1134, 1137, 1149, 1182, 1190, 1196, 1197, 1206, 1235, 1251, 1259, 1263, 1281, 1295, 1296, 1299, 1305, 1307, 1308, 1317, 1336, 1339, 1343, 1352, 1354, 1360, 1364, 1378, 1381, 1396, 1406, 1411, 1426, 1435, 1479, 1489, 1502, 1524, 1525, 1533, 1540, 1566, 1572, 1574, 1587, 1591, 1593, 1596, 1619, 1630, 1639, 1654, 1655, 1672, 1697, 1698, 1735, 1745, 1789, 1802, 1812, 1814, 1817, 1818, 1825, 1835, 1838, 1842, 1860, 1867, 1876, 1898, 1915, 1935, 1958, 2001, 2004, 2025, 2029, 2054, 2071, 2097, 2098, 2111, 2127, 2137, 2140, 2156, 2165, 2184, 21

In [454]:
def analyze_methods_length(predicate, label):
    """Return the 0-based line indices of method pairs accepted by predicate.

    Args:
        predicate: callable taking (prev_tokens, updated_tokens), where each
            argument is the stripped line split on single spaces.
        label: text printed before the result summary.

    Reads prev.txt and updated.txt from the final_dataset folder, prints the
    label and the accepted / total ratio, and returns the accepted indices.
    """
    dataset_dir = '../Tufano_data/' + 'datasets/final_dataset/'
    with open(dataset_dir + 'prev.txt', 'r') as prev_file, \
         open(dataset_dir + 'updated.txt', 'r') as updated_file:
        prev_lines = [row.strip() for row in prev_file]
        updated_lines = [row.strip() for row in updated_file]
        correct_idx = [
            pos
            for pos, (before, after) in enumerate(zip(prev_lines, updated_lines))
            if predicate(before.split(' '), after.split(' '))
        ]
        print(label)
        print(f'Result: {len(correct_idx)} / {len(prev_lines)} = {len(correct_idx) / len(prev_lines)}')
        return correct_idx

In [462]:
analyze_methods_length(lambda p, u: len(p) <= 100 and len(u) <= 100, 'All method pairs are <= 100')[:10]

All method pairs are <= 100
Result: 6593 / 20410 = 0.3230279274865262


[3, 8, 9, 11, 13, 14, 18, 20, 22, 23]

In [457]:
out = analyze_methods_length(lambda p, u: len(p) <= 150 and len(u) <= 150, 'All method pairs are <= 150')

All method pairs are <= 150
Result: 9519 / 20410 = 0.4663890249877511


In [459]:
out = analyze_methods_length(lambda p, u: len(p) <= 200 and len(u) <= 200, 'All method pairs are <= 200')

All method pairs are <= 200
Result: 11693 / 20410 = 0.5729054385105341


In [464]:
def filter_by_methods_length(predicate, label):
    """Write a filtered_<name> copy of every .txt file in final_dataset.

    Args:
        predicate: callable taking (prev_tokens, updated_tokens); forwarded to
            analyze_methods_length to select the line indices to keep.
        label: text printed by analyze_methods_length.

    Only regular '.txt' files are processed. Sub-directories and previously
    generated 'filtered_*' files are skipped, so the function can be re-run
    safely and does not fail once train/val/test folders exist.

    NOTE(review): the same indices are applied to every file, so a file whose
    lines are not aligned with prev.txt yields a misaligned filtered copy.
    """
    import os

    dataset_dir = '../Tufano_data/' + 'datasets/final_dataset/'
    idx = analyze_methods_length(predicate, label)
    for filename in os.listdir(dataset_dir):
        in_path = dataset_dir + filename
        if not filename.endswith('.txt') or filename.startswith('filtered_'):
            continue
        if not os.path.isfile(in_path):
            continue
        with open(in_path, 'r') as in_file:
            in_lines = [row.strip() for row in in_file]
        # Select before opening the output so a failure leaves no empty file behind.
        filtered_lines = [in_lines[i] for i in idx]
        with open(dataset_dir + 'filtered_' + filename, 'w') as out_file:
            out_file.write('\n'.join(filtered_lines))

In [465]:
filter_by_methods_length(lambda p, u: len(p) <= 100 and len(u) <= 100, 'All method pairs are <= 100')

All method pairs are <= 100
Result: 6593 / 20410 = 0.3230279274865262


In [483]:
def split_train_val_test(seed=None):
    """Randomly split every .txt file of final_dataset into train/val/test (80/10/10).

    Args:
        seed: optional seed for the shuffle. With the default None the
            module-level random generator is used, as before; passing a value
            makes the split reproducible.

    The number of samples is the line count of prev.txt. One shuffled index
    permutation is applied to all .txt files, so the output files stay
    line-aligned with each other. The train/, val/ and test/ sub-folders are
    created if they do not exist yet.
    """
    import os
    import random

    dataset_dir = '../Tufano_data/' + 'datasets/final_dataset/'
    # Count lines through a context manager so the handle is closed again.
    with open(dataset_dir + 'prev.txt', 'r') as prev_file:
        dataset_size = sum(1 for _ in prev_file)
    print(dataset_size)

    idx = list(range(dataset_size))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(idx)
    b1 = int(len(idx) * 0.8)
    b2 = int(len(idx) * 0.9)
    splits = {
        'train': idx[:b1],
        'val': idx[b1:b2],
        'test': idx[b2:],
    }
    train_idx, val_idx, test_idx = splits['train'], splits['val'], splits['test']
    print(f'train {len(train_idx)}, val {len(val_idx)}, test {len(test_idx)}')
    print(f'test idx[:10]: {test_idx[:10]}')

    for split_name in splits:
        os.makedirs(dataset_dir + split_name, exist_ok=True)

    for filename in os.listdir(dataset_dir):
        # Skips the split sub-folders and anything that is not a dataset file.
        if not filename.endswith('.txt'):
            continue
        with open(dataset_dir + filename, 'r') as in_file:
            in_lines = [row.strip() for row in in_file]
        for split_name, split_idx in splits.items():
            with open(dataset_dir + f'{split_name}/' + filename, 'w') as out:
                out.write('\n'.join([in_lines[i] for i in split_idx]))

In [484]:
split_train_val_test()

6593
train 5274, val 659, test 660
test idx[:10]: [3601, 4083, 2663, 5319, 5962, 361, 6212, 830, 5619, 1663]
