In [10]:
import pandas as pd
import json
from tqdm.auto import tqdm

# vocab-based results

In [11]:
projects = ['commons-imaging-1.0-alpha3-src', 'spark', 'commons-lang3-3.12.0-src', 'http-request', 'commons-geometry-1.0-src', 'springside4', 'commons-jexl3-3.2.1-src', 'joda-time', 'async-http-client', 'JSON-java', 'bcel-6.5.0-src', 'commons-weaver-2.0-src', 'commons-numbers-1.0-src', 'commons-collections4-4.4-src', 'jsoup', 'commons-jcs3-3.1-src', 'commons-validator-1.7', 'commons-net-3.8.0', 'commons-beanutils-1.9.4', 'commons-pool2-2.11.1-src', 'commons-rng-1.4-src', 'commons-configuration2-2.8.0-src', 'commons-vfs-2.9.0', 'scribejava', 'commons-dbutils-1.7']
vocab = pd.read_json(f'../fold0/out-of-vocab.json', orient='index')
results = pd.read_csv(f'../fold0/test_stats.csv')
results[0] = results.apply(lambda row: 'P' if row[0] == 1 else 'F', axis=1)
vocab['predicted'] = results[0]

In [12]:
vocab['prc_C_fail'] = vocab.apply(lambda row: round(row['C_tokens_fail'] / row['C_tokens'], 4), axis=1)
vocab['prc_T_fail'] = vocab.apply(lambda row: round(row['T_tokens_fail'] / row['T_tokens'], 4), axis=1)
vocab['prc_tokens_fail'] = vocab.apply(lambda row: round((row['T_tokens_fail'] + row['C_tokens_fail']) / (row['C_tokens'] + row['T_tokens']), 4), axis=1)
# Changing column order
vocab = vocab[['project', 'label', 'predicted', 'C_tokens', 'C_tokens_fail', 'T_tokens', 'T_tokens_fail', 'prc_C_fail', 'prc_T_fail', 'prc_tokens_fail']]
vocab.head()

Unnamed: 0,project,label,predicted,C_tokens,C_tokens_fail,T_tokens,T_tokens_fail,prc_C_fail,prc_T_fail,prc_tokens_fail
0,commons-imaging-1.0-alpha3-src,P,F,7,2,63,37,0.2857,0.5873,0.5571
1,commons-imaging-1.0-alpha3-src,P,F,7,2,63,37,0.2857,0.5873,0.5571
2,commons-imaging-1.0-alpha3-src,P,F,7,2,63,37,0.2857,0.5873,0.5571
3,commons-imaging-1.0-alpha3-src,P,F,7,2,63,37,0.2857,0.5873,0.5571
4,commons-imaging-1.0-alpha3-src,P,F,7,2,63,37,0.2857,0.5873,0.5571


In [13]:
vocab.sort_values(by=['prc_tokens_fail'], ascending=True).to_csv(f'./out-of-vocab.csv', index=True)
vocab.sort_values(by=['prc_tokens_fail'], ascending=True)

Unnamed: 0,project,label,predicted,C_tokens,C_tokens_fail,T_tokens,T_tokens_fail,prc_C_fail,prc_T_fail,prc_tokens_fail
35775,joda-time,F,P,46,0,9,0,0.0000,0.0000,0.0000
35773,joda-time,F,P,46,0,9,0,0.0000,0.0000,0.0000
35432,joda-time,F,P,132,0,9,1,0.0000,0.1111,0.0071
35397,joda-time,F,P,132,0,9,2,0.0000,0.2222,0.0142
35426,joda-time,F,P,132,0,9,2,0.0000,0.2222,0.0142
...,...,...,...,...,...,...,...,...,...,...
62823,async-http-client,P,P,7,2,70,50,0.2857,0.7143,0.6753
142022,commons-jcs3-3.1-src,P,F,8,4,59,42,0.5000,0.7119,0.6866
166858,commons-configuration2-2.8.0-src,P,P,9,3,82,62,0.3333,0.7561,0.7143
1900,commons-imaging-1.0-alpha3-src,P,P,7,2,56,43,0.2857,0.7679,0.7143


In [29]:
thresholds = [.05, .10, .25, .50, .75, .90, .95]
N = []
for threshold in thresholds:
    temp = vocab.copy()
    temp = temp[temp['prc_tokens_fail'] < threshold]
    N.append(len(temp))

    temp['Predicted Label'] = temp.apply(lambda row: 1 if row['predicted'] == 'P' else 0, axis=1)
    temp['Actual Label'] = temp.apply(lambda row: 1 if row['label'] == 'P' else 0, axis=1)
    string = "{:.2f}".format(threshold)
    temp[['Predicted Label', 'Actual Label']].to_csv(f'./test_stats_{string[2:]}.csv', index=False)

## Summary stats

In [14]:
columns = ['project', 'total_C_tokens', 'total_C_fail', 'ratio_C', 'total_T_tokens', 'total_T_fail', 'ratio_T']
vocab_summary = pd.DataFrame(columns=columns)
for project in projects:
    temp = vocab[vocab['project']==project]
    total_C_tokens = temp['C_tokens'].sum()
    total_T_tokens = temp['T_tokens'].sum()
    total_T_fail = temp['T_tokens_fail'].sum()
    total_C_fail = temp['C_tokens_fail'].sum()
    ratio_C = total_C_fail / total_C_tokens
    ratio_T = total_T_fail / total_T_tokens
    data = [project, total_C_tokens, total_C_fail, ratio_C, total_T_tokens, total_T_fail, ratio_T]

    temp_df = pd.DataFrame(data).T
    temp_df.columns = columns
    vocab_summary = pd.concat([vocab_summary, temp_df], ignore_index=True)

vocab_summary['ratio_C'] = vocab_summary.apply(lambda row: round(row['ratio_C'], 2), axis=1)
vocab_summary['ratio_T'] = vocab_summary.apply(lambda row: round(row['ratio_C'], 2), axis=1)
print("average out of vocab in code:", round(vocab_summary['ratio_C'].mean(), 3))
vocab_summary[['project', 'ratio_C', 'ratio_T']].to_csv('./out-of-vocab-summary.csv', index=False)
vocab_summary[['project', 'ratio_C', 'ratio_T']]

average out of vocab in code: 0.291


Unnamed: 0,project,ratio_C,ratio_T
0,commons-imaging-1.0-alpha3-src,0.3,0.3
1,spark,0.36,0.36
2,commons-lang3-3.12.0-src,0.2,0.2
3,http-request,0.27,0.27
4,commons-geometry-1.0-src,0.33,0.33
5,springside4,0.25,0.25
6,commons-jexl3-3.2.1-src,0.27,0.27
7,joda-time,0.29,0.29
8,async-http-client,0.38,0.38
9,JSON-java,0.23,0.23
