In [266]:
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [267]:
# Source CSV exports, keyed by Path, each mapped to the names of the
# (URL column, title column) used by that particular export.
files = {
    Path('csv/ScienceDirect.csv'): ('Url', 'Title'),
    Path('csv/ieee.csv'): ('PDF Link', 'Document Title'),
    Path('csv/acm.csv'): ('Url', 'Title'),
    Path('csv/springer.csv'): ('URL', 'Item Title'),
}

In [268]:
# Read every source CSV into a DataFrame, keyed by the same Path used in `files`.
data = {}
for csv_path in files:
    data[csv_path] = pd.read_csv(csv_path)

# Report the overall paper count followed by a per-source breakdown.
paper_count = sum(map(len, data.values()))
print(f'Total papers: {paper_count} in {len(data)} sources:')
for csv_path, frame in data.items():
    print(f'\t{csv_path.name} -> {len(frame)}')

Total papers: 344 in 4 sources:
	ScienceDirect.csv -> 12
	ieee.csv -> 112
	acm.csv -> 41
	springer.csv -> 179


In [269]:
# Find papers whose (whitespace-stripped) title appears in more than one source.
#
# `duplicates` maps a title to a `(source, other_source)` pair of Paths.
# Every source is compared against every other one, so a shared title is
# visited from both sides and the stored pair is the last one encountered.

# Build the stripped-title set of each source once, instead of rebuilding it
# for every row. Non-string cells (e.g. NaN for a missing title) are skipped
# so `.strip()` is never called on a float.
stripped_titles = {
    src: {t.strip() for t in data[src][files[src][1]].values if isinstance(t, str)}
    for src in data.keys()
}

duplicates = {}

for f in files.keys():
    for raw_title in data[f][files[f][1]].values:
        if not isinstance(raw_title, str):
            continue  # missing title: nothing to compare
        title = raw_title.strip()
        for k, other_titles in stripped_titles.items():
            if k != f and title in other_titles:
                duplicates[title] = (f, k)

print(f'Total duplicates: {len(duplicates)}')
for k, v in duplicates.items():
    print(f'\t{v[0].name} and {v[1].name} -> {k}')

Total duplicates: 8
	acm.csv and ieee.csv -> Binary Code Clone Detection across Architectures and Compiling Configurations
	acm.csv and ieee.csv -> Transferring Code-Clone Detection and Analysis to Practice
	acm.csv and ieee.csv -> A Novel Neural Source Code Representation Based on Abstract Syntax Tree
	acm.csv and ieee.csv -> SCDetector: Software Functional Clone Detection Based on Semantic Tokens Analysis
	acm.csv and ieee.csv -> MoCoP: Towards a Model Clone Portal
	acm.csv and ieee.csv -> PHANTA: Diversified Test Code Quality Measurement for Modern Software Development
	acm.csv and ieee.csv -> A Mocktail of Source Code Representations
	acm.csv and ieee.csv -> Unleashing the Power of Compiler Intermediate Representation to Enhance Neural Program Embeddings


In [270]:
import webbrowser
from dotmap import DotMap

# Browser controller used to open each paper's URL during screening.
# Firefox is preferred; if it is not registered on this machine,
# `webbrowser.get` raises `webbrowser.Error`, in which case we fall back to
# the system default browser instead of aborting the notebook.
try:
    browser = webbrowser.get('firefox')
except webbrowser.Error:
    browser = webbrowser.get()

In [271]:
# Restore the decisions saved by previous screening sessions.
#
# Each of `relevant`, `discard` and `review` maps a source Path to a DataFrame
# holding the rows of that source already placed in that bucket.
relevant = {}
discard = {}
review = {}

saved_buckets = {'relevant': relevant, 'discard': discard, 'review': review}

for f, d in data.items():
    # Load every bucket independently: a missing file for one bucket must not
    # wipe the decisions that were successfully read for the other two.
    for folder, bucket in saved_buckets.items():
        try:
            bucket[f] = pd.read_csv(Path(f'./{folder}/{f.name}'))
        except FileNotFoundError:
            # Nothing saved yet: start from an empty frame with the source's columns.
            bucket[f] = pd.DataFrame(columns=d.columns)

total = sum(len(d) for s in (relevant, discard, review) for d in s.values())
print(f'Total files already seen: {total} in {len(relevant)} sources:')

Total files already seen: 298 in 4 sources:


In [272]:
# Interactive screening: open every not-yet-classified paper in the browser
# and ask the user where it belongs.
#   y -> relevant, n -> discard, r -> review later, q -> save and stop.
verdict_buckets = {'y': relevant, 'n': discard, 'r': review}
output_buckets = {'relevant': relevant, 'discard': discard, 'review': review}

for f, d in tqdm(data.items(), desc='Sources', total=len(data)):
    tqdm.write(f'Filtering {f}')
    stop_requested = False  # not named `quit`, which would shadow the builtin
    url_column = files[f][0]

    try:
        for idx, row in tqdm(d.iterrows(), desc='Papers', leave=False, total=len(d)):
            url = row[url_column]

            # Skip papers that already sit in any of the three buckets.
            if any(url in bucket[f][url_column].values for bucket in verdict_buckets.values()):
                continue

            browser.open(url)

            # The prompt lists every accepted answer, not only y/n.
            while (check := input(f'Keep {url}? [y]es / [n]o / [r]eview later / [q]uit: ')) not in ['y', 'n', 'q', 'r']:
                print('Invalid input')
            if check == 'q':
                stop_requested = True
                break
            chosen = verdict_buckets[check]
            chosen[f] = pd.concat([chosen[f], row.to_frame().T], ignore_index=True)
    finally:
        # Save even when the loop is interrupted (e.g. KeyboardInterrupt at the
        # prompt) so decisions made so far for this source are not lost.
        tqdm.write(f'Saving {f.name}')
        for folder, bucket in output_buckets.items():
            target = Path(f'./{folder}/{f.name}')
            # The output folders do not exist on a first run; create them.
            target.parent.mkdir(parents=True, exist_ok=True)
            bucket[f].to_csv(target, index=False)
    if stop_requested:
        break



Sources:   0%|          | 0/4 [00:00<?, ?it/s]

Filtering csv/ScienceDirect.csv


Papers:   0%|          | 0/12 [00:00<?, ?it/s]

Saving ScienceDirect.csv
Filtering csv/ieee.csv


Papers:   0%|          | 0/112 [00:00<?, ?it/s]

Saving ieee.csv
Filtering csv/acm.csv


Papers:   0%|          | 0/41 [00:00<?, ?it/s]

Saving acm.csv
Filtering csv/springer.csv


Papers:   0%|          | 0/179 [00:00<?, ?it/s]

Invalid input
Saving springer.csv


In [273]:
# Per-source tally of the screening decisions recorded in the three buckets.
print('Analysis results:')
for f in files:
    kept, dropped, pending = (len(bucket[f]) for bucket in (relevant, discard, review))
    print(f'\t{f.name} -> {kept} relevant, {dropped} discarded, {pending} to review')

Analysis results:
	ScienceDirect.csv -> 9 relevant, 3 discarded, 0 to review
	ieee.csv -> 81 relevant, 31 discarded, 0 to review
	acm.csv -> 21 relevant, 20 discarded, 0 to review
	springer.csv -> 39 relevant, 140 discarded, 0 to review
