In [1]:
import os
import pandas as pd

def check_files(directory1, directory2):
    """
    Compare entry names (ignoring extensions) between two directories.

    The name of an entry is everything before its first ".", so "a.txt" and
    "a.tar.gz" both compare as "a". Entries without any "." are compared by
    their full name. A report of the comparison is printed to stdout.

    Args:
        directory1 (str): Path to the first directory.
        directory2 (str): Path to the second directory.

    Returns:
        dict: A dictionary with three entries (the misspelled keys are kept
              as-is because callers index the result by them):
              - "intersection": set of names present in both directories.
              - "excluisive_directory1": sorted list of names found only in
                directory1.
              - "excluisive_directory2": sorted list of names found only in
                directory2.

    Raises:
        OSError: Propagated from os.listdir when a directory cannot be listed.
    """
    # str.partition returns the whole string when there is no "." at all.
    # Slicing with str.find would use -1 in that case and silently drop the
    # last character of the name (e.g. "README" -> "READM").
    names1 = {entry.partition(".")[0] for entry in os.listdir(directory1)}
    names2 = {entry.partition(".")[0] for entry in os.listdir(directory2)}

    result = {
        "intersection": names1 & names2,
        "excluisive_directory1": sorted(names1 - names2),
        "excluisive_directory2": sorted(names2 - names1),
    }

    print(f"Directories compared: {directory1} and {directory2}")
    print(f"Intersection: {result['intersection']}")
    print(f"Exclusive files in directory 1: {result['excluisive_directory1']}")
    print(f"Exclusive files in directory 2: {result['excluisive_directory2']}")
    print("=" * 250)

    return result

def check_keys(csv_path, prefix="G.D"):
    """
    Checks whether every value in the "keys" column of a CSV file starts with
    a given prefix ("G.D" by default).

    Each value is converted with str() before the prefix test, so non-string
    cells (numbers, missing values) are tested on their string form.

    Args:
        csv_path (str): Path to the CSV file.
        prefix (str): Prefix a key must start with to be valid. Defaults to
            "G.D".

    Returns:
        dict: On success, a dictionary containing:
              - "valid_keys": List of keys that start with the prefix.
              - "invalid_keys": List of keys that do not start with the prefix.
              - "total_keys": Total number of keys checked.
              - "valid_count": Number of valid keys.
              - "invalid_count": Number of invalid keys.
              On failure, a dictionary with a single "error" entry holding a
              message (the file could not be read, or it has no "keys"
              column).
    """
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported to the caller
        # through the "error" entry instead of being raised.
        return {"error": f"Error reading CSV file: {e}"}

    if "keys" not in df.columns:
        return {"error": "The 'keys' column was not found in the CSV file."}

    keys = df["keys"]

    # Single pass: each key is classified once and keeps its original order.
    valid_keys = []
    invalid_keys = []
    for key in keys:
        if str(key).startswith(prefix):
            valid_keys.append(key)
        else:
            invalid_keys.append(key)

    return {
        "valid_keys": valid_keys,
        "invalid_keys": invalid_keys,
        "total_keys": len(keys),
        "valid_count": len(valid_keys),
        "invalid_count": len(invalid_keys)
    }
    
def generate_filename_map(directory):
    """
    Generates a mapping from entry names in a directory to listing positions.

    The name of an entry is everything before its first "."; entries without
    any "." keep their full name. Positions are the indices in the list
    returned by os.listdir, whose order is arbitrary, so they are not
    guaranteed to be stable across machines or runs. When two entries share
    the same name (e.g. "a.txt" and "a.csv"), the later position wins.

    Args:
        directory (str): Path to the directory.

    Returns:
        dict: A dictionary where the keys are filenames without extensions and
              the values are their respective positions in the directory
              listing.

    Raises:
        OSError: Propagated from os.listdir when the directory cannot be
            listed.
    """
    # str.partition keeps extension-less names intact; slicing with str.find
    # would use -1 for them and cut off their last character.
    return {
        entry.partition(".")[0]: position
        for position, entry in enumerate(os.listdir(directory))
    }


In [2]:
# Folder locations used throughout the notebook. Every path is relative to the
# directory the notebook is run from.
_dataset_root = os.path.join("data", "ReqList_ReqNet_ReqSim")
_mocks_root = os.path.join("tests", "mocks")

# Dataset folders.
req_doc_path = os.path.join(_dataset_root, "0 Requirement Specification Documents")
req_doc_raw_path = os.path.join(_dataset_root, "0.1 Raw Text")
req_list_path = os.path.join(_dataset_root, "1 ReqLists")
doc_structure_path = os.path.join(_dataset_root, "2 DocumentStructure - Metadata")

# Mock folders used by the tests.
mock_req_doc_raw_path = os.path.join(_mocks_root, "mock_0_req_doc_raw_text")
mock_req_list_path = os.path.join(_mocks_root, "mock_1_req_lists")
mock_doc_structure_path = os.path.join(_mocks_root, "mock_2_doc_struct_metadata")

# Currently only the metadata mock folder is selected. The other paths defined
# above (req_doc_path, req_doc_raw_path, req_list_path, doc_structure_path,
# mock_req_doc_raw_path, mock_req_list_path) can be added to this list.
directories = [mock_doc_structure_path]

In [8]:
# Scratch arithmetic whose result is shown as the cell output.
# NOTE(review): the meaning of 8799 and 12 is not stated anywhere visible in
# the notebook — name them as constants or remove this cell.
8799/12

733.25

In [7]:
# Display the name -> listing-position map for the raw-text folder
# (req_doc_raw_path); the returned dict is the cell's output.
generate_filename_map(req_doc_raw_path)


{'1997 - Modis': 0,
 '2012 - EMR HHS': 1,
 '2004 - sprat': 2,
 '2002 - evla corr': 3,
 '2011 - KMS': 4,
 '2012 - EMR CCHCS EA': 5,
 '2004 - SGVTraffic': 6,
 '2022 - UAM IMS': 7,
 '2012 - EMR HL7 DC': 8,
 '2000 - nasa x38': 9,
 '2007 - eirene fun 7': 10,
 '2005 - phin': 11,
 '1995 - gemini': 12,
 '2011 - CCHIT': 13,
 '2019 - MOSAR': 14,
 '2022 - MobileSurveillance': 15,
 '2012 - EMR Pharmacy': 16,
 '2017 - NISTMfgData': 17,
 '2005 - clarus low': 18,
 '2018 - DataWarehouse': 19,
 '2007 - get real 0': 20,
 '2005 - grid 3D': 21,
 '2007 - ertms': 22,
 '2010 - blit draft': 23,
 '1999 - tcs': 24,
 '2010 - split merge': 25,
 '2008 - caiso': 26,
 'EHR System FuncReq - LA DHS': 27,
 '2002 - evla back': 28,
 '2008 - virtual ed': 29,
 '2007 - e-store': 30,
 '2000 - Barrel': 31,
 '2006 - stewards': 32,
 '2009 - library2': 33,
 '2005 - pontis': 34,
 '2001 - beyond': 35,
 '1999 - multi-mahjong': 36,
 '2009 - library': 37,
 '2009 - peppol': 38,
 '2005 - znix': 39,
 '2017 - ePhyto': 40,
 '0000 - gamma 

In [4]:
# Pairwise name comparisons between the dataset folders. Each check_files call
# prints its own report and returns the result dict kept in the variable.
# The comparisons that involve req_doc_path are currently disabled.
#req_x_req_doc = check_files(req_list_path, req_doc_path)
req_x_req_doc_raw = check_files(req_list_path, req_doc_raw_path)
req_x_doc_structure = check_files(req_list_path, doc_structure_path)
#req_doc_x_doc_structure = check_files(req_doc_path, doc_structure_path)
#req_doc_x_req_doc_raw = check_files(req_doc_path, req_doc_raw_path)
req_doc_raw_x_doc_structure = check_files(req_doc_raw_path, doc_structure_path)

Directories compared: data/ReqList_ReqNet_ReqSim/1 ReqLists and data/ReqList_ReqNet_ReqSim/0.1 Raw Text
Intersection: {'2009 - library', '2014 - Minutia', '2005 - grid 3D', '2012 - NASAProcessReq', '2000 - nasa x38', '2011 - KMS', '2005 - clarus low', '2006 - eirene sys 15', '1998 - themas', '2016 - FDP Clearinghouse', '2002 - evla corr', '2018 - DataWarehouse', '2010 - mashboot', '2003 - agentmom', '1997 - Modis', '0000 - inventory', '2009 - gaia', '0000 - gamma j', '2004 - grid bgc', '0000 - cctns', '2021 - ReqView', '2019 - MOSAR', '2000 - Barrel', '2005 - nenios', '2022 - MobileSurveillance', '2007 - mdot', '2010 - split merge', '1995 - Landsat7', 'automated-insulin-pump', '2008 - virtual ed', '2001 - elsfork', '2022 - UAM IMS', '2001 - beyond', '2010 - blit draft', '2012 - EMR CCHIT LT', '2007 - e-store', '2005 - triangle', '2010 - fishing', '2021 - ConnectedVehiclePilotNYC', '2017 - NISTMfgData', '1999 - multi-mahjong', '2008 - caiso', '2013 - iTrust', '2001 - ctc network', '2012

In [5]:

# Validate the key prefix of every row in the user-stories CSV and print a
# short report (the report text is in Portuguese).
csv_path = os.path.join("data", "pure_req_user_stories.csv")
result = check_keys(csv_path)

if "error" not in result:
    # Summary counters, one per line.
    print(
        f"Total de chaves verificadas: {result['total_keys']}",
        f"Chaves válidas: {result['valid_count']}",
        f"Chaves inválidas: {result['invalid_count']}",
        sep="\n",
    )

    if not result["invalid_keys"]:
        print("\nTodas as chaves são válidas!")
    else:
        # List every offending key so it can be fixed in the CSV.
        print("\nChaves inválidas encontradas:")
        for key in result["invalid_keys"]:
            print(f"- {key}")
else:
    # check_keys reports read/schema problems through the "error" entry.
    print(result["error"])

Total de chaves verificadas: 12725
Chaves válidas: 12725
Chaves inválidas: 0

Todas as chaves são válidas!


In [6]:
# NOTE(review): presumably maps each combined attachment document to the
# names of the requirement lists taken from it — confirm against the dataset.
doc_req = {
    "2012 - EMR CCHCS EA ISO HL7 IN 20120420_Attach8": [
        "2012 - EMR CCHCS EA",
        "2012 - EMR CCHCS ISO",
        "2012 - EMR HL7 IN",
    ],
    "2012 - EMR HL7 DC - CCHIT LT - Pharmacy - HHS_Attach9": [
        "2012 - EMR CCHIT LT",
        "2012 - EMR HHS",
        "2012 - EMR HL7 DC",
        "2012 - EMR Pharmacy",
    ],
}