In [None]:
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter
import numpy as np
import os, json, fasttext, re
from lxml import etree as ET
import copy
import unicodedata as ud

In [None]:
# Enable tqdm for pandas
tqdm.pandas()

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Initial data cleaning

In [None]:
data_root = 'data/repo-genmymodel-uml/data'

In [None]:
filenames = next(os.walk(data_root), (None, None, []))[2]

# Start wrangling

In [None]:
len(filenames)

In [None]:
filenames[0]

## Defining patterns

### XML UML tag patterns

In ed9c3064-7fb5-4ec0-bded-c114251076fa.xmi.uml, packagedElement tags are used to define classes (```uml:Class```): 

```
<packagedElement xsi:type="uml:Class" xmi:id="_eI6xoFdMEDCGTb6ZuLm3Mw" name="Client">
```

The ownedAttribute tag holds attribute names:

```
<ownedAttribute xmi:id="_GVIe2JUXEeqqGZh46IEtXQ" name="id">
```

## Search for patterns in files


In [None]:
def get_classes_and_attributes_from_uml(filepath):
    """Extract class names and attribute names from one UML/XMI file.

    Parameters
    ----------
    filepath : str
        File name, resolved relative to the module-level `data_root`.

    Returns
    -------
    dict
        Empty when no non-blank class name was found. Otherwise it holds
        'classes' (stripped, non-blank `name` values of every
        `packagedElement` with `xsi:type="uml:Class"`) and, only when at least
        one exists, 'attributes' (stripped, non-blank names of the
        `ownedAttribute` children of those elements).

    Raises
    ------
    Whatever `ET.parse` raises for unreadable or malformed files.
    """
    root = ET.parse(data_root + '/' + filepath)

    # Gather classes and attributes using defined patterns
    xsi_namespace = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
    class_xpath = "//packagedElement[@xsi:type='uml:Class']"
    classes = root.xpath(class_xpath + "/@name", namespaces=xsi_namespace)
    attributes = root.xpath(class_xpath + "/ownedAttribute/@name", namespaces=xsi_namespace)

    # Always build both lists, so neither name can be unbound further down
    # (previously a file with attributes but no class names raised NameError).
    found_classes = [name.strip() for name in classes if name.strip() != '']
    found_attributes = [name.strip() for name in attributes if name.strip() != '']

    found_data = {}
    if found_classes:
        found_data['classes'] = found_classes
        # NOTE(review): attributes are still only recorded for files that have
        # class names, as before, because later cells index ['classes'] on
        # every stored file. Empty attribute lists are no longer stored.
        if found_attributes:
            found_data['attributes'] = found_attributes

    return found_data

In [None]:
get_classes_and_attributes_from_uml('ed9c3064-7fb5-4ec0-bded-c114251076fa.xmi.uml')

In [None]:
data_dict = {}

for file in tqdm(filenames):
    try:
        data = get_classes_and_attributes_from_uml(file)
    except Exception as e:
        # Report which file failed and skip it. Without the `continue`, `data`
        # would still hold the previous file's result (or be undefined on the
        # first iteration) and be stored under this file's name.
        print(f'{file}: {e}')
        continue

    # Only keep files that yielded at least one class or attribute
    if data:
        data_dict[file] = data

In [None]:
data_dict

In [None]:
# Sanity check: print every stored file that has no class names
for file, extracted in data_dict.items():
    if not extracted.get('classes'):
        print(file)
        print(extracted)

In [None]:
# Number of unique class names across all files
all_class_names = [
    name.strip()
    for extracted in data_dict.values()
    for name in extracted['classes']
]
print(len(np.unique(np.array(all_class_names))))

# Number of unique attribute names across all files (not every file has attributes)
all_attribute_names = [
    str(attr).strip()
    for extracted in data_dict.values()
    if 'attributes' in extracted
    for attr in extracted['attributes']
]
print(len(np.unique(np.array(all_attribute_names))))

In [None]:
with open('data/genmymodel_uml_extracted_metadata.json', 'w') as fp:
    json.dump(data_dict, fp)

## First "raw" data cleaning cycle

In [None]:
with open('data/genmymodel_uml_extracted_metadata.json') as json_file:
    data_dict = json.load(json_file)

In [None]:
def should_be_kept(text):
    """Decide whether an extracted class/attribute name should stay in the dataset.

    Returns False for strings of at most one character, names ending in '()',
    getter/setter names, dot-separated names and comma-separated lists;
    True for everything else.
    """
    # Empty strings and single characters are dropped
    if len(text) <= 1:
        return False

    # Function calls '_8Ux_QFaxEeSu48YT12vqpQ.xmi.uml'
    if text.endswith('()'):
        return False

    rejected_patterns = (
        # Getters and setters '_UWr2cM_JEeeLcIicqHdTUQ.xmi.uml' '_va9xEFzhEeqK2M3E1LfZ7Q.xmi.uml'
        r'^[gs]et(?:[A-Z]|_)\w+',
        # Dot-separated widgets/views and filenames '_tGyV0EbLEeeTJ_4Vl2J2rQ.xmi.uml' '_WZQeEMJ8EeSII650IQ0Z1w.xmi.uml'
        r'^\w+\.\w+(?:\.\w+)*',
        # Comma-separated attributes 'f6e1ef7a-107f-419e-ab7d-a51aa49c13d2.xmi.uml'
        r'^\w+?(?:,\w+?)+',
    )

    # The text may stay only if none of the patterns match
    return not any(re.search(pattern, text) for pattern in rejected_patterns)

# Apply the keep/drop rule to every extracted name; the per-file structure
# (and every metadata key, even if its list becomes empty) is preserved.
data_dict = {
    filename: {
        metadata: [text for text in texts if should_be_kept(text)]
        for metadata, texts in extracted_data.items()
    }
    for filename, extracted_data in tqdm(data_dict.items())
}

In [None]:
# Number of unique class names across all files
all_class_names = [
    name.strip()
    for extracted in data_dict.values()
    for name in extracted['classes']
]
print(len(np.unique(np.array(all_class_names))))

# Number of unique attribute names across all files (not every file has attributes)
all_attribute_names = [
    str(attr).strip()
    for extracted in data_dict.values()
    if 'attributes' in extracted
    for attr in extracted['attributes']
]
print(len(np.unique(np.array(all_attribute_names))))

In [None]:
# Remove classes/attributes keys if they don't hold any values.
# A list() snapshot of the keys is enough to allow deletion while iterating;
# deep-copying the whole dataset twice only cost time and memory.
for file in tqdm(list(data_dict.keys())):
    for metadata in list(data_dict[file].keys()):
        if len(data_dict[file][metadata]) == 0:
            del data_dict[file][metadata]

# Remove files from dataset if they don't hold any classes and attributes anymore
for file in tqdm(list(data_dict.keys())):
    if len(data_dict[file]) == 0:
        del data_dict[file]

In [None]:
with open('data/genmymodel_uml_extracted_metadata_cleaned1.json', 'w') as fp:
    json.dump(data_dict, fp)

In [None]:
data_dict

## Language detection

Because the dataset contains class and attribute names in many languages, we identify the files whose names are predominantly English and filter out the rest. For this we use a pre-trained fastText language-identification model (`lid.176.bin`).

In [None]:
# Reload data_dict from file
with open('data/genmymodel_uml_extracted_metadata_cleaned1.json') as json_file:
    data_dict = json.load(json_file)


def _identifier_to_text(name):
    """Turn a programmatic name (camelCase / snake_case) into lower-case, space-separated text."""
    words = []
    for data_str in name.split(' '):
        # Underscores become spaces, then a space is inserted at camelCase boundaries
        spaced = data_str.replace('_', ' ').strip()
        words.append(re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', spaced).lower())
    return ' '.join(words)


# Make sure all programmatic cases are turned into spaces like "normal" text for better language detection
cleaned_data = {
    file: {
        key: [_identifier_to_text(name) for name in value]
        for key, value in metadata.items()
    }
    for file, metadata in tqdm(data_dict.items())
}

In [None]:
cleaned_data

In [None]:
PRETRAINED_MODEL_PATH = 'data/lid.176.bin'

model = fasttext.load_model(PRETRAINED_MODEL_PATH)

In [None]:
# Approximate the language from the extracted classes and attributes
def get_file_language(metadata, predictor=None):
    """Return the most common language label among a file's class/attribute names.

    Parameters
    ----------
    metadata : dict
        Maps metadata keys (e.g. 'classes', 'attributes') to lists of strings.
    predictor : object, optional
        Anything with a `predict(list_of_words)` method returning
        `(labels, probabilities)`, where `labels` holds one list of labels per
        word. Defaults to the module-level `model`.

    Returns
    -------
    str or None
        The label (e.g. '__label__en') predicted for the largest number of
        fields, or None when no field contains a word to classify.
    """
    if predictor is None:
        predictor = model

    # Specific approach for longer texts due to the habit of using English in programming
    def get_field_level_prediction(string):
        # Split on any whitespace so no empty "words" reach the predictor
        words = string.split()
        if not words:
            return None

        # Keep the top label of every word
        labels = [word_labels[0] for word_labels in predictor.predict(words)[0] if len(word_labels) > 0]
        if not labels:
            return None

        distinct_labels = set(labels)

        # Option 1: all words are identified as the same language
        if len(distinct_labels) == 1:
            return labels[0]

        # Option 2: exactly two words, one English programmatic word combined with
        # another language, e.g. "hovedmenu view" -> take the non-English label.
        # The check is on the number of words: the predictor result is always a
        # (labels, probabilities) pair, so testing its length was always true and
        # short-circuited the majority vote below.
        if len(words) == 2 and '__label__en' in distinct_labels:
            return next(label for label in labels if label != '__label__en')

        # Option 3: majority vote
        return Counter(labels).most_common(1)[0][0]

    # Combine all classes and attributes into one list
    all_metadata = [text for texts in metadata.values() for text in texts]

    # Identify the language of every field, ignoring fields without any word
    predictions = [get_field_level_prediction(text) for text in all_metadata]
    predictions = [label for label in predictions if label is not None]
    if not predictions:
        return None

    # Get the most common language over all fields
    return Counter(predictions).most_common(1)[0][0]

In [None]:
# Attach the detected language to every file's metadata under the 'lang' key
labeled_cleaned_data = {}
for key, value in tqdm(cleaned_data.items()):
    labeled_cleaned_data[key] = {**value, 'lang': get_file_language(value)}

In [None]:
labeled_cleaned_data

In [None]:
with open('data/genmymodel_uml_extracted_metadata_annotated.json', 'w') as fp:
    json.dump(labeled_cleaned_data, fp)

# Further data cleaning

In [None]:
with open('data/genmymodel_uml_extracted_metadata_annotated.json') as json_file:
    labeled_cleaned_data = json.load(json_file)

In [None]:
filtered_files = {key: value for key, value in labeled_cleaned_data.items() if value['lang'] == '__label__en'}

In [None]:
# Remove hand-picked files. pop() with a default keeps this cell re-runnable and
# tolerates files that are already absent (a plain `del` raised KeyError).
for file in ['_0se6kFNjEeS7SK6lkmJgjw.xmi.uml', '_zeUN8FJjEeeTnI9B59buBQ.xmi.uml', '_HKUJMMGAEeSBntdMhqoN9Q.xmi.uml', '48ecca9d-91d4-44f2-a21f-b0285228d504.xmi.uml']:
    filtered_files.pop(file, None)

In [None]:
filtered_files['_vZBssKMfEeScLuNN63kNaQ.xmi.uml']

In [None]:
def text_transform(text):
    """Normalise one class or attribute name by applying a fixed sequence of clean-up rules.

    The rules run in the order written below; the patterns are case-sensitive.

    Parameters
    ----------
    text : str
        The name to clean.

    Returns
    -------
    str
        The cleaned text with surrounding whitespace stripped (possibly empty).
    """
    # Remove HTML tags and other tags '_WZQeEMJ8EeSII650IQ0Z1w.xmi.uml'
    text = re.sub(r'<\w+?>', '', text)

    # Remove dollar sign at the beginning '_2p9bEMasEeadwOIOqK-0Fw.xmi.uml'
    text = re.sub(r'^\$(?=\w+)', '', text)

    # Remove all "my" things '_8y0sAIKDEeeveJPbhFhy-g.xmi.uml'
    # Anchored on a word boundary so words that merely end in "my"
    # (e.g. "dummy variable") are left intact.
    text = re.sub(r'\bmy ', '', text)

    # Drop parenthesised words, then replace remaining parentheses with spaces
    # '_2A0eUK99EealN5YbbDMqoA.xmi.uml' '_UxXH4Ji_EeWP9KF1Y_wKUg.xmi.uml'
    text = re.sub(r'\(\w+?\)', '', text)
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')

    # Remove all attached numbers '_cOog0CIgEeisAYMSV00L2Q.xmi.uml'
    # NOTE(review): \w also matches digits, so a standalone "123" becomes "1" - confirm intended
    text = re.sub(r'(?<=\w)\d', '', text)

    # Expand the "impl" abbreviation '19515821-de9c-4371-ab13-0c10c1f0741e.xmi.uml'
    text = re.sub(r'\bimpl\b', 'implementation', text)

    # Remove question marks attached to a word '_arOCQOkeEeiV94kHgjpOMg.xmi.uml'
    text = re.sub(r'(?<=\w)\?', '', text)

    # Re-combine dashes '_vZBssKMfEeScLuNN63kNaQ.xmi.uml'
    text = re.sub(r'-\s(?=\w+?)', '-', text)

    # Turn ref abbreviation into reference '_w-4dMCDQEeqqcaoAsxFIeg.xmi.uml'
    text = re.sub(r'\bref\b', 'reference', text)

    # Remove single letter with space at beginning '_zTVWYHphEeas_KSqa81bqw.xmi.uml'
    text = re.sub(r'^\w\s(?=\w+)', '', text)

    # Replace colons with spaces
    text = text.replace(':', ' ')

    # Remove duplicate whitespace
    text = re.sub(r'\s{2,}', ' ', text)

    # Unwrap the scripted pattern " comments";
    text = re.sub(r'" (.+)";{0,1}', r'\1', text)

    # Remove everything between square brackets (greedy: first '[' to last ']')
    text = re.sub(r'\[.*\]', '', text)

    # Remove dash at start
    text = re.sub(r'^-', '', text)

    # Remove plus at start
    text = re.sub(r'^\+', '', text)

    # Replace # with number when at end
    text = re.sub(r' ?#$', ' number', text)

    # Replace # with number at start
    text = re.sub(r'^#', 'number ', text)

    # Replace all dashes with spaces
    text = text.replace('-', ' ')

    # Remove a trailing asterisk and any remaining question marks
    # (raw string: '\*' in a plain literal is an invalid escape sequence)
    text = re.sub(r'\*$', '', text)
    text = text.replace('?', '')

    # Remove tags
    text = re.sub(r'.+?< ([\w\s]+)>', r'\1', text)
    text = re.sub(r'^<>', '', text)
    text = re.sub(r'<$', '', text)

    # Change ampersand to "and"
    text = text.replace(' & ', ' and ')

    # Remove backslash in front
    text = re.sub(r'^\\ ', '', text)

    # Collapse every run of spaces to one; a single replace('  ', ' ') pass
    # left double spaces behind for runs of three or more (e.g. from "a---b").
    text = re.sub(r' {2,}', ' ', text)

    # Trim output
    return text.strip()

text_transform('abstract resource item< style sheet item, style sheet params,url>')

In [None]:
# Clean every class/attribute name; the 'lang' label is dropped from the result
filtered_files = {
    filename: {
        metadata: [text_transform(text) for text in texts]
        for metadata, texts in extracted_data.items()
        if metadata != 'lang'
    }
    for filename, extracted_data in tqdm(filtered_files.items())
}

In [None]:
filtered_files

In [None]:
# Cache: character -> whether its Unicode name contains 'LATIN'
latin_letters = {}

def is_latin(uchr):
    """Return True when the Unicode name of `uchr` contains 'LATIN'; results are cached in `latin_letters`."""
    if uchr not in latin_letters:
        latin_letters[uchr] = 'LATIN' in ud.name(uchr)
    return latin_letters[uchr]

def only_roman_chars(unistr):
    """Return True when every alphabetic character of `unistr` passes `is_latin`; other characters are ignored."""
    for uchr in unistr:
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True

def not_three_consecutive_chars(unistr):
    """Return True unless `unistr` contains the same character three or more times in a row.

    An empty string contains no such run and yields True (indexing `unistr[0]`
    used to raise IndexError for it).
    """
    run_char = None
    run_length = 0

    for char in unistr:
        if char == run_char:
            run_length += 1
            if run_length > 2:
                return False
        else:
            # A different character starts a new run
            run_char = char
            run_length = 1

    return True

In [None]:
def _is_clean_english_term(x):
    """Return True when `x` passes every quality check and is predicted English by `model`."""
    return (
        # More than one character (this also excludes the empty string)
        len(x) > 1
        # Only letters/digits once spaces and apostrophes are ignored
        and x.replace(' ', '').replace("'", '').isalnum()
        and only_roman_chars(x)
        # At least one letter, and more than one distinct letter
        and any(c.isalpha() for c in x)
        and len(set(filter(str.isalpha, x))) != 1
        # NOTE(review): re.match only tests the START of the string, so doubled
        # vowels later in the text are not rejected - confirm that is intended.
        and not re.match(r'(?:aa)|(?:uu)|(?:yy)|(?:ii)|(?:jj)', x)
        and not_three_consecutive_chars(x)
        and '0f' not in x
        # Most expensive check last: the whole field must be predicted English
        and model.predict([x])[0][0][0] == '__label__en'
    )


# Iterate over snapshots of the keys so entries can be deleted while looping;
# a deep copy of the whole dataset is not needed for that.
for file in list(filtered_files.keys()):
    for metadata in list(filtered_files[file].keys()):
        # Remove all empty strings and non-alphanumerical entries
        new_metadata = [x for x in filtered_files[file][metadata] if _is_clean_english_term(x)]

        # Either remove the now-empty metadata or change it in the file
        if len(new_metadata) > 0:
            filtered_files[file][metadata] = new_metadata
        else:
            del filtered_files[file][metadata]

    # Remove file if it doesn't contain metadata anymore after filtering
    if len(filtered_files[file]) == 0:
        del filtered_files[file]

In [None]:
with open('data/genmymodel_uml_extracted_metadata_final.json', 'w') as fp:
    json.dump(filtered_files, fp)