In [1]:
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter
import numpy as np
import os, json, fasttext, re
from lxml import etree as ET

In [2]:
# Enable tqdm for pandas
tqdm.pandas()

  from pandas import Panel


# Initial data cleaning

In [3]:
data_root = 'data/repo-genmymodel-uml/data'

In [4]:
filenames = next(os.walk(data_root), (None, None, []))[2]

# Start wrangling

In [5]:
len(filenames)

352216

In [6]:
filenames[0]

'_cDm6cGfBEeepGY91r9Nuow.xmi.uml'

In [7]:
# Saving regular expression patterns
attr_pattern_list = []
class_pattern_list = []

## Defining patterns

### XML UML tag patterns

In ed9c3064-7fb5-4ec0-bded-c114251076fa.xmi.uml, packagedElement tags are used to define classes (```uml:Class```): 

```
<packagedElement xsi:type="uml:Class" xmi:id="_eI6xoFdMEDCGTb6ZuLm3Mw" name="Client">
```

The ownedAttribute tag holds attribute names:

```
<ownedAttribute xmi:id="_GVIe2JUXEeqqGZh46IEtXQ" name="id">
```

In [8]:
class_pattern_list.append(r'uml:Class.+?name="([^"]+)"')
attr_pattern_list.append(r'<ownedAttribute.*? name="(\S+)"')

## Search for patterns in files


In [9]:
flatten = lambda t: [item for sublist in t for item in sublist]

def get_classes_and_attributes_from_uml(filepath):
    root = ET.parse(data_root + '/' + filepath)
    
    # Gather classes and attributes using defined patterns
    classes = root.xpath("//packagedElement[@xsi:type='uml:Class']/@name", namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    attributes = root.xpath("//packagedElement[@xsi:type='uml:Class']/ownedAttribute/@name", namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    
    # Save classes and attributes in list if they hold values
    found_data = {}
    if len(classes) > 0:
        found_data['classes'] = list(dict.fromkeys(classes))
    if len(attributes) > 0:
        found_data['attributes'] = list(dict.fromkeys(attributes))
    
    return found_data

In [10]:
get_classes_and_attributes_from_uml('ed9c3064-7fb5-4ec0-bded-c114251076fa.xmi.uml')

{'classes': ['Client', 'Adaptor', 'Adaptee']}

In [11]:
data_dict = {}

for file in tqdm(filenames):
    try:
        data = get_classes_and_attributes_from_uml(file)
    except Exception as e:
        print(e)
    
    if len(data.keys()) > 0:
        data_dict[file] = data

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=352216.0), HTML(value='')))

Start tag expected, '<' not found, line 1, column 1 (_mu7qEH1yEeShroDxTnL5_A.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_m_Rk4KwTEea217yLRs2OoA.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_QccQIO_IEeSujetaMhozjg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (2f9d0cc3-7598-4e0e-a1ce-5aa700687522.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_5yFvIKYsEei1evnFuWTWfg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_TPfGIDM0EeSm5tDnu9wFCA.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_9y-TQJ4sEeiGsdp6eB6oaw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_y2OWACeREeShcfbUiDP6ug.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_9w0aYK1CEDGNWLI2ml8Hng.xmi.uml, line 1)
Opening and ending tag mismatch: link line 1 and head, line 1, column 259 (_jOJ8gOS7EeekKLRLyKXO4Q.xmi.uml, line 1)
Opening and ending tag mi

Start tag expected, '<' not found, line 1, column 1 (54749843-b387-4ee8-8f4b-c2630321f1af.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_uW1j8CUfEDGlaJTZcof-Dw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_g24SgCtMEDGWq_RLO4Gl4w.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_YtikoEm5EeS_velUIC8tvA.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (6e20cd3f-25ed-44d9-b339-5ba8a2e148bb.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (f12c8f43-8bad-4035-9fb0-3a77f60d0472.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (5d02602f-7397-4b4d-a50d-f7302baab3f0.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_5x-WEJzsEeWWRMvT9FgLDg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (031bbef7-da12-4a68-a65d-90fe85b86bc7.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_T1bakHQIEeSZWo9AFr5YZg.xmi.uml, lin

Start tag expected, '<' not found, line 1, column 1 (_tb5msO-UEeSujetaMhozjg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_cKV-kDzHEeSn3ZXpVIPWYw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_jbCAEAquEeWNVPA9hBwttA.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_D9v3ELwQEeSvIovPojnUsw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (1648bfb8-15cc-4d99-be61-9bbf29974caf.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (f2aa59dc-3aeb-4787-aaee-1a28414022bd.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_DLNxgOx6EeW6A_AbwsZiiQ.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_rZiSsDNdEeSm5tDnu9wFCA.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_dUA9wNHzEeSyKtHqJeGVMw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_YVGuYA7pEeWa5JwclR5VRw.xmi.uml, line 1)
Start tag expected, '<' not found,

Start tag expected, '<' not found, line 1, column 1 (_jBy1oIhuEeWSIKoWblTvZg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (__SM48KzwEeS57ICseEaz5A.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_aXCtoPgDEeSJkrdQR2VoEQ.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (5cf8aafe-fb28-4cd7-ad7f-99d42f938248.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_pAJ10J44EeWWRMvT9FgLDg.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (4223130b-d098-41d8-9acf-f5ef087d71cf.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_6CcSAFO4EeS7SK6lkmJgjw.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (_fxlTwNlDEeSpEswBeJxMRQ.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (639942fa-1753-433a-acf2-c189766be951.xmi.uml, line 1)
Start tag expected, '<' not found, line 1, column 1 (8e81eb17-37f5-4370-99eb-8310b36807b8.xmi.uml, line 1)
Start ta

In [12]:
data_dict

{'5fae885c-3fde-430f-a3c5-ff44b81f2668.xmi.uml': {'classes': ['ClassA',
   'ClassB',
   'ClassC',
   'ClassD',
   'ClassE',
   'ClassF',
   'ClassG',
   'ClassH',
   'ClassI']},
 '_oVCEUIHiEeeveJPbhFhy-g.xmi.uml': {'classes': ['SitePage', 'SiteSection'],
  'attributes': ['id', 'status', 'name', 'url', 'locked', 'defaultPage']},
 '20b51f77-a952-4206-b35c-b47acd0c631f.xmi.uml': {'classes': ['MoteurImpl',
   'Buffer',
   'Selection',
   'PressePapier'],
  'attributes': ['texte', 'sel', 'attribute']},
 'e8a7e84d-73b3-40d2-ac78-f550566b4de8.xmi.uml': {'classes': ['Client',
   'AdapterImpl',
   'Adaptee',
   'Context',
   'StateA',
   'StateB',
   'Singleton',
   'Abstraction',
   'RefinedAbstraction',
   'ConcreteImplementor',
   'CreatorImpl',
   'Product',
   'ConcreteObserver',
   'ConcreteSubject'],
  'attributes': ['instance']},
 '_oWwJoHfLEeqeQcxm9hmzHw.xmi.uml': {'classes': ['ShoppingCart',
   'Order',
   'LineItem',
   'Account',
   'Customer'],
  'attributes': ['creationDate', 'id'

In [13]:
for file in data_dict.keys():
    if 'classes' not in data_dict[file].keys() or len(data_dict[file]['classes']) == 0:
        print(file)
        print(data_dict[file])

_Sbu1cD7pEeeTJ_4Vl2J2rQ.xmi.uml
{'attributes': ['connect']}


In [14]:
# Drop files with only attributes
del data_dict['_Sbu1cD7pEeeTJ_4Vl2J2rQ.xmi.uml']

In [15]:
classes = np.unique(np.array(list(map(lambda x: x.strip(), flatten([data_dict[file]['classes'] for file in data_dict.keys()])))))

In [27]:
attributes = np.unique(np.array(flatten([[str(attr).strip() for attr in values['attributes']] for values in data_dict.values() if 'attributes' in values.keys()])))

In [28]:
len(attributes)

455735

In [16]:
len(classes)

344982

In [29]:
list(attributes)

['',
 '!VDataWrapper()',
 '"ByAssestType:ShowKeyArt"',
 '"ByCustomValue:{mediaType}{TVEpisode}"',
 '"I need or want something"',
 '"Name',
 '"byCategories:Series"',
 '"byProvider:<Any-Brand>"',
 '"byProvider:<Brand>"',
 '"client_id,client_secret,grant_type',
 '"createdBy"',
 '"createdDate"',
 '"hello world"',
 '"nativeIdentifier"',
 '"pour mourad : a remplacer par ta version OWL/RDF"',
 '"requestId"',
 '#',
 '# * id_customer',
 '# * id_ingredient',
 '# * id_product',
 '# * id_purchase',
 '# * id_tracking',
 '# Alter',
 '# Crew',
 '# Data',
 '# Etapa',
 '# Flame',
 '# Geburtstag',
 '# Living At Address',
 '# Name',
 '# Packages',
 '# Sex',
 '# Smoke',
 '# Students',
 '# Temp',
 '# angestelltSeit',
 '# assignment',
 '# clientID',
 '# currentState : BaseState*',
 '# customerID',
 '# data',
 '# de apuntes',
 '# de apuntes aportados',
 '# de parciales proporcionados',
 '# de parciales y ejercicos alquilados',
 '# de publicaciones',
 '# de_plazas',
 '# doGet(request : HttpServletRequest, res

In [19]:
with open('data/genmymodel_uml_extracted_metadata.json', 'w') as fp:
    json.dump(data_dict, fp)

In [42]:
data_dict

{'https://github.com/intel/umf/raw/master/docs/design/LLD.uml': {'classes': ['MetaFile',
   'MetaSource',
   'Property',
   'Set',
   'Item',
   'MetaFileImpl',
   'MetaSourceImpl',
   'PropertyImpl',
   'SetImpl',
   'ItemImpl',
   'ObjectFactory',
   'DataSourceFactory',
   'DataSourceXMP',
   'DataSource',
   'FrameRegion',
   'Time']},
 'https://github.com/1-aarsproeve/1-aarsproeve/raw/master/1aarsproeve/UML/ModelDefinition/Package_1059.uml': {'classes': ['HovedmenuView',
   'Beskeder',
   'VagtplanView',
   'Ansatte',
   'AnsatteView',
   'Stillinger',
   'Vagter',
   'Anmodninger',
   'Ugedage',
   'ViewContext',
   'TableContext']},
 'https://github.com/1-aarsproeve/1-aarsproeve/raw/master/1aarsproeve/UML/ModelDefinition/Package_1205.uml': {'classes': ['Hovedmenu',
   'Login',
   'OpretBruger',
   'OpretVagt',
   'Profil',
   'RedigerVagt',
   'SkrivBesked',
   'Vagtplan',
   'Splash',
   'Anmodninger']},
 'https://github.com/1-aarsproeve/1-aarsproeve/raw/master/1aarsproeve/UML/

# Language detection

Because the dataset contains attributes and classes in lots of languages, we want to identify all the English files that contain attributes and classes, and filter out all the rest. For this, we use FastText.

In [30]:
# Reload data_dict from file
with open('data/genmymodel_uml_extracted_metadata.json') as json_file:
    data_dict = json.load(json_file)

    # Make sure all programmic cases are turned into spaces like "normal" text for better language detection
    cleaned_data = {
        file: {
            key: list(map(lambda x: ' '.join(list(map(lambda data_str: re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', data_str.replace('_', ' ').strip()).lower(), x.split(' ')))), value))
            for key, value in metadata.items()
        } 
        for file, metadata in data_dict.items()
    }

In [31]:
cleaned_data

{'5fae885c-3fde-430f-a3c5-ff44b81f2668.xmi.uml': {'classes': ['class a',
   'class b',
   'class c',
   'class d',
   'class e',
   'class f',
   'class g',
   'class h',
   'class i']},
 '_oVCEUIHiEeeveJPbhFhy-g.xmi.uml': {'classes': ['site page', 'site section'],
  'attributes': ['id', 'status', 'name', 'url', 'locked', 'default page']},
 '20b51f77-a952-4206-b35c-b47acd0c631f.xmi.uml': {'classes': ['moteur impl',
   'buffer',
   'selection',
   'presse papier'],
  'attributes': ['texte', 'sel', 'attribute']},
 'e8a7e84d-73b3-40d2-ac78-f550566b4de8.xmi.uml': {'classes': ['client',
   'adapter impl',
   'adaptee',
   'context',
   'state a',
   'state b',
   'singleton',
   'abstraction',
   'refined abstraction',
   'concrete implementor',
   'creator impl',
   'product',
   'concrete observer',
   'concrete subject'],
  'attributes': ['instance']},
 '_oWwJoHfLEeqeQcxm9hmzHw.xmi.uml': {'classes': ['shopping cart',
   'order',
   'line item',
   'account',
   'customer'],
  'attributes

In [32]:
PRETRAINED_MODEL_PATH = 'data/lid.176.bin'

model = fasttext.load_model(PRETRAINED_MODEL_PATH)



In [33]:
# Approximate the language from the extracted classes and attributes
def get_file_language(metadata):
    # Combine all classes and attributes into one array
    all_metadata = sum(metadata.values(), [])
    
    # Detect language for each word
    predictions = model.predict(all_metadata)
    
    # Specific approach for longer texts due to issue with habit of using English in programming
    def get_field_level_prediction(string):
        # Split into single words to detect on word level
        words = string.split(' ')
        
        # Get single-word predictions
        string_predictions = np.array(model.predict(words))
        
        # Return the appropriate language
        if len(string_predictions) == 1:
            return string_predictions[0][0][0]
        else:
            # Option 1: all identical language identification
            if len(np.unique(string_predictions[0])) == 1:
                return string_predictions[0][0][0]
            
            # Option 2: combination of an English programmatic word with another language, e.g. "hovedmenu view"
            elif len(string_predictions) == 2 and any('__label__en' in label for label in string_predictions):
                return [x for x in map(lambda x: x[0], string_predictions[0]) if x != '__label__en'][0]
            
            # Option 3: majority vote
            else:
                return Counter(map(lambda x: x[0], string_predictions[0])).most_common(1)[0][0]
        
    # Apply to all fields, identify language for all classes and attributes
    predictions = list(map(lambda x: get_field_level_prediction(x), all_metadata))

    # Get the most common language from all attributes
    return Counter(predictions).most_common(1)[0][0]

In [35]:
labeled_cleaned_data = {
    key: {**value, **{'lang': get_file_language(value)}}
    for key, value in cleaned_data.items()
}

In [37]:
labeled_cleaned_data

{'5fae885c-3fde-430f-a3c5-ff44b81f2668.xmi.uml': {'classes': ['class a',
   'class b',
   'class c',
   'class d',
   'class e',
   'class f',
   'class g',
   'class h',
   'class i'],
  'lang': '__label__en'},
 '_oVCEUIHiEeeveJPbhFhy-g.xmi.uml': {'classes': ['site page', 'site section'],
  'attributes': ['id', 'status', 'name', 'url', 'locked', 'default page'],
  'lang': '__label__en'},
 '20b51f77-a952-4206-b35c-b47acd0c631f.xmi.uml': {'classes': ['moteur impl',
   'buffer',
   'selection',
   'presse papier'],
  'attributes': ['texte', 'sel', 'attribute'],
  'lang': '__label__fr'},
 'e8a7e84d-73b3-40d2-ac78-f550566b4de8.xmi.uml': {'classes': ['client',
   'adapter impl',
   'adaptee',
   'context',
   'state a',
   'state b',
   'singleton',
   'abstraction',
   'refined abstraction',
   'concrete implementor',
   'creator impl',
   'product',
   'concrete observer',
   'concrete subject'],
  'attributes': ['instance'],
  'lang': '__label__en'},
 '_oWwJoHfLEeqeQcxm9hmzHw.xmi.uml': {

In [38]:
with open('data/genmymodel_uml_extracted_metadata_annotated.json', 'w') as fp:
    json.dump(labeled_cleaned_data, fp)

In [47]:
filtered_files = {key: value for key, value in labeled_cleaned_data.items() if value['lang'] == '__label__en'}

In [48]:
filtered_files

{'5fae885c-3fde-430f-a3c5-ff44b81f2668.xmi.uml': {'classes': ['class a',
   'class b',
   'class c',
   'class d',
   'class e',
   'class f',
   'class g',
   'class h',
   'class i'],
  'lang': '__label__en'},
 '_oVCEUIHiEeeveJPbhFhy-g.xmi.uml': {'classes': ['site page', 'site section'],
  'attributes': ['id', 'status', 'name', 'url', 'locked', 'default page'],
  'lang': '__label__en'},
 'e8a7e84d-73b3-40d2-ac78-f550566b4de8.xmi.uml': {'classes': ['client',
   'adapter impl',
   'adaptee',
   'context',
   'state a',
   'state b',
   'singleton',
   'abstraction',
   'refined abstraction',
   'concrete implementor',
   'creator impl',
   'product',
   'concrete observer',
   'concrete subject'],
  'attributes': ['instance'],
  'lang': '__label__en'},
 '_oWwJoHfLEeqeQcxm9hmzHw.xmi.uml': {'classes': ['shopping cart',
   'order',
   'line item',
   'account',
   'customer'],
  'attributes': ['creation date', 'id', 'quantity', 'price'],
  'lang': '__label__en'},
 '_OR8QgFq_EeeTF9GWmq2Xvg

In [56]:
all_classes = []
all_attrs = []

for key, value in filtered_files.items():
    if 'attributes' in value.keys():
        all_attrs.append(value['attributes'])
    all_classes.append(value['classes'])

all_classes = np.unique(np.array([x.strip() for x in flatten(all_classes)]))
all_attrs = np.unique(np.array([x.strip() for x in flatten(all_attrs)]))

In [57]:
len(all_classes)

237945

In [54]:
len(all_attrs)

1309564

In [58]:
len(all_attrs)

286324

In [55]:
all_attrs

['id',
 'status',
 'name',
 'url',
 'locked',
 'default page',
 'instance',
 'creation date',
 'id',
 'quantity',
 'price',
 'id',
 'name',
 'response columns',
 'description',
 'text',
 'video unit',
 'unit title',
 'video chapter',
 'subject',
 'lead name',
 'lead phone',
 'lead last name',
 'group',
 'school',
 'result',
 'exam',
 'application date',
 'identifier',
 'kind',
 'maximum score',
 'minimum score',
 'city',
 'state',
 'country',
 'lead exam',
 'school name',
 'school group',
 'rank',
 'version',
 'correct answers',
 'subject diagnostic',
 'study plan',
 'lead reports',
 'school average',
 'group diagnostic',
 'area diagnostic',
 'status',
 'update',
 'sensor id',
 'sensor type',
 'user id',
 'alert id',
 'door id',
 'camera id',
 'speaker id',
 'mic id',
 'light id',
 'sot',
 'eot',
 'tvid',
 'htid',
 'time id',
 'coffee',
 'dish washer',
 'alarm',
 'washing machine',
 'device id',
 'csname',
 'cs phon no.',
 'address',
 'credit cardinfo',
 'email',
 'user id',
 'password