In [1]:
import pandas as pd
import requests
import re
from tqdm.auto import tqdm
import json

In [3]:
# Enable tqdm for pandas: this registers `progress_apply` on pandas objects,
# which the harvesting step further down uses to display a progress bar
tqdm.pandas()

In [79]:
# Read the overview of files into a dataframe (comma-separated, the pandas default)
df = pd.read_csv('data/UMLFiles_List_V2.0.csv')

# Initial data cleaning

In [80]:
# Get filetypes: we cannot use image files for scraping.
# The vectorised string accessor yields NaN for missing links instead of
# raising AttributeError the way a Python-level `x.split(...)` would.
df['filetype'] = df['Model Link - Github'].str.rsplit('.', n=1).str[-1]

# Check which filetypes are there
print(df['filetype'].unique())

# Keep only rows with a real link (not the '#VALUE!' marker) and a usable,
# text-based file type. `.copy()` detaches the result from the original frame
# so that adding columns to it later does not raise SettingWithCopyWarning.
has_valid_link = df['Model Link - Github'] != '#VALUE!'
is_usable_type = df['filetype'].isin(['xmi', 'uml'])
df = df[has_valid_link & is_usable_type].copy()

# The dataframe contains GitHub URLs, not the "raw" files that we need
# Function to get the raw file URL from the GitHub URL
def get_raw_file(url, timeout=30):
    """Resolve a GitHub file page URL to the URL of the corresponding raw file.

    The page at `url` is downloaded and searched for the first link whose
    target contains ``/raw/``.

    Parameters
    ----------
    url : str
        URL of the GitHub page showing the file.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        stalled connection cannot hang the whole harvesting run.

    Returns
    -------
    str
        ``'https://github.com'`` followed by the raw link found on the page,
        or the tag ``'REMOVE'`` when the page cannot be fetched or contains no
        raw link (e.g. because the file no longer exists).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Unreachable page: tag the row for removal instead of aborting the run
        return 'REMOVE'

    # `[^"]` keeps the match inside one href attribute; a greedy `.+` could
    # start at an earlier href on the same line and swallow unrelated markup.
    match = re.search(r'href="([^"]+/raw/[^"]+)"', response.text)
    if match is None:
        # Page no longer exists, add tag to remove later
        return 'REMOVE'

    return 'https://github.com' + match.group(1)

# Do the harvesting. `assign` returns a new dataframe, so adding the column is
# safe even when `df` is a filtered selection of an earlier frame (no
# SettingWithCopyWarning); the function is passed directly, no lambda needed.
df = df.assign(raw_filepath=df['Model Link - Github'].progress_apply(get_raw_file))

# Remove all (currently) non-existent file paths from the dataframe
df = df[df['raw_filepath'] != 'REMOVE']

['png' 'xmi' 'uml' 'jpg' 'svg' 'jpeg' 'gif' 'bmp' '#VALUE!']


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=35766.0), HTML(value='')))




In [4]:
# Write the cleaned dataframe to disk for later use (comma-separated, no index column)
df.to_csv('data/cleaned_files.csv', index=False)

# Start wrangling

In [4]:
# Reload the cleaned overview that was written to disk by the cleaning step
df = pd.read_csv('data/cleaned_files.csv')

## UML file types

In [5]:
# Select the rows whose file type is 'uml'
umls = df.loc[df['filetype'] == 'uml']

# Containers collecting the regular expression patterns defined in the cells below
attr_pattern_list, class_pattern_list = [], []

## Defining patterns


### Programming-like class definition 

[In this file,](https://raw.githubusercontent.com/2ndchance/1400/master/projects/lionheart/docs/class.uml) classes are loosely defined as they would be in a programming language, as follows:

- class Unit
- class Crown
- class Knight
- class Archer
- class Infantry


In [6]:
# Matches "class Name" / "Class Name" preceded by whitespace and captures the name.
# Inside a character class `|` is a literal character, so the alternation is
# spelled `[Cc]`; `[C|c]` would also accept "|lass".
class_pattern_list.append(r'\s[Cc]lass (\w+)')

### StarUML patterns

[In this file](https://raw.githubusercontent.com/ahmadpaudji/DTTI/master/Data%20Analisis/Model/UML/Relasiclass/RelasiClass.uml), classes are defined as XPD object tags of type `UMLClassView`, which hold - among a lot of styling information - the class name in a `NameLabel`:

```
<XPD:OBJ name="OwnedViews[1]" type="UMLClassView" guid="v3U+HcJ2A0Spjyh+TK3TXwAA">
    <XPD:REF name="Model">kLKcecl2p0mgUzYZeLw6ugAA</XPD:REF>
    <XPD:OBJ name="NameCompartment" type="UMLNameCompartmentView" guid="mWqid1MIQkOogFZpBhhw4wAA">
        <XPD:OBJ name="NameLabel" type="LabelView" guid="8MeBm9Aod0ONFTy5oBebLwAA">
            <XPD:ATTR name="FontStyle" type="integer">1</XPD:ATTR>
            <XPD:ATTR name="Text" type="string">Restaurateur</XPD:ATTR>
        </XPD:OBJ>
        <XPD:OBJ name="StereotypeLabel" type="LabelView" guid="+JkN9MPr60W4Xzrm5xB/yAAA">
            <XPD:ATTR name="Visible" type="boolean">False</XPD:ATTR>
        </XPD:OBJ>
        <XPD:OBJ name="PropertyLabel" type="LabelView" guid="815dty7IP0CoNKguDRdc1gAA">
            <XPD:ATTR name="Visible" type="boolean">False</XPD:ATTR>
        </XPD:OBJ>
    </XPD:OBJ>
    <XPD:OBJ name="AttributeCompartment" type="UMLAttributeCompartmentView" guid="a/fEJW1c8UWMqIqW2WiKggAA">
        <XPD:REF name="Model">kLKcecl2p0mgUzYZeLw6ugAA</XPD:REF>
    </XPD:OBJ>
</XPD:OBJ>
```

Attributes have their own XPD tag with the type UMLAttribute:

```
<XPD:OBJ name="Attributes[0]" type="UMLAttribute" guid="X1GHr9/mo0qO3cVzMG4rygAA">
    <XPD:ATTR name="Name" type="string">idIngredient</XPD:ATTR>
    <XPD:REF name="Owner">vCJCqw/CaE6zNzyIYm3smwAA</XPD:REF>
</XPD:OBJ>
```

In [7]:
# Attribute name: text of the first string-typed XPD:ATTR that follows an
# XPD:OBJ of type "UMLAttribute".
# NOTE(review): `.+?` is not confined to one tag, so this assumes the Name
# attribute is the first string ATTR after the UMLAttribute object - confirm
# against more StarUML files.
attr_pattern_list.append(r'XPD:OBJ .+?type="UMLAttribute" .+?type="string">(.+?)<\/XPD:ATTR>')
# Class name: text of the "Text" XPD:ATTR of the NameLabel that follows an
# XPD:OBJ of type "UMLClassView"
class_pattern_list.append(r'<XPD:OBJ .+? type="UMLClassView" .+? name="NameLabel" .+?XPD:ATTR name="Text".+?>(.+?)<\/XPD:ATTR>')

### Eclipse patterns

[In this file,](https://raw.githubusercontent.com/yuanheng1988/sedr/master/WebRoot/WEB-INF/classes/iscas/nfs/itechs/ese/servlets/servlets.uml) classes and attributes are defined following the OMG UML2 standard, as follows:

#### Class
```<packagedElement xmi:type="uml:Class" xmi:id="_Bkv94HeZEeCY6ocbkEuouA" name="SingleServlet">```

#### Attribute
```<ownedAttribute xmi:id="_G7hjY5X0EdyE2YnIO6QIiA" name="name" visibility="private" type="_GXK5JJX0EdyE2YnIO6QIiA">```

In [8]:
# Each pattern is confined to a single tag with `[^<>]*?`. Line breaks are
# replaced by spaces before matching, so an unbounded `.+?` could run on into
# later tags and capture the name of an unrelated element.
attr_pattern_list.append(r'<ownedAttribute\b[^<>]*?name="(\w+)"')
# The name attribute may appear after or before the "uml:Class" type attribute
class_pattern_list.append(r'"uml:Class"[^<>]*?name="(\w+)"')
class_pattern_list.append(r'name="(\w+)"[^<>]*?"uml:Class"')

### Visual Studio patterns

[In this file,](https://raw.githubusercontent.com/aa8391093/LixuecongOnlineRepository/master/2015.7.14_UML_Learning_eg_class_time/2015.7.14_UML_Learning_eg_class_time/ModelDefinition/2015.7.14_UML_Learning_eg_class_time.uml) classes are defined as their own tags, having properties as subtags:

```
<class
    Id="1bcf7ec8-3157-477d-bbff-28c495793c6c"
    name="Time"
    isAbstract="false"
    isLeaf="false"
    isActiveClass="false">
    <ownedAttributesInternal>
      <property
        Id="98ba3fbd-3004-4f43-9e46-e0d026d5a0db"
        name="sec"
        visibility="Protected"
        isLeaf="false"
        isStatic="false"
        isReadOnly="false"
        isUnique="false"
        isDerived="false"
        isDerivedUnion="false"
        aggregation="None"
        isComposite="false" />
    </ownedAttributesInternal>
</class>
```

In [9]:
# Property name inside a <property> tag; the lookahead requires a closing
# </ownedAttributesInternal> and </class> somewhere after the match.
# `[^<>]*?` keeps the name lookup inside the <property> tag itself.
attr_pattern_list.append(r'<property\s[^<>]*?name="(\w+)"(?=.+?<\/ownedAttributesInternal>.+?<\/class>)')
# `\s` after "<class" excludes tags whose name merely starts with "class",
# and `[^<>]*?` stops the search for name="..." at the end of the tag.
class_pattern_list.append(r'<class\s[^<>]*?name="(\w+)"')

### Nodes and edges patterns

[In this file,](https://raw.githubusercontent.com/1berg/SimMA/master/Bilder/classes.uml) classes are defined as members of the main ```classes``` object, each represented as a node: 

```
<nodes>
    <node x="285.0" y="343.5">classes.LightSensor</node>
    <node x="246.99999999999994" y="556.0">classes.Simulator</node>
    <node x="879.0" y="256.0">classes.Roboter</node>
    <node x="681.0" y="343.5">classes.TouchSensor</node>
    <node x="18.0" y="311.0">classes.Parcours</node>
    <node x="948.0" y="741.0">classes.Delay</node>
    <node x="0.0" y="741.0">classes.MotorPort</node>
    <node x="755.0" y="741.0">classes.UltrasonicSensor</node>
    <node x="547.0" y="741.0">classes.BildEinleser</node>
    <node x="394.921875" y="21.5">classes.SensorPort</node>
    <node x="0.0" y="0.0">classes.Leinwand</node>
    <node x="323.0" y="741.0">classes.Button</node>
    <node x="23.0" y="1123.0">classes.Motor</node>
    <node x="26.5" y="932.0">classes.NXTRegulatedMotor</node>
    <node x="483.0" y="343.5">classes.ColorSensor</node>
</nodes>
```

In [10]:
# Captures the name after the literal prefix "classes." - the dot is escaped so
# that it matches only a real dot rather than any character
class_pattern_list.append(r'classes\.(\w+)')

### XML UML tag patterns

[In this file,](https://raw.githubusercontent.com/arjunswaj/self-healing-water-networks/master/Documentation/UML/Class%20Diagram/class_diagram.uml) UML tags are used to define classes (```<UML:Class />```) and attributes (```<UML:Attribute />```): 

```
<UML:Class xmi.id = '-84--88--128-68--5c1f21aa:14811985826:-8000:0000000000000962' name = 'Source' visibility = 'public' isSpecification = 'false' isRoot = 'false' isLeaf = 'false' isAbstract = 'false' isActive = 'false'>
    <UML:Classifier.feature>
        <UML:Attribute xmi.id = '-84--88--128-68--5c1f21aa:14811985826:-8000:0000000000000963' name = 'type' visibility = 'public' isSpecification = 'false' ownerScope = 'instance' changeability = 'changeable' targetScope = 'instance'>
            <UML:StructuralFeature.multiplicity>
                <UML:Multiplicity xmi.id = '-84--88--128-68--5c1f21aa:14811985826:-8000:0000000000000964'>
                    <UML:Multiplicity.range>
                        <UML:MultiplicityRange xmi.id = '-84--88--128-68--5c1f21aa:14811985826:-8000:0000000000000965' lower = '1' upper = '1'/>
                    </UML:Multiplicity.range>
                </UML:Multiplicity>
            </UML:StructuralFeature.multiplicity>
            <UML:StructuralFeature.type>
                <UML:DataType href = 'http://argouml.org/profiles/uml14/default-uml14.xmi#-84-17--56-5-43645a83:11466542d86:-8000:000000000000087E'/>
            </UML:StructuralFeature.type>
        </UML:Attribute>
    </UML:Classifier.feature>
</UML:Class>
```

In [11]:
# Capture the value of the name attribute of <UML:Class> / <UML:Attribute> tags,
# allowing optional spaces around `=` and either quote style. Inside a
# character class `|` is a literal, so the quotes are listed without it.
class_pattern_list.append(r'<UML:Class [^/]+?name ?= ?[\'"]([\w\s]+?)[\'"]')
attr_pattern_list.append(r'<UML:Attribute [^/]+?name ?= ?[\'"]([\w\s]+?)[\'"]')

## Search for patterns in files


In [12]:
def flatten(t):
    """Return one flat list holding the items of every sub-iterable in `t`, in order."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

def get_classes_and_attributes_from_uml(uml, class_patterns=None, attr_patterns=None):
    """Extract class and attribute names from the text of a model file.

    Parameters
    ----------
    uml : str
        Content of the model file.
    class_patterns : list of str, optional
        Regular expressions with one capture group for class names.
        Defaults to the module-level `class_pattern_list`.
    attr_patterns : list of str, optional
        Regular expressions with one capture group for attribute names.
        Defaults to the module-level `attr_pattern_list`.

    Returns
    -------
    dict
        May contain the keys ``'classes'`` and ``'attributes'``, each mapping
        to a list of unique matches in order of first appearance. A key is
        only present when at least one of its patterns matched, so the result
        is empty when nothing was found.
    """
    # Fall back to the pattern lists built up in the notebook
    if class_patterns is None:
        class_patterns = class_pattern_list
    if attr_patterns is None:
        attr_patterns = attr_pattern_list

    # Remove all line breaks and other "alternative" whitespace, because `.`
    # in the patterns does not match across line breaks
    flat_text = uml.translate(str.maketrans('\n\r\t', '   '))

    def _unique_matches(patterns):
        # dict.fromkeys drops duplicates while preserving first-seen order
        hits = (hit for regex in patterns for hit in re.findall(regex, flat_text))
        return list(dict.fromkeys(hits))

    # Save classes and attributes only if they hold values
    found_data = {}
    classes = _unique_matches(class_patterns)
    if classes:
        found_data['classes'] = classes
    attributes = _unique_matches(attr_patterns)
    if attributes:
        found_data['attributes'] = attributes

    return found_data

def get_file_text(url, timeout=30):
    """Download `url` and return the response body as text.

    Parameters
    ----------
    url : str
        Address of the file to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one stalled
        connection cannot block a long harvesting loop indefinitely.

    Returns
    -------
    str
        The decoded body of the response.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an HTTP error status;
        otherwise an error page would be handed on as if it were file content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response.text

def get_data_from_url(url):
    """Fetch the file at `url` and return the classes and attributes extracted from its text."""
    return get_classes_and_attributes_from_uml(get_file_text(url))

In [13]:
# Try the extraction on the example file linked in the StarUML patterns section
get_data_from_url('https://raw.githubusercontent.com/ahmadpaudji/DTTI/master/Data%20Analisis/Model/UML/Relasiclass/RelasiClass.uml')

{'classes': ['Absen',
  'Model_absen',
  'Model_tambahan',
  'Model_izin',
  'Izin',
  'Model_kpi',
  'KPI',
  'Model_muhasabah',
  'Muhasabah',
  'Model_pegawai',
  'Pegawai'],
 'attributes': ['izin_rules array()',
  '$tambah_pegawai_rules',
  '$pegawai_rules']}

In [None]:
# Maps each raw file URL to the classes/attributes extracted from that file;
# files in which nothing was found are left out
data_dict = {}

for file in tqdm(umls['raw_filepath'].tolist()):
    try:
        data = get_data_from_url(file)
    except Exception as exc:
        # Report the failing file together with the reason and carry on.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt through, so the long-running loop can be stopped.
        print(file, repr(exc))
        continue

    if data:
        data_dict[file] = data

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26872.0), HTML(value='')))

In [None]:
# Serialise the extracted metadata to a JSON file
with open('data/uml_extracted_metadata.json', 'w') as out_file:
    out_file.write(json.dumps(data_dict))

In [None]:
# Collect the per-file results: each entry appended below is the whole list
# found for one file, so both variables end up as lists of lists
all_classes = []
all_attrs = []

for key in data_dict.keys():
    # Not every entry has a 'classes' key, hence the membership check
    if 'classes' in data_dict[key].keys():
        all_classes.append(data_dict[key]['classes'])
    
    # Likewise for the 'attributes' key
    if 'attributes' in data_dict[key].keys():
        all_attrs.append(data_dict[key]['attributes'])