In [1]:
from transformers import AutoTokenizer
import os

In [2]:
#to login for token
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Get the tokenizer to tokenize the text data

In [2]:
model_id = "google/gemma-2-2b"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

## KNOW THE DATA : 
### 1. Do a statistical analysis of the data. 
### 2. Figure out how much tokens are used for non-important data

### Statistical Analysis
Counts the number of tokens in a string

In [3]:
def count_tokens(text: str) -> int:
    """
    Returns the number of tokens in a text string using the provided tokenizer.

    :param text: The input text to tokenize.
    :return: The number of tokens in the text.
    """
    encoded_tokens = tokenizer.encode(text)
    return len(encoded_tokens)

counts the number of tokens in an xmi file

In [4]:

def count_tokens_in_file(file_path: str) -> int:
    """
    Reads the content of the file and returns the number of tokens using the provided tokenizer.

    :param file_path: The path to the file.
    :return: The number of tokens in the file content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return count_tokens(content)

Counts the number of tokens in a folder and make a statistics with categories

In [24]:
def count_tokens_in_folder(folder_path: str):
    """
    Counts the number of tokens for each file in a folder and categorizes them.

    :param folder_path: The path to the folder containing the files.
    :param tokenizer: The tokenizer to use for encoding the text.
    """
    token_counts = {
        '<1000': 0,
        '<2000': 0,
        '<3000': 0,
        '<4000': 0,
        '<5000': 0,
        '<6000': 0,
        '<7000': 0,
        '<=8192': 0,
        '>8192': 0
    }

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            num_tokens = count_tokens_in_file(file_path)
            if num_tokens < 1000:
                token_counts['<1000'] += 1
            elif num_tokens < 2000:
                token_counts['<2000'] += 1
            elif num_tokens < 3000:
                token_counts['<3000'] += 1
            elif num_tokens < 4000:
                token_counts['<4000'] += 1
            elif num_tokens < 5000:
                token_counts['<5000'] += 1
            elif num_tokens < 6000:
                token_counts['<6000'] += 1
            elif num_tokens < 7000:
                token_counts['<7000'] += 1
            elif num_tokens <= 8192:
                token_counts['<=8192'] += 1
            elif num_tokens > 8192:
                token_counts['>8192'] += 1
                
    for category, count in token_counts.items():
            print(f"Number of xmi files with tokens {category}: {count}")            

In [25]:
def count_files_in_folder(folder_path: str) -> int:
    """
    Counts the number of files in a folder.

    :param folder_path: The path to the folder.
    :return: The number of files in the folder.
    """
    file_count = 0
    for item in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, item)):
            file_count += 1
    return file_count

Execution !!!

In [26]:
folder_path = 'modelset/raw-data/repo-genmymodel-uml/data'
numbers = count_files_in_folder(folder_path)
print(f"There are {numbers} files in the folder.")
count_tokens_in_folder(folder_path)

There are 5120 files in the folder.
Number of xmi files with tokens <1000: 0
Number of xmi files with tokens <2000: 0
Number of xmi files with tokens <3000: 0
Number of xmi files with tokens <4000: 0
Number of xmi files with tokens <5000: 78
Number of xmi files with tokens <6000: 97
Number of xmi files with tokens <7000: 171
Number of xmi files with tokens <=8192: 182
Number of xmi files with tokens >8192: 4592


### How many tokens are used for non-important data?
This is divided into minor points:
* How many tokens are used before encountering the first class?
* How many tokens are used to describe one class (with all its attributes)?
* How many tokens are required for generalization?

How many tokens are used before encountering the first class?

In [43]:
def read_until_class(file_path: str) -> str:
    """
    Reads the content of the file until it encounters the string 'packagedElement xsi:type="uml:Class"'.

    :param file_path: The path to the file.
    :return: The content read until the specified string is found.
    """
    content = []
    search_string = 'packagedElement xsi:type="uml:Class"'

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            content.append(line)
            if search_string in line:
                break
    
    return ''.join(content)

Count the number of tokens before the first class

In [51]:
def count_tokens_before_first_class(folder_path: str):
    """
    Counts the number of tokens for each file in a folder before the first class

    :param folder_path: The path to the folder containing the files.
    :param tokenizer: The tokenizer to use for encoding the text.
    """
    token_counts = {
        '<100': 0,
        '<200': 0,
        '<300': 0,
        '<400': 0,
        '<500': 0,
        '<600': 0,
        '<700': 0,
        '<800': 0,
        '<900' : 0,
        '>=900' : 0,
    }

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            xmi_before_class = read_until_class(file_path)
            num_tokens = count_tokens(xmi_before_class)
            if num_tokens < 100:
                token_counts['<100'] += 1
            elif num_tokens < 200:
                token_counts['<200'] += 1
            elif num_tokens < 300:
                token_counts['<300'] += 1
            elif num_tokens < 400:
                token_counts['<400'] += 1
            elif num_tokens < 500:
                token_counts['<500'] += 1
            elif num_tokens < 600:
                token_counts['<600'] += 1
            elif num_tokens < 700:
                token_counts['<700'] += 1
            elif num_tokens < 800:
                token_counts['<800'] += 1
            elif num_tokens < 900:
                token_counts['<900'] += 1
            elif num_tokens >= 900:
                token_counts['>=900'] += 1

    for category, count in token_counts.items():
        print(f"Number of xmi files with tokens before first class {category}: {count}")  

In [52]:
folder_path = 'modelset/raw-data/repo-genmymodel-uml/data'
numbers = count_files_in_folder(folder_path)
print(f"There are {numbers} files in the folder.")
count_tokens_before_first_class(folder_path)

There are 5120 files in the folder.
Number of xmi files with tokens before first class <100: 0
Number of xmi files with tokens before first class <200: 0
Number of xmi files with tokens before first class <300: 0
Number of xmi files with tokens before first class <400: 0
Number of xmi files with tokens before first class <500: 18
Number of xmi files with tokens before first class <600: 599
Number of xmi files with tokens before first class <700: 366
Number of xmi files with tokens before first class <800: 323
Number of xmi files with tokens before first class <900: 77
Number of xmi files with tokens before first class >=900: 3737


How many tokens does a Class with attributes take?

In [16]:
from extraction_remove_one_class_removed import parse_xmi, namespaces_org, get_classes, get_class_attributes, ET

def count_tokens_of_classes_with_att(folder_path: str):
    """
    Counts the number of tokens for each file in a folder before the first class

    :param folder_path: The path to the folder containing the files.
    :param tokenizer: The tokenizer to use for encoding the text.
    """
    token_sums = {
        'class_with_att_num_1': 0,
        'class_with_att_num_2': 0,
        'class_with_att_num_3': 0,
        'class_with_att_num_4': 0,
        'class_with_att_num_5': 0,
        'class_with_att_num_6': 0,
        'class_with_att_num_>=7': 0
    }

    class_counts = {
        'class_with_att_num_1': 0,
        'class_with_att_num_2': 0,
        'class_with_att_num_3': 0,
        'class_with_att_num_4': 0,
        'class_with_att_num_5': 0,
        'class_with_att_num_6': 0,
        'class_with_att_num_>=7': 0
    }

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        root = parse_xmi(file_path)
        classes = get_classes(root,namespaces_org)
        for class_elem in classes:
           attributes = get_class_attributes(class_elem)
           num_attributes = len(attributes)
           class_text = ET.tostring(class_elem, encoding='unicode')
           attributes_text = ''.join([ET.tostring(attr, encoding='unicode') for attr in attributes])
           combined_text = class_text + attributes_text
           num_tokens = count_tokens(combined_text)

        if num_attributes == 1:
            token_sums['class_with_att_num_1'] += num_tokens
            class_counts['class_with_att_num_1'] += 1
        elif num_attributes == 2:
            token_sums['class_with_att_num_2'] += num_tokens
            class_counts['class_with_att_num_2'] += 1
        elif num_attributes == 3:
            token_sums['class_with_att_num_3'] += num_tokens
            class_counts['class_with_att_num_3'] += 1
        elif num_attributes == 4:
            token_sums['class_with_att_num_4'] += num_tokens
            class_counts['class_with_att_num_4'] += 1
        elif num_attributes == 5:
            token_sums['class_with_att_num_5'] += num_tokens
            class_counts['class_with_att_num_5'] += 1
        elif num_attributes == 6:
            token_sums['class_with_att_num_6'] += num_tokens
            class_counts['class_with_att_num_6'] += 1
        else:
            token_sums['class_with_att_num_>=7'] += num_tokens
            class_counts['class_with_att_num_>=7'] += 1

    for category in token_sums:
        if class_counts[category] > 0:
            average_tokens = token_sums[category] / class_counts[category]
        else:
            average_tokens = 0
        print(f"Average number of tokens for {category}: {average_tokens}")      
        

In [17]:
folder_path = 'modelset/raw-data/repo-genmymodel-uml/data'
count_tokens_of_classes_with_att(folder_path)

Average number of tokens for class_with_att_num_1: 1229.5967741935483
Average number of tokens for class_with_att_num_2: 1678.4948741845294
Average number of tokens for class_with_att_num_3: 2093.126168224299
Average number of tokens for class_with_att_num_4: 3182.4534161490683
Average number of tokens for class_with_att_num_5: 3064.060606060606
Average number of tokens for class_with_att_num_6: 3772.4310344827586
Average number of tokens for class_with_att_num_>=7: 822.687265278252


Simple example with a JSON file

In [6]:
count_tokens_in_file('modelset/graph/repo-genmymodel-uml/data/0a3c61d9-aec4-4842-9334-ea5ac4edc7ef.xmi/0a3c61d9-aec4-4842-9334-ea5ac4edc7ef.json')

5416