## 1. Transform "instance_types_en.ttl"

In [5]:
input_file = 'instance_types_en.ttl'  # Input file name containing the data to process
output_file = 'trans_types.ttl'  # Output file name to write the processed data

# Convert every triple whose subject is a DBpedia resource into a line of the
# form "dbr_<subject> rdf_type dbo_<type>".
#
# The input is streamed line by line and each converted line is written
# immediately, so memory use stays constant regardless of the input size.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(input_file, 'r', encoding='utf-8') as source, \
        open(output_file, 'w', encoding='utf-8') as target:
    for line in source:
        # Lines that do not start with a DBpedia resource IRI are ignored.
        if not line.startswith('<http://dbpedia.org/resource/'):
            continue

        parts = line.strip().split(' ')
        if len(parts) < 3:
            # Truncated triple: there is no object token to read, so skip it
            # instead of failing with an IndexError.
            continue

        # Keep only the last path segment of each IRI and drop the closing '>'.
        subject = parts[0].split('/')[-1][:-1]
        rdf_type = parts[2].split('/')[-1][:-1]

        # The last segment of the OWL "Thing" IRI is 'owl#Thing'; shorten it.
        if rdf_type == 'owl#Thing':
            rdf_type = 'Thing'

        target.write(f'dbr_{subject} rdf_type dbo_{rdf_type}\n')


## 2. Transform "labels_en.ttl"

In [4]:
# Convert every label triple whose subject is a DBpedia resource into a line
# of the form "__label__dbr_<entity> <label text>".
#
# Both files are opened as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("labels_en.ttl", "r", encoding="utf-8") as input_file, \
        open("trans_labels.ttl", "w", encoding="utf-8") as output_file:
    for line in input_file:  # Iterate over each line in the input file
        # Lines that do not start with a DBpedia resource IRI are ignored.
        if not line.startswith('<http://dbpedia.org/resource/'):
            continue

        parts = line.strip().split(" ")
        if len(parts) < 4:
            # Fewer than subject, predicate, one literal token and the final
            # token: nothing is written for such lines.
            continue

        # Last path segment of the subject IRI, without the closing '>'.
        entity = parts[0].split('/')[-1][:-1]

        # Splitting on spaces also splits a multi-word label, so the literal
        # is every token between the predicate and the final token.  Joining
        # those tokens with single spaces restores the label text exactly;
        # formatting the token list with str() would leak repr() quoting into
        # the label and force commas and brackets to be stripped from it.
        literal = " ".join(parts[2:-1])

        # Drop the opening quote, then the last four characters.
        # NOTE: this assumes every literal ends with the suffix '"@en'.
        label = literal.strip('"')[:-4]

        output_file.write("__label__dbr_{} {}\n".format(entity, label))


## 3. Transform "mappingbased_objects_en.ttl"

In [5]:
# Convert every triple whose subject is a DBpedia resource into two lines:
# "dbr_<subject> dbo_<predicate> dbr_<object>" and the same relation with
# subject and object swapped.
#
# Both files are opened as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("mappingbased_objects_en.ttl", "r", encoding="utf-8") as input_file, \
        open("trans_objects.ttl", "w", encoding="utf-8") as output_file:
    # Triples whose predicate starts with one of these prefixes are dropped.
    skipped_predicate_prefixes = ("<http://xmlns", "<http://www.w3")

    for line in input_file:  # Iterate over each line in the input file
        # Lines that do not start with a DBpedia resource IRI are ignored.
        if not line.startswith('<http://dbpedia.org/resource/'):
            continue

        triple_parts = line.strip().split(" ")
        if len(triple_parts) < 3:
            # Truncated triple: skip it instead of failing with an IndexError.
            continue

        if triple_parts[1].startswith(skipped_predicate_prefixes):
            continue

        # Strip the surrounding '<' and '>' and keep the last path segment.
        subject = triple_parts[0][1:-1].split("/")[-1]
        predicate = triple_parts[1][1:-1].split("/")[-1]
        object_ = triple_parts[2][1:-1].split("/")[-1]

        # Write the triple as read ...
        output_file.write(f"dbr_{subject} dbo_{predicate} dbr_{object_}\n")
        # ... and, deliberately, a second line with subject and object swapped
        # so the relation is also reachable from the object side.
        output_file.write(f"dbr_{object_} dbo_{predicate} dbr_{subject}\n")


## 4. Merge documents

In [6]:
import shutil

# Concatenate the three converted files into "all.ttl", in the order
# labels, objects, types.
#
# The files are copied as raw bytes: merging does not need to interpret the
# text, and a byte copy cannot fail on characters that the platform's default
# encoding is unable to decode.  All inputs are opened before the output so a
# missing input leaves any existing "all.ttl" untouched.
with open("trans_labels.ttl", "rb") as labels_file, \
        open("trans_objects.ttl", "rb") as objects_file, \
        open("trans_types.ttl", "rb") as types_file, \
        open("all.ttl", "wb") as output_file:
    for part_file in (labels_file, objects_file, types_file):
        # copyfileobj streams in fixed-size chunks, so memory use is constant.
        shutil.copyfileobj(part_file, output_file)


### 4.1 Delete lines containing non-ASCII characters (anything other than English letters, digits, and ASCII punctuation)

In [1]:

import re

def remove_non_english_lines(filename, encoding='utf-8'):
    """Rewrite *filename* in place, keeping only lines that are pure ASCII.

    A line is dropped as soon as it contains any character outside the
    range U+0000..U+007F.

    The file is streamed line by line into a sibling temporary file
    (``<filename>.tmp``), which then replaces the original.  Memory use is
    therefore independent of the file size, and the original is left
    untouched if anything fails before the replacement.

    Args:
        filename: Path of the text file to filter.
        encoding: Text encoding used to read and write the file.

    Raises:
        OSError: If the file cannot be read, written or replaced.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    import os

    # Matching lines containing non-English characters using regular expressions
    pattern = re.compile(r'[^\x00-\x7F]')
    tmp_path = os.fspath(filename) + '.tmp'

    # Open the source first: if it is missing, no temporary file is created.
    with open(filename, 'r', encoding=encoding) as source:
        try:
            with open(tmp_path, 'w', encoding=encoding) as target:
                target.writelines(
                    line for line in source if not pattern.search(line)
                )
        except BaseException:
            # Do not leave a partially written temporary file behind.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    # The source is closed here, so the swap also works on platforms that
    # refuse to replace a file that is still open.
    os.replace(tmp_path, filename)

# Filter the merged file in place so that only pure-ASCII lines remain.
remove_non_english_lines(filename='all.ttl')
