## Dataset preprocessing

In [1]:
import pandas as pd
df = pd.read_csv('2018-02-26_2018-03-25_all.tsv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()

82211742


Unnamed: 0,0
0,anonymizedQuery\ttimestamp\tsourceCategory\tus...
1,SELECT+*%0AWHERE+%7B%0A++%3Fvar1++%3Chttp%3A%2...
2,SELECT+%3Fvar1++%3Fvar2Label++%3Fvar3++%3Fvar4...
3,+ASK%0AWHERE+%7B%0A++BIND+%28++YEAR+%28++%3Fva...
4,SELECT+*%0AWHERE+%7B%0A++%3Fvar1++%3Chttp%3A%2...


In [1]:
import pandas as pd

# Specify the path to your TSV file
file_path = '2018-02-26_2018-03-25_all.tsv'  # replace with actual path

# Load the file and filter for robotic queries directly
df = pd.read_csv(file_path, delimiter='\t', usecols=['anonymizedQuery', 'sourceCategory'])

# Filter for rows where sourceCategory is 'robotic'
robotic_queries = df[df['sourceCategory'] == 'robotic']['anonymizedQuery']

# Count the total number of robotic queries
total_robotic_queries = len(robotic_queries)
print(f"Total number of robotic queries: {total_robotic_queries}")

# Get unique queries and their counts
unique_queries_counts = robotic_queries.value_counts()

# Write the unique queries and their counts to an output CSV file
output_file = 'wiki-robotic-unique-count.csv'  # specify the output path
unique_queries_counts.to_csv(output_file, header=['count'], index_label='query')

print(f"Unique queries and their counts saved to {output_file}")


Total number of robotic queries: 81339186
Unique queries and their counts saved to wiki-robotic-unique-count.csv


In [2]:
import pandas as pd
df = pd.read_csv('wiki-robotic-unique-count.csv')

print(len(df))
df.head()

18940103


Unnamed: 0,query,count
0,SELECT%28++REGEX+%28++%22string1%22%2C+%22stri...,2624171
1,SELECT+*%0AWHERE+%7B%0A++%3Chttp%3A%2F%2Fwww.w...,804269
2,+ASK%0AWHERE+%7B%0A++BIND+%28++%3Chttp%3A%2F%2...,789038
3,SELECT+%3Fvar1++%3Fvar2+%0AWHERE+%7B%0A++VALUE...,749423
4,SELECT+%3Fvar1++%3Fvar2++%3Fvar3++%3Fvar4++%3F...,507649


In [3]:
import pandas as pd
import urllib.parse

# Load the encoded CSV file (replace 'your_file.csv' with your actual file path)
df = pd.read_csv('wiki-robotic-unique-count.csv')

# Decode the 'query' column
df['query'] = df['query'].apply(urllib.parse.unquote_plus)

# Save the decoded data to a new CSV file
df.to_csv('wiki-robotic-unique-count-decoded_queries2.csv', index=False)

print("Decoded CSV saved as 'decoded_queries.csv'")


Decoded CSV saved as 'decoded_queries.csv'


In [4]:
import pandas as pd
df = pd.read_csv('wiki-robotic-unique-count-decoded_queries2.csv')

print(len(df))
df.head()

18940103


Unnamed: 0,query,count
0,"SELECT( REGEX ( ""string1"", ""string2"" ) AS ...",2624171
1,SELECT *\nWHERE {\n <http://www.wikidata.org>...,804269
2,ASK\nWHERE {\n BIND ( <http://www.wikidata....,789038
3,SELECT ?var1 ?var2 \nWHERE {\n VALUES ( ?va...,749423
4,SELECT ?var1 ?var2 ?var3 ?var4 ?var5 ?var...,507649


In [5]:
#Prefix Addition 

import pandas as pd

def add_prefix(query):
    # The common prefix to add to each query
    prefix = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

"""
    return prefix + query

def add_prefix_to_csv(input_file, output_file):
    # Read the CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file)

    # Add the prefix to the "query" column using the add_prefix function
    df['query'] = df['query'].apply(add_prefix)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False)

if __name__ == "__main__":
    input_csv_file = "wiki-robotic-unique-count-decoded_queries2.csv"  # Replace with the path to your input CSV file
    output_csv_file = "query_prefixes_added.csv"  # Replace with the path to your output CSV file
    add_prefix_to_csv(input_csv_file, output_csv_file)
    print('Done!')

Done!


In [6]:
df = pd.read_csv('query_prefixes_added.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()

18940104


Unnamed: 0,0,1
0,query,count
1,PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,2624171
2,PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,804269
3,PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,789038
4,PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-...,749423


In [None]:
#  parse queries
# Install Node.js dependencies
!npm install sparqljs csv-parser csv-stringify

# Create the JavaScript file
js_code = """
const fs = require('fs');
const SparqlParser = require('sparqljs').Parser;
const csvParser = require('csv-parser');
const { stringify } = require('csv-stringify');

const parser = new SparqlParser();

async function executeQuery(query) {
  try {
    const parsedQuery = parser.parse(query);
    return JSON.stringify(parsedQuery);
  } catch (error) {
    console.error('Error parsing query:', query);
    return 'Error parsing the query.';
  }
}

async function writeBatchToCsv(batch, outputFile) {
  return new Promise((resolve, reject) => {
    const writeStream = fs.createWriteStream(outputFile, { flags: 'a' });
    const csvStringifier = stringify({ header: false, columns: [ 'Parsed_Query','Count'], delimiter: ',' });

    writeStream.on('error', (error) => {
      reject(error);
    });

    csvStringifier.pipe(writeStream);

    csvStringifier.on('end', () => {
      writeStream.end();
      resolve();
    });

    batch.forEach((entry) => {
      csvStringifier.write([ entry.Parsed_Query , entry.Count ]);
    });

    csvStringifier.end();
  });
}

async function main() {
  const inputCsvFile = 'query_prefixes_added.csv';
  const outputCsvFile = 'wikidata-robotic-parsed.csv';
  const batchSize = 1000000;

  const readStream = fs.createReadStream(inputCsvFile).pipe(csvParser());

  let batch = [];
  for await (const row of readStream) {
    const sparqlQuery = row['query'];
    const count = row['count'];

    const parsedQuery = await executeQuery(sparqlQuery);

    batch.push({ Parsed_Query: parsedQuery ,Count: count });

    if (batch.length >= batchSize) {
      await writeBatchToCsv(batch, outputCsvFile);
      console.log(`Processed ${batch.length} queries.`);
      batch = [];
    }
  }

  if (batch.length > 0) {
    await writeBatchToCsv(batch, outputCsvFile);
  }

  console.log('All SPARQL queries executed and results written to output CSV file.');
}

main();
"""

# Write JavaScript code to a file
with open("optimized.js", "w") as f:
    f.write(js_code)

# Python code to run the JavaScript script
python_code = """
import subprocess

def run_script():
    # Run the JavaScript code using Node.js with an increased memory limit
    result = subprocess.run(['node', '--max-old-space-size=30720', 'optimized.js'], stdout=subprocess.PIPE, text=True)

    # Print the output (JSON representation of the parsed SPARQL query)
    print(result.stdout)
    print('hi')

if __name__ == "__main__":
    run_script()
"""

# Write Python code to a file
with open("optimized_from_js_.py", "w") as f:
    f.write(python_code)

# Execute the script directly
print("Running the script...")

!python optimized_from_js_.py > parseoutput.txt 2>&1

# Print the log output
with open("parseoutput.txt", "r") as log_file:
    output = log_file.read()
    print(output)


In [1]:
import pandas as pd
df = pd.read_csv('wikidata-robotic-parsed.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()

18940103


Unnamed: 0,0,1
0,"{""queryType"":""SELECT"",""variables"":[{""expressio...",2624171
1,"{""queryType"":""SELECT"",""variables"":[{}],""where""...",804269
2,"{""queryType"":""ASK"",""where"":[{""type"":""bind"",""va...",789038
3,"{""queryType"":""SELECT"",""variables"":[{""termType""...",749423
4,"{""queryType"":""SELECT"",""variables"":[{""termType""...",507649


In [2]:
# seperate valid queries and unvalid queries

import csv

# Define the input and output file paths
input_file_path = 'wikidata-robotic-parsed.csv'
output_file_path = 'valid_wikidata-robotic-parsed.csv'
s = 0
# Open the input CSV file for reading
with open(input_file_path, 'r', newline='\n', encoding='utf-8') as input_file:
    # Create a CSV reader without header
    reader = csv.reader(input_file)
    
    # Create a list to store valid rows
    valid_rows = []
    
    for row in reader:
        if row[0] == 'Error parsing the query.':
            s += 1
        # Check if the second column is not equal to 'Error parsing the query'
        elif len(row) >= 2 and row[0] != 'Error parsing the query.':
            valid_rows.append(row)

# Open the output CSV file for writing
with open(output_file_path, 'w', newline='\n', encoding='utf-8') as output_file:
    # Create a CSV writer with column names
    writer = csv.writer(output_file)
    
    # Write column names
    writer.writerow(['parsed_query', 'count' ])
    
    # Write the valid rows
    writer.writerows(valid_rows)

print(f"Valid rows have been written to '{output_file_path}'.")
print(s)

Valid rows have been written to 'valid_wikidata-robotic-parsed.csv'.
53


In [3]:
import pandas as pd
df = pd.read_csv('valid_wikidata-robotic-parsed.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()
#88494

18940051


Unnamed: 0,0,1
0,parsed_query,count\r
1,"{""queryType"":""SELECT"",""variables"":[{""expressio...",2624171\r
2,"{""queryType"":""SELECT"",""variables"":[{}],""where""...",804269\r
3,"{""queryType"":""ASK"",""where"":[{""type"":""bind"",""va...",789038\r
4,"{""queryType"":""SELECT"",""variables"":[{""termType""...",749423\r


In [None]:
#normalize parse tree

import pandas as pd
import json
from multiprocessing import Pool, Lock

# Initialize a global lock for writing to the file safely across processes
lock = Lock()

# The normalization function
def find_and_normalize_variables(parsed_query):
    unique_vars = set()

    def find_variables(node):
        if isinstance(node, dict):
            for key, value in node.items():
                if isinstance(value, dict) or isinstance(value, list):
                    find_variables(value)
                elif key == "value" and node.get("termType") == "Variable":
                    unique_vars.add(value)
        elif isinstance(node, list):
            for item in node:
                find_variables(item)

    def normalize_variables(node, var_mapping):
        if isinstance(node, dict):
            for key, value in node.items():
                if isinstance(value, dict) or isinstance(value, list):
                    normalize_variables(value, var_mapping)
                elif key == "value" and node.get("termType") == "Variable":
                    node[key] = var_mapping.get(value, value)
        elif isinstance(node, list):
            for item in node:
                normalize_variables(item, var_mapping)

    find_variables(parsed_query)
    var_mapping = {var: f"var{index + 1}" for index, var in enumerate(sorted(unique_vars))}
    normalize_variables(parsed_query, var_mapping)

    if "variables" in parsed_query:
        for variable in parsed_query["variables"]:
            var_name = variable.get("value")
            if var_name in var_mapping:
                variable["value"] = var_mapping[var_name]

    return parsed_query

# Function to apply normalization to each row's parsed query
def process_row(row):
    parsed_query = json.loads(row['parsed_query'])
    normalized_query = find_and_normalize_variables(parsed_query)
    return json.dumps(normalized_query)

# Function to process a chunk of data, used by each worker process
def process_chunk(chunk):
    chunk['normalized_parse_tree'] = chunk.apply(process_row, axis=1)
    return chunk[['normalized_parse_tree', 'count']]

# Function to write a processed chunk to the output file with a lock
def write_chunk_to_file(chunk, output_file):
    with lock:
        chunk.to_csv(output_file, mode='a', index=False, header=False)

# Main processing function with parallel processing and file locking
def process_large_file_in_chunks(input_file, output_file, chunk_size=50000, num_cpus=30):
    # Write the header to the output file initially
    pd.DataFrame(columns=['normalized_parse_tree', 'count']).to_csv(output_file, index=False)

    # Use multiprocessing pool with specified number of CPUs
    with Pool(processes=num_cpus) as pool:
        # Read the file in chunks and process each chunk in parallel
        for chunk in pd.read_csv(input_file, chunksize=chunk_size):
            # Apply `process_chunk` on each chunk asynchronously
            result = pool.apply_async(process_chunk, args=(chunk,))
            # Write the processed chunk to file once it's ready
            write_chunk_to_file(result.get(), output_file)

# Run the optimized processing with parallelism and chunking
if __name__ == "__main__":
    input_file = 'valid_wikidata-robotic-parsed.csv'
    output_file = 'tree_normalized.csv'
    chunk_size = 1000000
    num_cpus = 20

    process_large_file_in_chunks(input_file, output_file, chunk_size, num_cpus)
    print('done')


In [None]:
import pandas as pd
df = pd.read_csv('tree_normalized.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()
18940050

In [None]:
import csv
import json
import pandas as pd
from multiprocessing import Pool, Lock, cpu_count

# File paths
input_csv_file = 'tree_normalized.csv'
output_csv_file = 'query_features.csv'
count_triples_file = 'count_triples.csv'

# Initialize a global lock for safe writing to the output file
lock = Lock()

# Define a recursive function to handle nested path types
def extract_predicate_values(items):
    predicates = []
    for item in items:
        if "value" in item:
            predicates.append(item["value"])
        elif "pathType" in item:
            nested_predicates = extract_predicate_values(item.get("items", []))
            predicates.extend(nested_predicates)
        else:
            predicates.append("Unknown")
    return predicates

# Modify the extract_triples function to use the recursive function
def extract_triples(parse_tree):
    local_triples = []
    if "triples" in parse_tree:
        for triple in parse_tree["triples"]:
            try:
                subject = triple.get("subject", {}).get("value", "Unknown")
                
                # Check if the predicate is a direct value or a path
                if "value" in triple.get("predicate", {}):
                    predicate = triple["predicate"]["value"]
                elif "pathType" in triple.get("predicate", {}):
                    items = triple["predicate"].get("items", [])
                    predicate_values = extract_predicate_values(items)
                    predicate = ", ".join(predicate_values)
                else:
                    predicate = "nothing"
                
                obj = triple.get("object", {}).get("value", "Unknown")
                local_triples.append((subject, predicate, obj))
            except KeyError as e:
                print(f"Error extracting triple: {e}")
                print("Offending triple:", triple)
    
    # Recursively call the function on child nodes
    for key, value in parse_tree.items():
        if isinstance(value, dict):
            local_triples += extract_triples(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    local_triples += extract_triples(item)
    
    return local_triples

# Process a single row and extract triples
def process_row(row, fieldnames):
    row_dict = {fieldnames[i]: row[i] for i in range(len(row))}
    try:
        parse_tree = json.loads(row[0])
        triples = extract_triples(parse_tree)
        triples_str = '\n'.join([f"{s}, {p}, {o}" for s, p, o in triples])
        row_dict['triples'] = triples_str
    except json.JSONDecodeError:
        row_dict['triples'] = "Invalid JSON"
    return row_dict

# Write processed rows to the output file safely with a lock
def write_rows(rows, fieldnames):
    with lock:
        with open(output_csv_file, 'a', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            for row in rows:
                writer.writerow(row)

# Process a chunk of data, applying process_row to each row
def process_chunk(chunk, fieldnames):
    processed_rows = [process_row(row, fieldnames) for row in chunk]
    write_rows(processed_rows, fieldnames)

# Main processing function with parallelism and chunking
def process_large_file_in_chunks(input_file, chunk_size=700000, num_cpus=20):
    # Get the header and initialize the output file with the header row
    with open(input_file, 'r') as infile:
        reader = csv.reader(infile)
        fieldnames = next(reader) + ['triples']
        with open(output_csv_file, 'w', newline='') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

    # Set up a pool for parallel processing
    with Pool(processes=num_cpus) as pool:
        chunk = []
        with open(input_file, 'r') as infile:
            reader = csv.reader(infile)
            next(reader)  # Skip header row

            # Process each row in chunks
            for row in reader:
                chunk.append(row)
                if len(chunk) >= chunk_size:
                    pool.apply_async(process_chunk, args=(chunk, fieldnames))
                    chunk = []
            
            # Process any remaining rows in the last chunk
            if chunk:
                pool.apply_async(process_chunk, args=(chunk, fieldnames))
        
        pool.close()
        pool.join()

    # Load and inspect the resulting DataFrame
    df_input = pd.read_csv(input_file)
    print("Length of tree_normalized.csv:", len(df_input))
    print("Head of tree_normalized.csv:")
    print(df_input.head())

    df_output = pd.read_csv(output_csv_file)
    print("Length of query_features.csv:", len(df_output))
    print("Head of query_features.csv:")
    print(df_output.head())

    # Calculate unique values in 'normalized_parse_tree'
    unique_count = df_input['normalized_parse_tree'].nunique()
    print("Number of unique values in 'normalized_parse_tree':", unique_count)

    # Select 'count' and 'triples' columns and save to count_triples.csv
    selected_columns = df_output[['count', 'triples']]
    selected_columns.to_csv(count_triples_file, index=False)
    print("Selected columns 'count' and 'triples' written to", count_triples_file)

if __name__ == "__main__":
    process_large_file_in_chunks(input_csv_file, chunk_size=700000, num_cpus=20)
    print("Processing complete. Output written to", output_csv_file)


In [None]:
import pandas as pd
df = pd.read_csv('query_features.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()
# 844132

In [None]:
# count and triples 


import pandas as pd

# Define the path to the input CSV file
input_csv_path = 'query_features.csv'  # Make sure to replace this with the actual path to your CSV file

# Define the path to the output CSV file
output_csv_path = 'count_triples.csv'

# Read the input CSV file into a pandas DataFrame
df = pd.read_csv(input_csv_path)

# Select the 'count' and 'triples' columns
selected_columns = df[['count', 'triples']]

# Write the selected columns to the output CSV file
selected_columns.to_csv(output_csv_path, index=False)

print('CSV file "count_triples.csv" has been created successfully.')


In [1]:
import pandas as pd
df = pd.read_csv('count_triples.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()

18940051


Unnamed: 0,0,1
0,count,triples
1,2624171,
2,804269,"http://www.wikidata.org, http://schema.org/dat..."
3,789038,"var1, http://www.wikidata.org/prop/direct/P279..."
4,749423,"var1, http://www.wikidata.org/prop/direct/P434..."


In [None]:


import csv

def refine_list(input_list, exclude_starts_with, exclude_contains):
    """
    Refines the input list by excluding items based on the starting strings or contained strings.
    """
    refined_list = []
    for item in input_list:
        if any(item.startswith(exclude) for exclude in exclude_starts_with) or any(exclude in item for exclude in exclude_contains):
            continue
        refined_list.append(item)
    return refined_list

def process_triples(triples):
    errors = 0
    entities = []
    predicates = []
    for triple in triples.split('\n'):
        parts = triple.split(', ')
        
        # Handling triples with more than three elements
        if len(parts) != 3:
            if len(parts) > 3 and parts[0] != "http://www.bigdata.com/rdf#serviceParam":
                entities.append(parts[0])
                entities.append(parts[-1])
                predicates.extend(parts[1:-1])
            elif len(parts) > 3 and parts[0] == "http://www.bigdata.com/rdf#serviceParam":
                entities.append(parts[0])
                predicates.append(parts[1])
                entities.extend(parts[2:]) 
            else:
                errors += 1
            continue
        
        entities.extend([parts[0], parts[2]])
        predicates.append(parts[1])

    exclude_starts_with = ['var', 'e_b', 'g_']
    exclude_contains = ['nonsensical']
    
    # Refining entities and predicates lists
    refined_entities = refine_list(entities, exclude_starts_with, exclude_contains)
    refined_predicates = refine_list(predicates, exclude_starts_with, exclude_contains)
    
    return refined_entities, refined_predicates, errors

def write_output(input_csv, output_csv):
    total_errors = 0
    with open(input_csv, mode='r', encoding='utf-8') as infile, \
         open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames + ['entities', 'predicates']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            entities, predicates, errors = process_triples(row['triples'])
            total_errors += errors
            row['entities'] = '; '.join(entities)
            row['predicates'] = '; '.join(predicates)
            writer.writerow(row)
    return total_errors

# Replace 'input.csv' and 'output.csv' with the actual filenames
input_csv = 'count_triples.csv'
output_csv = 'entity_predicates.csv'

errors_encountered = write_output(input_csv, output_csv)
print("Enhanced data with entities and predicates have been saved to", output_csv)
print(f"Total triples processing errors encountered: {errors_encountered}")


Enhanced data with entities and predicates have been saved to entity_predicates.csv
Total triples processing errors encountered: 456039


In [3]:
import pandas as pd
df = pd.read_csv('entity_predicates.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()
# 844132

18940051


Unnamed: 0,0,1,2,3
0,count,triples,entities,predicates\r
1,2624171,,,\r
2,804269,"http://www.wikidata.org, http://schema.org/dat...",http://www.wikidata.org,http://schema.org/dateModified\r
3,789038,"var1, http://www.wikidata.org/prop/direct/P279...",http://www.bigdata.com/queryHints#Prior; forward,http://www.wikidata.org/prop/direct/P279; http...
4,749423,"var1, http://www.wikidata.org/prop/direct/P434...",,http://www.wikidata.org/prop/direct/P434\r


# Unique queries

## Used schema types


In [1]:
import pandas as pd

# Read the input CSV file
input_csv = "entity_predicates.csv"  # Update this path to your input CSV file
df = pd.read_csv(input_csv)

# Initialize a dictionary to hold the aggregated counts
aggregated_counts = {}

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # Check if the "entities" column value is a string
    if isinstance(row['entities'], str):
        # Split the entities and convert to a set of unique entities
        unique_entities = set(row['entities'].strip().split(';'))
    else:
        # If not a string, proceed with an empty set for this row
        unique_entities = set()
    
    # Simply count each unique entity as one occurrence per row
    for entity in unique_entities:
        entity = entity.strip()
        if entity in aggregated_counts:
            aggregated_counts[entity] += 1
        else:
            aggregated_counts[entity] = 1

# Convert the aggregated dictionary to a DataFrame for writing to CSV
output_df = pd.DataFrame(list(aggregated_counts.items()), columns=['Entity', 'TotalCount'])

# Write the output DataFrame to a CSV file
output_csv = "8_count_unique_entity_no_repeat.csv"  # Update this path to your output CSV file
output_df.to_csv(output_csv, index=False)

print(f"Aggregated counts written to {output_csv}")


Aggregated counts written to 8_count_unique_entity_no_repeat.csv


In [2]:
import pandas as pd
df = pd.read_csv('8_count_unique_entity_no_repeat.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()
# 31004

12318451


Unnamed: 0,0,1
0,Entity,TotalCount
1,http://www.wikidata.org,1
2,http://www.bigdata.com/queryHints#Prior,37221
3,forward,23678
4,string1,17796


In [9]:
# extract the types from queries that exist in KG schema

import csv

# File paths _data/codes/dsri__6_march/journal_paper_codes/wikidata/data
input_file_path_1 = '8_count_unique_entity_no_repeat.csv'  # First input file with two columns: type, TotalCount
input_file_path_2 = 'clasenahayi.csv'  # Second input file with one column
output_file_path = 'robotic_heatmap_unique_types2018_no_repeat.csv'   # Output file

# Read the types from the second input file into a set for faster search
types_in_second_file = set()
with open(input_file_path_2, mode='r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header if there is one
    for row in reader:
        types_in_second_file.add(row[0])

# Read the first input file and write relevant rows to the output file
with open(input_file_path_1, mode='r', newline='') as infile, \
     open(output_file_path, mode='w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    
    header = next(reader)  # Assuming the first row is a header
    writer.writerow(header)  # Write the header to the output file
    
    for row in reader:
        
        if row[0] in types_in_second_file:  # Check if the type is in the second file
            writer.writerow(row)  # Write the whole row to the output file
        # else:
            # print( row[0])
print("Aggregated counts written to heatmap_types.csv")

Aggregated counts written to heatmap_types.csv


In [10]:
import pandas as pd
df = pd.read_csv('robotic_heatmap_unique_types2018_no_repeat.csv', lineterminator='\n', dtype=str)
df_u= df.drop_duplicates()
print(len(df))
print(len(df_u))
df.head()
# 12590

58648
58648


Unnamed: 0,Entity,TotalCount\r
0,http://www.wikidata.org/entity/Q698,9\r
1,http://www.wikidata.org/entity/Q5,441414\r
2,http://www.wikidata.org/entity/Q16521,1232\r
3,http://www.wikidata.org/entity/Q15981151,15\r
4,http://www.wikidata.org/entity/Q6581072,1833\r


#used Schema predicates based on the unique queries



In [5]:
import pandas as pd

# Read the input CSV file
input_csv = "entity_predicates.csv"  # Update this path to your input CSV file
df = pd.read_csv(input_csv)

# Initialize a dictionary to hold the aggregated counts
aggregated_counts = {}

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # Check if the "entities" column value is a string
    if isinstance(row['predicates'], str):
        # Split the entities and convert to a set of unique entities
        unique_entities = set(row['predicates'].strip().split(';'))
    else:
        # If not a string, proceed with an empty set for this row
        unique_entities = set()

    # Aggregate counts across all rows
    for entity in unique_entities:
        entity = entity.strip()
        if entity in aggregated_counts:
            aggregated_counts[entity] += 1
        else:
            aggregated_counts[entity] = 1

# Convert the aggregated dictionary to a DataFrame for writing to CSV
output_df = pd.DataFrame(list(aggregated_counts.items()), columns=['predicate', 'TotalCount'])

# Write the output DataFrame to a CSV file
output_csv = "8_unique_count_predicates_no_repeat.csv"  # Update this path to your output CSV file
output_df.to_csv(output_csv, index=False)

print(f"Aggregated counts written to {output_csv}")


Aggregated counts written to 8_unique_count_predicates_no_repeat.csv


In [6]:
import pandas as pd
df = pd.read_csv('8_unique_count_predicates_no_repeat.csv', dtype=str)

print(len(df))
df.head()
#1911

14164


Unnamed: 0,predicate,TotalCount
0,http://schema.org/dateModified,59
1,http://www.bigdata.com/queryHints#gearing,37221
2,http://www.wikidata.org/prop/direct/P279,1064266
3,http://www.wikidata.org/prop/direct/P434,1100
4,http://www.wikidata.org/prop/P105,4942


In [None]:
input_file_path_2 = 'kpredicate.csv'  # Second input file with one column
output_file_path = 'heatmap_predicates2017_no_repeat.csv'

In [11]:
# extract valid predicates that exist in schema predicate

import csv

# File paths
input_file_path_1 = '8_unique_count_predicates_no_repeat.csv'  # First input file with two columns: type, TotalCount
input_file_path_2 = 'kpredicate.csv'  # Second input file with one column
output_file_path = 'robotic_heatmap_unique_predicates2018_no_repeat.csv'   # Output file

# Read the types from the second input file into a set for faster search
types_in_second_file = set()
with open(input_file_path_2, mode='r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header if there is one
    for row in reader:
        types_in_second_file.add(row[0])

# Read the first input file and write relevant rows to the output file
with open(input_file_path_1, mode='r', newline='') as infile, \
     open(output_file_path, mode='w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    
    header = next(reader)  # Assuming the first row is a header
    writer.writerow(header)  # Write the header to the output file
    
    for row in reader:
        if row[0] in types_in_second_file:  # Check if the type is in the second file
            writer.writerow(row)  # Write the whole row to the output file
print("Aggregated counts written to heatmap_predicates77.csv")


Aggregated counts written to heatmap_predicates77.csv


In [12]:
import pandas as pd
df = pd.read_csv('robotic_heatmap_unique_predicates2018_no_repeat.csv', lineterminator='\n', dtype=str, header=None)

print(len(df))
df.head()

935


Unnamed: 0,0,1
0,predicate,TotalCount\r
1,http://www.wikidata.org/prop/direct/P279,1064266\r
2,http://www.wikidata.org/prop/direct/P17,75600\r
3,http://www.wikidata.org/prop/direct/P21,22934\r
4,http://www.wikidata.org/prop/direct/P19,55728\r
