In [8]:
import pandas as pd

# Load the query feature table.
df = pd.read_csv('query_features.csv')

# Keep only rows whose 'query' text contains 'service' in any letter case;
# rows with a missing query never match (na=False).
mentions_service = df['query'].str.contains('service', case=False, na=False)
filtered_df = df[mentions_service]

# Persist just the query text and its normalized parse tree, without the index.
output_columns = ['query', 'normalized_parse_tree']
filtered_df[output_columns].to_csv('output.csv', index=False)


In [16]:
import csv
from collections import defaultdict

def process_triples(triples, allowed_vocabularies):
    """Count (subject, predicate, object) triples in a newline-separated string.

    Each line of ``triples`` is stripped and split on ``', '``. A line is kept
    only when it has at least three parts and both its first and its last part
    contain, as a plain substring, at least one entry of
    ``allowed_vocabularies``. A kept line contributes one triple per middle
    part, each sharing the line's first and last parts (so a three-part line
    contributes exactly itself).

    Args:
        triples: Newline-separated text, one ``', '``-separated sequence per
            line. ``None`` or an empty string yields an empty result.
        allowed_vocabularies: Collection of substrings to look for in the
            first and last part of every line.

    Returns:
        A ``defaultdict(int)`` mapping 3-tuples of strings to the number of
        times that triple occurred in ``triples``.
    """
    triple_counts = defaultdict(int)

    # Callers may hand over None (csv.DictReader uses None for a field that is
    # missing from a short row); treat it like empty text instead of crashing.
    if not triples:
        return triple_counts

    def mentions_vocabulary(text):
        # Substring test, not a token match: 'go' also matches inside 'goa'.
        return any(vocab in text for vocab in allowed_vocabularies)

    for line in triples.split('\n'):
        parts = line.strip().split(', ')
        if len(parts) < 3:
            continue
        subject, *predicates, obj = parts
        if not (mentions_vocabulary(subject) and mentions_vocabulary(obj)):
            continue
        # One triple per middle element; for a 3-part line this is the line itself.
        for predicate in predicates:
            triple_counts[(subject, predicate, obj)] += 1
    return triple_counts

def write_filtered_triples(input_filepath, output_filepath, allowed_vocabularies):
    """Aggregate triple counts from a CSV's 'triples' column into a new CSV.

    Every row of the input CSV has its ``'triples'`` field passed to
    ``process_triples``; the per-row counts are summed over the whole file and
    written out as rows of ``subject, predicate, object, count`` below a
    header line.

    Args:
        input_filepath: Path of a UTF-8 CSV file with a header row. Rows whose
            ``'triples'`` field is absent or empty contribute nothing.
        output_filepath: Path of the CSV file to create (overwritten if it
            exists).
        allowed_vocabularies: Passed through unchanged to ``process_triples``.

    Returns:
        None. The result is the file written at ``output_filepath``.
    """
    triple_counts = defaultdict(int)

    # Read and aggregate before touching the output file, so a failure while
    # processing does not leave behind a truncated, header-only result.
    with open(input_filepath, mode='r', newline='', encoding='utf-8') as infile:
        for row in csv.DictReader(infile):
            # DictReader stores None for a column missing from a short row, so
            # a .get() default alone is not enough to guarantee a string.
            triples = row.get('triples') or ''
            row_counts = process_triples(triples, allowed_vocabularies)
            for triple, count in row_counts.items():
                triple_counts[triple] += count

    with open(output_filepath, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['subject', 'predicate', 'object', 'count'])
        for triple, count in triple_counts.items():
            writer.writerow(list(triple) + [count])

# Substrings that the first and last element of a triple must contain for the
# triple to be counted.
allowed_vocabularies = {
    'sgd', 'taxonomy', 'homologene', 'interpro', 'bioportal', 'clinicaltrials',
    'kegg', 'pharmgkb', 'hgnc', 'mesh', 'omim', 'sider', 'ctd', 'drugbank',
    'mgi', 'goa', 'ndc', 'wormbase', 'lsr', 'affymetrix', 'ncbigene',
    'irefindex', 'eco', 'hp', 'go', 'apo',
}

# Source CSV (must provide a 'triples' column) and destination for the counts.
# The spelling of the output name is kept as is: a later cell reads the file
# back under exactly this name.
input_filepath = 'entity_predicates.csv'
output_filepath = 'federatated_query26.csv'

# Aggregate the triple counts and report completion.
write_filtered_triples(
    input_filepath=input_filepath,
    output_filepath=output_filepath,
    allowed_vocabularies=allowed_vocabularies,
)
print("done!")

done!


In [17]:
import pandas as pd
import re

# Vocabulary substrings looked up in the subject and object values below.
allowed_vocabularies = {
    'sgd',
    'taxonomy',
    'homologene',
    'interpro',
    'bioportal',
    'clinicaltrials',
    'kegg',
    'pharmgkb',
    'hgnc',
    'mesh',
    'omim',
    'sider',
    'ctd',
    'drugbank',
    'mgi',
    'goa',
    'ndc',
    'wormbase',
    'lsr',
    'affymetrix',
    'ncbigene',
    'irefindex',
    'eco',
    'hp',
    'go',
    'apo',
}

# Read the subject / predicate / object / count table.
df = pd.read_csv('federatated_query26.csv')

# Function to extract vocabulary from URI
def extract_vocabulary(uri):
    """Return the allowed vocabulary name that occurs in ``uri``.

    Matching is a plain substring test against the module-level
    ``allowed_vocabularies``. Candidates are tried longest first (ties broken
    alphabetically), so when several names occur in the same value the most
    specific one wins -- e.g. 'goa' rather than 'go' -- and the answer does not
    depend on set iteration order, which changes between interpreter runs.

    Args:
        uri: The value to inspect. Anything that is not a string (for example
            a NaN from an empty CSV cell) is treated as having no vocabulary.

    Returns:
        The matching vocabulary name, or ``None`` when nothing matches.
    """
    if not isinstance(uri, str):
        return None
    # Deterministic order: longer names first, then alphabetical.
    for vocab in sorted(allowed_vocabularies, key=lambda name: (-len(name), name)):
        if vocab in uri:
            return vocab
    return None  # or some default value, e.g., 'unknown'

# Collapse the subject and object values to the vocabulary each one mentions.
for uri_column in ('subject', 'object'):
    df[uri_column] = df[uri_column].apply(extract_vocabulary)

# Strip the digits that follow 'var' in predicates (e.g. 'var12' becomes 'var').
numbered_var = re.compile(r'var\d+')
df['predicate'] = df['predicate'].apply(lambda text: numbered_var.sub('var', text))

# Keep only rows whose subject and object differ.
crosses_vocabularies = df['subject'] != df['object']
df_filtered = df[crosses_vocabularies]

# Sum the remaining columns for every (subject, predicate, object) combination.
pattern_keys = ['subject', 'predicate', 'object']
grouped = df_filtered.groupby(pattern_keys)
result_df = grouped.sum().reset_index()

# Write the aggregated patterns to a new CSV, without the index.
result_df.to_csv('2019_query_DS_rel_patterns26.csv', index=False)
