## This code identifies the independent hits (criteria defined below) using the PheWeb output

In [2]:
import json
from collections import defaultdict

# Define the file path (PheWeb-generated top_hits.json for the binary traits)
file_path = '/lustre06/project/6060121/CLSA_PheWeb_shared/pheweb/binary/generated-by-pheweb/top_hits.json'

# Load the JSON data.
# The encoding is passed explicitly so that decoding does not depend on the
# locale of the machine the notebook runs on.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Define the distance threshold for independence (500kb)
DISTANCE_THRESHOLD = 500000  # 500 kb in base pairs
P_VALUE_THRESHOLD = 5e-8  # P-value threshold for significance

# Function to check if a hit is independent
def is_independent(new_hit, hits, distance_threshold=None):
    """Return True if *new_hit* is not close to any hit already in *hits*.

    Two hits are considered "close" when they share the same ``'chrom'``
    value and their ``'pos'`` values differ by strictly less than the
    distance threshold. A hit exactly ``distance_threshold`` base pairs away
    is therefore treated as independent.

    Args:
        new_hit: Mapping with ``'chrom'`` and ``'pos'`` keys.
        hits: Iterable of previously accepted hits (same mapping shape).
        distance_threshold: Window size in base pairs. When omitted, the
            module-level ``DISTANCE_THRESHOLD`` is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        bool: ``False`` as soon as one close hit is found, ``True`` otherwise
        (including when *hits* is empty).
    """
    if distance_threshold is None:
        distance_threshold = DISTANCE_THRESHOLD

    return not any(
        new_hit['chrom'] == other['chrom']
        and abs(new_hit['pos'] - other['pos']) < distance_threshold
        for other in hits
    )

# Dictionary to store hits and independent hits categorized by 'category'.
# Each entry keeps both the running counts and the underlying hit records.
categories = defaultdict(lambda: {'hits': 0, 'independent_hits': 0, 'hits_list': [], 'independent_hits_list': []})

# Keep only genome-wide significant hits and order them from most to least
# significant. The independence check below is greedy: a hit is rejected if it
# lies near a hit that was accepted earlier, so the strongest signal in each
# region must be seen first for it to become the lead. Sorting here makes the
# result independent of the order of records in the input file; because the
# sort is stable, an input that is already ordered by p-value is processed in
# exactly the same order as before.
significant_hits = sorted(
    (record for record in data if record['pval'] < P_VALUE_THRESHOLD),
    key=lambda record: record['pval'],
)

# Process each significant hit
for hit in significant_hits:
    category = hit['category']
    info = categories[category]
    info['hits'] += 1
    info['hits_list'].append(hit)
    # Independence is evaluated within a category only: hits in other
    # categories never suppress this one.
    if is_independent(hit, info['independent_hits_list']):
        info['independent_hits'] += 1
        info['independent_hits_list'].append(hit)

# Display the results
for category, info in categories.items():
    print(f"Category: {category}")
    print(f"Total Hits: {info['hits']}")
    print(f"Independent Hits: {info['independent_hits']}\n")


Category: health
Total Hits: 177
Independent Hits: 163

Category: socio-economic
Total Hits: 218
Independent Hits: 180

Category: behaviour
Total Hits: 85
Independent Hits: 79

Category: medications
Total Hits: 5
Independent Hits: 4

Category: Identity
Total Hits: 1
Independent Hits: 1

