In [1]:
import gzip
import shutil

# Rule: Ensure the correct path for the input and output files.
with gzip.open('CAD_META.gz', 'rb') as f_in:
    with open('CAD_META.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("GWAS summary data decompressed successfully.")


GWAS summary data decompressed successfully.


In [1]:
import pandas as pd

# Define file path for decompressed file
Genetic_Data_Summary = "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Genetic_Data/New_Genetic_Data/CAD_META.txt"  # Replace with your actual path

# Try reading the file in chunks if it's too large
chunk_size = 10000  # Number of rows per chunk
chunks = pd.read_csv(Genetic_Data_Summary, delimiter='\t', chunksize=chunk_size, engine='python')  # Removed low_memory

# Concatenate chunks into a single DataFrame
genetic_df = pd.concat(chunks, ignore_index=True)

# Display first 10 rows to verify loading
print(genetic_df.head(10))

# Display columns to understand dataset structure
print(genetic_df.columns)
print(genetic_df.shape)

# Get basic info on data types and non-null counts
print(genetic_df.info())

# Rule: Clean the data by removing rows with missing critical data and filter significant variants.
# Use 'MarkerName' and 'P-value' instead of 'SNP' and 'P'
genetic_df_cleaned = genetic_df.dropna(subset=['MarkerName', 'Effect', 'P-value'])  # Remove missing critical data
significant_genetic_df = genetic_df_cleaned[genetic_df_cleaned['P-value'] < 5e-8]  # Filter by p-value threshold

# Save cleaned data for future use
significant_genetic_df.to_csv('significant_genetic_data.csv', index=False)
print("Data cleaned and significant SNPs extracted.")

         MarkerName Allele1 Allele2   Freq1  FreqSE  MinFreq  MaxFreq  Effect  \
0  10:100000625_A_G       a       g  0.5604  0.0081   0.5499   0.5667  0.0264   
1  10:100000645_A_C       a       c  0.8060  0.0089   0.7996   0.8184 -0.0119   
2  10:100001867_C_T       t       c  0.0129  0.0007   0.0114   0.0132  0.0296   
3  10:100003242_G_T       t       g  0.8800  0.0034   0.8756   0.8827  0.0107   
4  10:100003304_A_G       a       g  0.9636  0.0037   0.9615   0.9701  0.0066   
5  10:100003785_C_T       t       c  0.6432  0.0006   0.6425   0.6437 -0.0203   
6  10:100004360_A_G       a       g  0.1939  0.0090   0.1814   0.2004  0.0120   
7  10:100004441_C_G       c       g  0.6328  0.0029   0.6290   0.6350 -0.0196   
8  10:100004799_A_C       a       c  0.9863  0.0014   0.9857   0.9891 -0.0185   
9  10:100004906_A_C       a       c  0.4395  0.0081   0.4332   0.4500 -0.0265   

   StdErr   P-value Direction  HetISq  HetChiSq  HetDf  HetPVal        oldID  \
0  0.0056  0.000003        +

In [2]:
import requests

# Step 1: Download the panel file (if you haven't downloaded manually)
url = 'https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel'
panel_file = 'integrated_call_samples_v3.20130502.ALL.panel'

response = requests.get(url)
with open(panel_file, 'wb') as f:
    f.write(response.content)

print("Panel file downloaded!")

# Step 2: Read panel file and extract GBR sample IDs
panel_df = pd.read_csv(panel_file, sep='\t')
gbr_samples = panel_df[panel_df['pop'] == 'GBR']
gbr_sample_ids = gbr_samples['sample'].tolist()

# Save GBR sample IDs to a file
with open('GBR_sample_ids.txt', 'w') as f:
    for sample_id in gbr_sample_ids:
        f.write(sample_id + '\n')

print(f"Found {len(gbr_sample_ids)} GBR samples.")
print(gbr_sample_ids[:10])


Panel file downloaded!
Found 91 GBR samples.
['HG00096', 'HG00097', 'HG00099', 'HG00100', 'HG00101', 'HG00102', 'HG00103', 'HG00105', 'HG00106', 'HG00107']


In [20]:
import pandas as pd

# Load sample metadata (population info, sample IDs)
samples = pd.read_csv('integrated_call_samples_v3.20130502.ALL.panel', sep='\t')
print(samples.head())

# Columns usually: sample, population, super_population, gender


    sample  pop super_pop  gender  Unnamed: 4  Unnamed: 5
0  HG00096  GBR       EUR    male         NaN         NaN
1  HG00097  GBR       EUR  female         NaN         NaN
2  HG00099  GBR       EUR  female         NaN         NaN
3  HG00100  GBR       EUR  female         NaN         NaN
4  HG00101  GBR       EUR    male         NaN         NaN


In [None]:
import allel              # From scikit-allel: for VCF parsing and genotype analysis
import os                 # For file path and existence checks

# File paths
vcf_path = r"ALL.chr1.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes (1).vcf.gz"
samples_path = "integrated_call_samples_v3.20130502.ALL.panel"
output_file = "allele_freq_chr1.csv.gz"  # Output will be compressed CSV

print("VCF file exists:", os.path.exists(vcf_path))#Check VCF File Exists

# Load samples info and filter EUR samples
samples_df = pd.read_csv(samples_path, sep="\t")## Load sample panel
eur_samples = samples_df[samples_df['super_pop'] == 'EUR']['sample'].tolist()#Loads metadata and filters for European ancestry samples

# Prepare chunk parameters
chunk_size = 50_000 #Number of variants per chunk
chrom = '1' #Target chromosome

# Read all positions for chromosome 1 to identify variant locations.
vcf_all_pos = allel.read_vcf(vcf_path, fields=['variants/CHROM', 'variants/POS'])
positions = vcf_all_pos['variants/POS']
chroms = vcf_all_pos['variants/CHROM']

# Filter chromosome 1 positions and Create a mask for chromosome 1 and filters positions accordingly.
chr1_mask = chroms == chrom
chr1_positions = positions[chr1_mask]

# Remove old output file if exists
if os.path.exists(output_file):
    os.remove(output_file)

# Iterate and write chunks directly to CSV
start_idx = 0#Initializes loop variables
first_chunk = True
total_variants_chr1 = len(chr1_positions)#calculates total number of variants.

# Batch size for writing to disk to reduce I/O frequency
batch_size = 10  # Number of chunks to process before saving

while start_idx < total_variants_chr1:#Loops through all variants in chunks.
    end_idx = min(start_idx + chunk_size, total_variants_chr1)
    start_pos = chr1_positions[start_idx]#Calculates start/end positions and formats region string for VCF slicing.
    end_pos = chr1_positions[end_idx - 1]

    region_str = f"{chrom}:{start_pos}-{end_pos}"
    print(f"Processing region: {region_str}")

    #Read VCF Chunk
    chunk = allel.read_vcf(
        vcf_path,
        samples=eur_samples,
        region=region_str,
        fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT'],
        alt_number=1
    )#Loads genotype and variant data for the specified region and EUR samples.

    #Skip Empty Chunks
    if chunk is None or len(chunk['variants/POS']) == 0:
        start_idx = end_idx
        continue#Skips processing if no variants were found in the chunk.

    genotypes = allel.GenotypeArray(chunk['calldata/GT'])## Convert to GenotypeArray
    allele_counts = genotypes.count_alleles()## Count alleles per variant
    alt_freqs = allele_counts[:, 1] / allele_counts.sum(axis=1)## ALT allele frequency

    alts = chunk['variants/ALT']
    if alts.ndim == 2:
        alts = alts[:, 0]#If ALT alleles are stored as 2D arrays (e.g., multiple alleles), extract the first.

    #Create DataFrame for Chunk
    df_chunk = pd.DataFrame({
        'chr': chunk['variants/CHROM'],
        'pos': chunk['variants/POS'],
        'ref': chunk['variants/REF'],
        'alt': alts,
        'alt_freq_eur': alt_freqs
    })

    # Write chunks in batches
    if (start_idx // chunk_size) % batch_size == 0:
        # Save all chunks in batch to CSV
        df_chunk.to_csv(output_file, mode='a', header=first_chunk, index=False, compression='gzip')
        first_chunk = False

    start_idx = end_idx

print(f"✅ Finished saving to {output_file}")


In [None]:
df = pd.read_csv("allele_freq_chr1.csv.gz")

In [None]:
import pandas as pd
# Define the path to your gzipped scoring file
pgs_scoring_file = "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Genetic_Data/GEO_Accession_Genetic_Data/PGS000018.txt.gz"

#Load the file using pandas with robust settings
pgs_df = pd.read_csv(
    pgs_scoring_file, ## File path
    sep='\t', ## Tab-separated values
    compression='gzip', ## File is gzipped
    comment='#',         # Skip all comment lines starting with '#'
    engine='python', # More tolerant parser than default 'c'
    on_bad_lines='skip' # Skip malformed lines instead of raising errors
)

#Display the first few rows of the parsed DataFrame
print(pgs_df.head())
print(pgs_df.columns)


In [None]:
print("Genetic Data Columns:", genetic_df.columns.tolist())
print("PGS Columns:", pgs_df.columns.tolist())


In [39]:
import pandas as pd

# Assuming both files are already loaded as DataFrames
# Example:
# genetic_df = pd.read_csv("CAD_META.txt", sep="\t")
# pgs_df = pd.read_csv("PGS000018.txt.gz", sep="\t", compression="gzip", comment="#")

def merge_datasets(genetic_df, pgs_df, left_col="oldID", right_col="rsID", how="inner"):
    # Check columns exist
    if left_col not in genetic_df.columns:
        raise KeyError(f"❌ Column '{left_col}' not found in genetic data")
    if right_col not in pgs_df.columns:
        raise KeyError(f"❌ Column '{right_col}' not found in PGS scoring data")
    
    merged = pd.merge(genetic_df, pgs_df, left_on=left_col, right_on=right_col, how=how)
    
    print(f"✅ Merge complete: {merged.shape[0]} rows, {merged.shape[1]} columns")
    print("\n📄 Preview:")
    print(merged.head())
    
    return merged

# Example usage:
merged_df = merge_datasets(genetic_df, pd.read_csv(pgs_scoring_file, sep="\t", compression="gzip", comment="#"), "oldID", "rsID", "inner")


✅ Merge complete: 1744386 rows, 24 columns

📄 Preview:
         MarkerName Allele1 Allele2   Freq1  FreqSE  MinFreq  MaxFreq  Effect  \
0  10:100003304_A_G       a       g  0.9636  0.0037   0.9615   0.9701  0.0066   
1  10:100004799_A_C       a       c  0.9863  0.0014   0.9857   0.9891 -0.0185   
2   10:10000586_C_T       t       c  0.9864  0.0020   0.9856   0.9913  0.0313   
3  10:100009542_A_G       a       g  0.0253  0.0015   0.0225   0.0261 -0.0102   
4  10:100015153_A_G       a       g  0.0174  0.0004   0.0166   0.0176 -0.0353   

   StdErr  P-value  ... HetPVal        oldID  CHR         BP         rsID  \
0  0.0158   0.6787  ...  0.1433   rs72828461   10  100003304   rs72828461   
1  0.0267   0.4873  ...  0.2299   rs77264786   10  100004799   rs77264786   
2  0.0286   0.2737  ...  0.1131  rs190955300   10   10000586  rs190955300   
3  0.0203   0.6153  ...  0.8578   rs11598533   10  100009542   rs11598533   
4  0.0261   0.1773  ...  0.9068  rs141332000   10  100015153  rs141332000

In [None]:
import pandas as pd

def merge_with_allele_freq(merged_df, allele_freq_df, left_cols=('CHR', 'BP'), right_cols=('chr', 'pos'), how='inner'):
    # Ensure inputs are DataFrames
    if not isinstance(merged_df, pd.DataFrame):
        raise TypeError("❌ 'merged_df' must be a pandas DataFrame")
    if not isinstance(allele_freq_df, pd.DataFrame):
        raise TypeError("❌ 'allele_freq_df' must be a pandas DataFrame")
    
    # Check required columns exist
    for col in left_cols:
        if col not in merged_df.columns:
            raise KeyError(f"❌ Column '{col}' not found in merged_df")
    for col in right_cols:
        if col not in allele_freq_df.columns:
            raise KeyError(f"❌ Column '{col}' not found in allele_freq_df")
    
    # Harmonize chromosome format (remove 'chr' prefix if needed)
    merged_df[left_cols[0]] = merged_df[left_cols[0]].astype(str).str.replace('chr', '', case=False)
    allele_freq_df[right_cols[0]] = allele_freq_df[right_cols[0]].astype(str).str.replace('chr', '', case=False)
    
    # Ensure positions are integers
    merged_df[left_cols[1]] = merged_df[left_cols[1]].astype(int)
    allele_freq_df[right_cols[1]] = allele_freq_df[right_cols[1]].astype(int)
    
    merged_full = pd.merge(merged_df, allele_freq_df, left_on=left_cols, right_on=right_cols, how=how)
    
    print(f"✅ Merge complete: {merged_full.shape[0]} rows, {merged_full.shape[1]} columns")
    print("\n📄 Preview:")
    print(merged_full.head())
    
    return merged_full

# Example usage:
merged_full_df = merge_with_allele_freq(merged, allele_freq_df, ('CHR', 'BP'), ('chr', 'pos'), 'inner')


In [13]:
# Filter for genome-wide significant SNPs (p-value < 5e-8)
significance_snps = gwas_df[gwas_df['P-value'] < 5e-8]
# Keep only needed columns: MarkerName, Effect allele, Effect size, P-value, chromosome, position
filtered_gwas = sig_snps[['MarkerName', 'Allele1', 'Effect', 'P-value', 'CHR', 'BP']]
print(f"Number of significant SNPs: {filtered_gwas.shape[0]}")
filtered_gwas.head()

NameError: name 'gwas_df' is not defined