## This code generates the JSON file for phenotypes required by PheWeb

In [2]:
import sys
# Install openpyxl with the same interpreter that runs this kernel (sys.executable).
# Presumably needed by pd.read_excel for the .xlsx dictionary loaded later -- TODO confirm.
# --no-index stops pip from querying a package index, so only locally
# configured wheel locations are searched.
!{sys.executable} -m pip install openpyxl --no-index

Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2020/avx2, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/avx2, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2020/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic
Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/openpyxl-3.1.2+computecanada-py2.py3-none-any.whl
Processing /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic/et_xmlfile-1.1.0+computecanada-py3-none-any.whl
[0mInstalling collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile openpyxl


In [1]:
import os
import csv

# Directory that holds the association result files (*.txt.gz)
input_directory = '/home/mikekaz/scratch/CLSA_Mike/Regenie/FINAL/Regenie_nextflow/Binary/results/step2/summary/Pheweb_ready_summary_files'  # Change this to your actual directory

# Destination of the phenotype CSV skeleton created by this cell
csv_file_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv'  # Change this to where you want to save the CSV

# Suffix that marks an association file; it is stripped to obtain the phenocode
assoc_suffix = '.txt.gz'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the CSV reproducible from run to run.
txt_gz_files = [
    os.path.join(input_directory, f)
    for f in sorted(os.listdir(input_directory))
    if f.endswith(assoc_suffix)
]

# Columns of the CSV; only 'assoc_files' and 'phenocode' are filled here,
# the remaining columns are written empty.
columns = ['assoc_files', 'phenocode', 'phenostring', 'category', 'num_cases', 'num_controls', 'num_samples']

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=columns)
    writer.writeheader()
    for file_path in txt_gz_files:
        base_name = os.path.basename(file_path)

        # Strip only the trailing '.txt.gz'. A '.txt' occurring elsewhere in
        # the file name is part of the phenocode and must be kept.
        phenocode = base_name[:-len(assoc_suffix)]

        # Remove any leading underscores from the phenocode
        phenocode = phenocode.lstrip('_')

        writer.writerow({'assoc_files': file_path, 'phenocode': phenocode})

print(f"CSV file has been created at: {csv_file_path}")



CSV file has been created at: /lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv


In [2]:
import pandas as pd

# Phenotype dictionary workbook
excel_path = '2109005_McGill_DTaliun_Baseline_CoPv7-dictionary.xlsx'
df_excel = pd.read_excel(excel_path)

# Lookup table keyed by the "name" column; each value is the ("label", "class") pair.
# A name that appears more than once keeps the pair from its last row.
pheno_dict = {}
for _, entry in df_excel.iterrows():
    pheno_dict[entry['name']] = (entry['label'], entry['class'])


def update_row(row):
    """Set 'phenostring' and 'category' on one CSV row from pheno_dict.

    Both fields become None when the row's phenocode is not in the lookup table.
    """
    label, category = pheno_dict.get(row['phenocode'], (None, None))
    row['phenostring'] = label
    row['category'] = category
    return row


# The annotated table is written to this path
updated_csv_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv'  # Path for the updated CSV

# csv_file_path is the CSV produced by the preceding cell
df_csv = pd.read_csv(csv_file_path)

# Annotate row by row, then persist the result
df_updated = df_csv.apply(update_row, axis=1)
df_updated.to_csv(updated_csv_path, index=False)

print(f"Updated CSV file has been saved to: {updated_csv_path}")


Updated CSV file has been saved to: /lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv


In [3]:
import pandas as pd

# Load the phenotypes file (space-separated)
phenotypes_path = 'PHENOTYPES_FILE_Mar9.csv'
df_phenotypes = pd.read_csv(phenotypes_path, sep=' ')

# phenocode -> (num_cases, num_controls, num_samples)
phenocode_counts = {}

# Every column from the third one onward is treated as a phenocode
# (the first two are presumably sample identifiers -- TODO confirm).
for phenocode in df_phenotypes.columns[2:]:
    # Count the column once; value_counts() ignores missing values, and only
    # the values 1 (case) and 0 (control) contribute to the totals.
    value_counts = df_phenotypes[phenocode].value_counts()
    num_cases = int(value_counts.get(1, 0))
    num_controls = int(value_counts.get(0, 0))
    num_samples = num_cases + num_controls
    phenocode_counts[phenocode] = (num_cases, num_controls, num_samples)

# Now read the existing CSV, update it, and then write it back
final_csv_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv'  # Path for the final updated CSV

# csv_file_path is the CSV produced by the earlier cells
df_csv = pd.read_csv(csv_file_path)


def update_counts(row):
    """Set num_cases, num_controls and num_samples on one CSV row.

    The three fields become None when the row's phenocode has no entry in
    phenocode_counts.
    """
    phenocode = row['phenocode']
    if phenocode in phenocode_counts:
        row['num_cases'], row['num_controls'], row['num_samples'] = phenocode_counts[phenocode]
    else:
        row['num_cases'], row['num_controls'], row['num_samples'] = None, None, None  # Or use some default/fallback value
    return row


# Apply the update function to each row
df_final = df_csv.apply(update_counts, axis=1)

# A column that mixes integers with missing values is otherwise inferred as
# float and written as e.g. "123.0". The nullable Int64 dtype keeps the counts
# as integers and still writes missing entries as empty cells.
count_columns = ['num_cases', 'num_controls', 'num_samples']
df_final = df_final.astype({column: 'Int64' for column in count_columns})

# Write the final DataFrame back to the CSV
df_final.to_csv(final_csv_path, index=False)

print(f"Final CSV file has been saved to: {final_csv_path}")


Final CSV file has been saved to: /lustre06/project/6061810/mikekaz/CLSA/QC/ALL_binary.csv


## We need to add Type 1 and Type 2 diabetes to this file.

In [4]:
import os
import csv

# Directory that holds the diabetes association result files (*.txt.gz)
input_directory = '/home/mikekaz/scratch/CLSA_Mike/Regenie/FINAL/Regenie_nextflow/Binary/Diabetes_only/results/step2/summary'  # Change this to your actual directory

# Destination of the phenotype CSV skeleton created by this cell
csv_file_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv'  # Change this to where you want to save the CSV

# Suffix that marks an association file; it is stripped to obtain the phenocode
assoc_suffix = '.txt.gz'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the CSV reproducible from run to run.
txt_gz_files = [
    os.path.join(input_directory, f)
    for f in sorted(os.listdir(input_directory))
    if f.endswith(assoc_suffix)
]

# Columns of the CSV; only 'assoc_files' and 'phenocode' are filled here,
# the remaining columns are written empty.
columns = ['assoc_files', 'phenocode', 'phenostring', 'category', 'num_cases', 'num_controls', 'num_samples']

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=columns)
    writer.writeheader()
    for file_path in txt_gz_files:
        base_name = os.path.basename(file_path)

        # Strip only the trailing '.txt.gz'. A '.txt' occurring elsewhere in
        # the file name is part of the phenocode and must be kept.
        phenocode = base_name[:-len(assoc_suffix)]

        # Remove any leading underscores from the phenocode
        phenocode = phenocode.lstrip('_')

        writer.writerow({'assoc_files': file_path, 'phenocode': phenocode})

print(f"CSV file has been created at: {csv_file_path}")



CSV file has been created at: /lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv


In [5]:
import pandas as pd

# Phenotype dictionary workbook
excel_path = '2109005_McGill_DTaliun_Baseline_CoPv7-dictionary.xlsx'
df_excel = pd.read_excel(excel_path)

# Lookup table keyed by the "name" column; each value is the ("label", "class") pair.
# A name that appears more than once keeps the pair from its last row.
pheno_dict = {}
for _, entry in df_excel.iterrows():
    pheno_dict[entry['name']] = (entry['label'], entry['class'])


def update_row(row):
    """Set 'phenostring' and 'category' on one CSV row from pheno_dict.

    Both fields become None when the row's phenocode is not in the lookup table.
    """
    label, category = pheno_dict.get(row['phenocode'], (None, None))
    row['phenostring'] = label
    row['category'] = category
    return row


# The annotated table is written to this path
updated_csv_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv'  # Path for the updated CSV

# csv_file_path is the CSV produced by the preceding cell
df_csv = pd.read_csv(csv_file_path)

# Annotate row by row, then persist the result
df_updated = df_csv.apply(update_row, axis=1)
df_updated.to_csv(updated_csv_path, index=False)

print(f"Updated CSV file has been saved to: {updated_csv_path}")


Updated CSV file has been saved to: /lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv


In [6]:
import pandas as pd

# Load the phenotypes file (space-separated)
phenotypes_path = '/lustre06/project/6061810/mikekaz/CLSA/Regenie/old_files/data/Phe_DIA_Final.csv'
df_phenotypes = pd.read_csv(phenotypes_path, sep=' ')

# phenocode -> (num_cases, num_controls, num_samples)
phenocode_counts = {}

# Every column from the third one onward is treated as a phenocode
# (the first two are presumably sample identifiers -- TODO confirm).
for phenocode in df_phenotypes.columns[2:]:
    # Count the column once; value_counts() ignores missing values, and only
    # the values 1 (case) and 0 (control) contribute to the totals.
    value_counts = df_phenotypes[phenocode].value_counts()
    num_cases = int(value_counts.get(1, 0))
    num_controls = int(value_counts.get(0, 0))
    num_samples = num_cases + num_controls
    phenocode_counts[phenocode] = (num_cases, num_controls, num_samples)

# Now read the existing CSV, update it, and then write it back
final_csv_path = '/lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv'  # Path for the final updated CSV

# csv_file_path is the CSV produced by the earlier cells
df_csv = pd.read_csv(csv_file_path)


def update_counts(row):
    """Set num_cases, num_controls and num_samples on one CSV row.

    The three fields become None when the row's phenocode has no entry in
    phenocode_counts.
    """
    phenocode = row['phenocode']
    if phenocode in phenocode_counts:
        row['num_cases'], row['num_controls'], row['num_samples'] = phenocode_counts[phenocode]
    else:
        row['num_cases'], row['num_controls'], row['num_samples'] = None, None, None  # Or use some default/fallback value
    return row


# Apply the update function to each row
df_final = df_csv.apply(update_counts, axis=1)

# A column that mixes integers with missing values is otherwise inferred as
# float and written as e.g. "123.0". The nullable Int64 dtype keeps the counts
# as integers and still writes missing entries as empty cells.
count_columns = ['num_cases', 'num_controls', 'num_samples']
df_final = df_final.astype({column: 'Int64' for column in count_columns})

# Write the final DataFrame back to the CSV
df_final.to_csv(final_csv_path, index=False)

print(f"Final CSV file has been saved to: {final_csv_path}")


Final CSV file has been saved to: /lustre06/project/6061810/mikekaz/CLSA/QC/DIA_pheweb_input_data.csv


## Merging two files

In [7]:
import csv

# Path to the original CSV file and the file with new rows
original_csv_path = 'ALL_binary.csv'
new_rows_csv_path = 'DIA_pheweb_input_data.csv'

# Only this many data rows (those directly after the header) are taken from
# the new-rows file.
max_rows_to_append = 2

# Remember which rows the target already contains, so that re-running this
# cell does not append the same rows a second time.
try:
    with open(original_csv_path, mode='r', newline='') as original_file:
        existing_rows = {tuple(row) for row in csv.reader(original_file)}
except FileNotFoundError:
    # Nothing to compare against; the append below creates the file.
    existing_rows = set()

# Read the candidate rows: skip the header, then take at most
# max_rows_to_append rows.
with open(new_rows_csv_path, mode='r', newline='') as new_file:
    reader = csv.reader(new_file)
    next(reader, None)
    candidate_rows = [row for _, row in zip(range(max_rows_to_append), reader)]

# Append only the candidates that are not already present in the target
with open(original_csv_path, mode='a', newline='') as original_file:
    writer = csv.writer(original_file)
    for row in candidate_rows:
        row_key = tuple(row)
        if row_key in existing_rows:
            continue
        writer.writerow(row)
        existing_rows.add(row_key)


## Now that I have the file with all the information for each phenotype, I don't want to rebuild it every time the association files move to a new directory. Only the directory in the `assoc_files` column changes; the rest of the columns are constant. So below, we take an input directory and rewrite only `assoc_files`.

In [9]:
# import csv

# def update_csv_dir(csv_file, new_dir, output_file):
#     with open(csv_file, mode='r', newline='') as infile, open(output_file, mode='w', newline='') as outfile:
#         reader = csv.reader(infile)
#         writer = csv.writer(outfile)
        
#         # Read the header and write it unchanged
#         headers = next(reader)
#         writer.writerow(headers)
        
#         # Update the directory path in the first column for each row and write to the output file
#         for row in reader:
#             # Split the original path to get the filename
#             original_path = row[0]
#             filename = original_path.split('/')[-1]
            
#             # Construct the new path and update the row
#             new_path = f"{new_dir}/{filename}"
#             row[0] = new_path
            
#             # Write the updated row to the output file
#             writer.writerow(row)
            
#     print(f"Updated CSV has been saved to {output_file}")

# # Usage example
# original_csv = 'ALL_binary.csv'  # Replace with your CSV file path
# new_directory = '/home/mikekaz/scratch/CLSA_Mike/Regenie/FINAL/Regenie_nextflow/Binary/results/step2/summary/Pheweb_ready_summary_files'
# output_csv = 'ALL_binary.csv'  # Name/path of the output CSV file

# update_csv_dir(original_csv, new_directory, output_csv)


StopIteration: 

In [None]:
import csv
import os

def update_csv_dir(csv_file, new_dir, output_file, encoding='utf-8'):
    """Rewrite the directory part of the first column of a CSV file.

    The first row is copied unchanged as the header. In every following row
    the first field is replaced by ``new_dir`` joined with the file name taken
    from the original path; all other fields are kept. Blank rows are dropped.

    The input is read completely before the output is opened, so ``csv_file``
    and ``output_file`` may be the same path (in-place update).

    Args:
        csv_file: Path of the CSV file to read.
        new_dir: Directory that replaces the directory of each first-column path.
        output_file: Path of the CSV file to write.
        encoding: Text encoding used for both files.

    Returns:
        None. Progress and problems (missing or empty input, I/O, decoding or
        CSV parsing errors) are reported with ``print``.
    """
    # Check if the file exists and is not empty
    if not os.path.exists(csv_file):
        print(f"The file {csv_file} does not exist.")
        return
    if os.path.getsize(csv_file) == 0:
        print(f"The file {csv_file} is empty.")
        return

    try:
        # Opening output_file in 'w' mode truncates it immediately. Reading the
        # input first keeps the data safe when both paths name the same file.
        with open(csv_file, mode='r', newline='', encoding=encoding) as infile:
            rows = list(csv.reader(infile))

        if not rows:
            print(f"The file {csv_file} contains no header row.")
            return

        headers, data_rows = rows[0], rows[1:]

        # Avoid a doubled slash when new_dir already ends with one
        target_dir = str(new_dir).rstrip('/')

        updated_rows = []
        for row in data_rows:
            if not row:
                # csv.reader yields [] for a blank line; there is no path to rewrite
                continue
            filename = os.path.basename(row[0])
            updated_rows.append([f"{target_dir}/{filename}"] + row[1:])

        with open(output_file, mode='w', newline='', encoding=encoding) as outfile:
            writer = csv.writer(outfile)
            writer.writerow(headers)
            writer.writerows(updated_rows)

        print(f"Updated CSV has been saved to {output_file}")
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"An error occurred while processing the file: {e}")

# Point every assoc_files entry of the phenotype CSV at the directory below.
# The result goes to a separate file so the source CSV is left as it is.
original_csv = 'ALL_binary.csv'  # Replace with your actual CSV file path
output_csv = 'updated_ALL_binary.csv'  # Consider changing the name to prevent overwriting
new_directory = '/home/mikekaz/scratch/CLSA_Mike/Regenie/FINAL/Regenie_nextflow/Binary/results/step2/summary/Pheweb_ready_summary_files'

update_csv_dir(
    csv_file=original_csv,
    new_dir=new_directory,
    output_file=output_csv,
)


In [23]:
#merging Justin's file
import csv

# Path to the original CSV file and the file with new rows
original_csv_path = 'Wave1_CLSA_Pheweb_data.csv'
new_rows_csv_path = 'CLSA_Pheweb_continuous_data.csv'

# Remember which rows the target already contains, so that re-running this
# cell does not append the same rows a second time.
try:
    with open(original_csv_path, mode='r', newline='') as original_file:
        existing_rows = {tuple(row) for row in csv.reader(original_file)}
except FileNotFoundError:
    # Nothing to compare against; the append below creates the file.
    existing_rows = set()

# Read every data row of the new-rows file (the header row is skipped).
# NOTE(review): the header is not compared with the target's header, so both
# files are assumed to share the same column layout -- confirm.
with open(new_rows_csv_path, mode='r', newline='') as new_file:
    reader = csv.reader(new_file)
    next(reader, None)
    candidate_rows = list(reader)

# Append only the rows that are not already present in the target
with open(original_csv_path, mode='a', newline='') as original_file:
    writer = csv.writer(original_file)
    for row in candidate_rows:
        row_key = tuple(row)
        if row_key in existing_rows:
            continue
        writer.writerow(row)
        existing_rows.add(row_key)


## Convert to JSON

In [10]:
import csv
import json

csv_file_path = 'updated_ALL_binary.csv'
json_file_path = 'updated_ALL_binary.json'

# Columns that hold sample counts. They are exported as JSON integers and are
# left out of an entry when the CSV cell is blank.
COUNT_FIELDS = ('num_cases', 'num_controls', 'num_samples')


def format_pheno_row(row):
    """Convert one CSV row (a dict of strings) into an entry of the JSON list.

    - 'assoc_files' becomes a one-element list.
    - A count field is omitted when it is blank or missing, converted to int
      when it holds a whole number (including float spellings such as
      '123.0'), and otherwise copied unchanged.
    - Every other field is copied unchanged.
    """
    formatted_row = {}
    for key, value in row.items():
        if key == 'assoc_files':
            formatted_row[key] = [value]
        elif key in COUNT_FIELDS:
            # DictReader supplies None for fields missing from a short row
            text = '' if value is None else value.strip()
            if text == '':
                continue
            if text.isdigit():
                formatted_row[key] = int(text)
                continue
            try:
                number = float(text)
            except ValueError:
                formatted_row[key] = value
                continue
            # Keep anything that is not a whole number exactly as it was read
            formatted_row[key] = int(number) if number.is_integer() else value
        else:
            formatted_row[key] = value
    return formatted_row


# Initialize an empty list to hold the formatted data
data_to_export = []

try:
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file:
        # Reading CSV file using comma as delimiter
        csv_reader = csv.DictReader(csv_file, delimiter=',')
        data_to_export = [format_pheno_row(row) for row in csv_reader]

    # Write the data to a JSON file
    with open(json_file_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_to_export, json_file, indent=2)

    print("JSON file was created successfully.")
except (OSError, UnicodeError, csv.Error) as e:
    # Only file, decoding and CSV parsing problems are reported here;
    # programming errors are left to surface with a traceback.
    print(f"An error occurred: {e}")


JSON file was created successfully.
