In [None]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

To create a new conda environment from the provided yml file (note: I adjusted it to work on Windows), run:
```bash
conda env create -f project1_base.yml -n ml4g_project1
```

Note: We may want to create a fresh environment file instead, since this one hasn't been updated in 4 years.

In [None]:
import os
import zipfile
import shutil

# Define paths
source_dir = 'ML4G_Project_1_Data'  # Path to directory with zip files
target_dir = 'data'  # Target directory for extracted files

# Cell lines; each one gets its own sub-folder inside target_dir
cell_lines = ['X1', 'X2', 'X3']

# Keyword searched for in the lower-cased zip name -> assay label used in output filenames.
# Insertion order is the lookup order.
ASSAY_LABELS = {
    'dnase': 'DNase',
    'h3k27ac': 'H3K27ac',
    'h3k27me3': 'H3K27me3',
    'h3k4me1': 'H3K4me1',
    'h3k4me3': 'H3K4me3',
}


def _assay_from_zip_name(zip_name):
    """Return the assay label whose keyword occurs in `zip_name`, or None if none matches."""
    lowered = zip_name.lower()
    return next((label for key, label in ASSAY_LABELS.items() if key in lowered), None)


def _signal_extension(file_name):
    """Return the normalised extension ('.bw' or '.bed') of a signal file, or None otherwise."""
    lowered = file_name.lower()
    if lowered.endswith(('.bw', '.bigwig')):
        return '.bw'
    if lowered.endswith('.bed'):
        return '.bed'
    return None


def _organize_extracted_files(extract_dir, zip_name, target_dir, cell_lines):
    """Copy every file below `extract_dir` to its final location inside `target_dir`.

    CAGE-train content and any '.tsv' file is copied to `target_dir/CAGE-train/`,
    keeping its path relative to `extract_dir`. Every other file is copied to
    `target_dir/{cell_line}/{assay}_{cell_line}{ext}` when a cell line, an assay and a
    supported extension can all be determined; otherwise it is reported and skipped.
    """
    # The assay only depends on the archive name, so resolve it once per archive.
    data_type = _assay_from_zip_name(zip_name)

    for root, _dirs, files in os.walk(extract_dir):
        in_cage_tree = (
            'CAGE-train' in zip_name
            or 'CAGE-train' in os.path.relpath(root, extract_dir)
        )
        for file in files:
            source_file = os.path.join(root, file)

            # Handle CAGE-train files separately, preserving their relative path
            if in_cage_tree or file.endswith('.tsv'):
                rel_path = os.path.relpath(source_file, extract_dir)
                target_file = os.path.join(target_dir, 'CAGE-train', rel_path)
                os.makedirs(os.path.dirname(target_file), exist_ok=True)
                shutil.copy2(source_file, target_file)
                print(f"  Copied {file} to CAGE-train/")
                continue

            # The first cell line whose name occurs in the file name wins
            cell_line = next((name for name in cell_lines if name in file), None)
            ext = _signal_extension(file)
            if cell_line is None or ext is None or data_type is None:
                print(f"  Skipped {file} (no matching cell line, assay or extension)")
                continue

            # Create new filename: {data_type}_{cell_line}{ext}
            new_filename = f"{data_type}_{cell_line}{ext}"
            target_file = os.path.join(target_dir, cell_line, new_filename)
            shutil.copy2(source_file, target_file)
            print(f"  Copied {file} -> {cell_line}/{new_filename}")


def organize_zip_archives(source_dir, target_dir, cell_lines):
    """Extract every '.zip' in `source_dir` and sort its content into `target_dir`.

    Parameters
    ----------
    source_dir : str
        Directory containing the zip archives.
    target_dir : str
        Output directory; created if missing, together with one folder per cell line.
    cell_lines : list of str
        Cell line identifiers looked for in the extracted file names.

    Returns
    -------
    list of str
        Names of the processed zip files, in (sorted) processing order.

    Raises
    ------
    FileNotFoundError
        If `source_dir` does not exist.
    zipfile.BadZipFile
        If an archive cannot be opened; its temporary directory is still removed.
    """
    os.makedirs(target_dir, exist_ok=True)
    for cell_line in cell_lines:
        os.makedirs(os.path.join(target_dir, cell_line), exist_ok=True)

    # Sorted so that the processing order does not depend on the file system
    zip_files = sorted(f for f in os.listdir(source_dir) if f.endswith('.zip'))

    for zip_file in zip_files:
        zip_path = os.path.join(source_dir, zip_file)
        zip_name = os.path.splitext(zip_file)[0]
        print(f"Processing {zip_file}...")

        # Extract to a dedicated temporary directory for this zip file. Leftovers of an
        # interrupted earlier run are removed first so they cannot be picked up again.
        temp_extract_dir = os.path.join(target_dir, f'temp_{zip_name}')
        shutil.rmtree(temp_extract_dir, ignore_errors=True)
        os.makedirs(temp_extract_dir, exist_ok=True)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_extract_dir)
            _organize_extracted_files(temp_extract_dir, zip_name, target_dir, cell_lines)
        finally:
            # Clean up even when extraction fails or the cell is interrupted
            shutil.rmtree(temp_extract_dir, ignore_errors=True)
            print(f"  Cleaned up temp_{zip_name}")

    return zip_files


if os.path.isdir(source_dir):
    zip_files = organize_zip_archives(source_dir, target_dir, cell_lines)
    cell_line_set = ','.join(cell_lines)
    print(f"\nAll {len(zip_files)} zip files have been extracted and organized in {target_dir}")
    print(f"Structure: {target_dir}/{{{cell_line_set}}}/{{DataType}}_{{CellLine}}.{{bw,bed}}")
else:
    # Lets the notebook run top to bottom when the data was organised in an earlier
    # session and the original archives are no longer around.
    zip_files = []
    print(f"Source directory '{source_dir}' not found - skipping extraction.")

Extracting CAGE-train.zip...
Extracting DNase-bed.zip...
Extracting DNase-bigwig.zip...
Extracting H3K27ac-bed.zip...
Extracting H3K27ac-bed.zip...
Extracting H3K27ac-bigwig.zip...
Extracting H3K27ac-bigwig.zip...


KeyboardInterrupt: 

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [None]:
# TODO: 
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE: 
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you. 
# However, you can resplit the data in any way you want.

# Forward slashes resolve on Windows, Linux and macOS alike, whereas "\\"-separated
# paths only resolve on Windows. The test-gene table lives inside the CAGE-train folder.
path_data = "data/CAGE-train/CAGE-train"
path_test = f"{path_data}/X3_test_info.tsv"
test_genes = pd.read_csv(path_test, sep='\t')
# ---------------------------INSERT CODE HERE---------------------------




# ---------------------------------------------------------------------- 

## Work Package 1.2 - Model Building

In [None]:
# TODO: 
# Select the best model to predict gene expression from the obtained features in WP 1.1.

# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.
pred: np.ndarray
pred = np.array([])  # TODO
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constraints
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
# A 0-d array has no shape[0], and an (n, 1) array would pass the length check below
# while pred.tolist() would yield one-element lists instead of scalars; require a
# flat vector up front.
assert pred.ndim == 1, 'Prediction array must be one-dimensional'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Write the predictions as a CSV packed inside a ZIP archive.
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = "{}/{}".format(save_dir, zip_name)
# pandas writes the CSV under `file_name` inside the zip archive at `save_path`
compression_options = {"method": "zip", "archive_name": file_name}

# Attach the predictions as a new column, then export only the two submission columns
test_genes['gex_predicted'] = pred.tolist()
test_genes.loc[:, ['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)