In [None]:
import requests
from bs4 import BeautifulSoup
import os
import shutil

def clear_directory(local_directory):
    """Delete ``local_directory`` and everything beneath it, if it exists.

    The directory itself is removed, not only its contents. A confirmation
    message is printed whether or not anything was actually deleted.
    """
    already_present = os.path.exists(local_directory)
    if already_present:
        # rmtree removes the whole tree rooted at local_directory.
        shutil.rmtree(local_directory)
    print(f"Cleared previous downloads in {local_directory}")

def list_files(url, timeout=30):
    """Return the URLs of all ``.txt`` files linked from the page at ``url``.

    Args:
        url: Address of an HTML page (for example a directory index) whose
            ``<a>`` tags are scanned.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list with one entry per anchor whose ``href`` ends in ``.txt``, in
        document order, each resolved against ``url``. An empty list is
        returned (after printing a message) when the response status is not
        200.
    """
    from urllib.parse import urljoin

    # Without a timeout an unresponsive server would block the run forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to access {url}")
        return []

    # urljoin replaces the last path segment of the base unless the base ends
    # with '/', so normalise to exactly one trailing slash first.
    base = url.rstrip('/') + '/'
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for node in soup.find_all('a'):
        href = node.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # failing on None.endswith.
        if href and href.endswith('.txt'):
            # urljoin also handles absolute and root-relative hrefs correctly.
            links.append(urljoin(base, href))
    return links

def download_file(url, local_directory, timeout=30):
    """Download ``url`` and save it inside ``local_directory``.

    The local file name is the text after the last ``/`` in ``url``. The
    target directory is created if it does not exist yet. The outcome is
    reported with a printed message; nothing is returned.

    Args:
        url: Address of the file to fetch.
        local_directory: Directory in which the file is stored.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_directory, exist_ok=True)
    local_filename = os.path.join(local_directory, url.split('/')[-1])

    # Without a timeout an unresponsive server would block the run forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download {url}")
        return

    # Write raw bytes so the payload is stored exactly as received.
    with open(local_filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {local_filename}")

# Root URL under which the subdirectories listed below are published
base_url = 'https://pages.stat.wisc.edu/~sshen82/bandnorm/Ramani2017'

# Names of the remote subdirectories to mirror (maintained by hand)
subdirectories = ['GM12878', 'HAP1', 'Hela', 'K562']

# Mirror each subdirectory into a same-named folder under the working directory
for subdir in subdirectories:
    # Wipe any earlier download so files from a previous run never linger
    local_directory = os.path.join(os.getcwd(), subdir)
    clear_directory(local_directory)

    # Find the remote .txt files, then fetch them one at a time
    url = '/'.join((base_url, subdir))
    files = list_files(url)
    for file_url in files:
        download_file(file_url, local_directory)

In [None]:
import pandas as pd
# Load one of the downloaded files and preview its first rows.
# It is read as tab-separated text with no header row, so the column names
# are supplied explicitly.
test_data = pd.read_csv(
    "GM12878/ml3_AAGCGACC-ACCTCTTG.txt",
    sep="\t",
    header=None,
    names=['Chromosome1', 'Start1', 'Chromosome2', 'Start2', 'InteractionCount'],
)
test_data.head()