In [1]:
# Downloads the numbered .tgz tarballs from the TESS transients website into a local directory

import os
import requests

def download_tarball(url, save_path, chunk_size=1024 * 1024, timeout=60):
    """Download ``url`` and store the response body at ``save_path``.

    The body is streamed to disk in chunks instead of being held in memory,
    so large archives do not have to fit in RAM. An HTTP error status raises
    instead of being written to disk as if it were a tarball. Data is first
    written to ``save_path + ".part"`` and only moved to ``save_path`` once
    the transfer has finished, so an interrupted download never leaves a
    truncated file under the final name.

    Args:
        url: Address of the file to download.
        save_path: Local path the downloaded file is written to.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other failure reported by requests.
        OSError: The file could not be written or moved into place.
    """
    partial_path = save_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly on 404/500 rather than saving the error page.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # Only publish the file under its final name once it is complete.
        os.replace(partial_path, save_path)
    finally:
        # After a successful replace the partial file is gone; otherwise
        # remove whatever was written before the failure.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def main(start_num=67, end_num=77,
         local_directory=r"C:\Users\eitan\code repos\data\tarballs"):
    """Download the tarballs numbered ``start_num``..``end_num`` (inclusive).

    Each archive is fetched from the ``lc_bulk`` folder next to ``base_url``
    and saved as ``sNN.tgz`` inside ``local_directory``. A failed download is
    reported and the loop moves on to the next archive, so one bad archive
    does not abort the whole batch.

    Args:
        start_num: First tarball number to download.
        end_num: Last tarball number to download (inclusive).
        local_directory: Folder the tarballs are saved to; created if missing.
    """
    # Base URL where the tarballs are located
    base_url = "https://tess.mit.edu/public/tesstransients/pages/"

    # Create the directory if it does not exist (no exists/makedirs race)
    os.makedirs(local_directory, exist_ok=True)

    failed = []
    # Download each tarball file
    for i in range(start_num, end_num + 1):
        # Pad the number with leading zeros if necessary (e.g., s01, s02, ..., s66)
        tarball_num = str(i).zfill(2)
        # Construct the relative URL for the current tarball
        relative_url = f"../lc_bulk/s{tarball_num}.tgz"
        # Construct the full URL by joining the base URL and the relative URL
        full_url = base_url + relative_url
        # Extract the filename from the relative URL
        filename = os.path.basename(relative_url)
        # Construct the local save path
        local_save_path = os.path.join(local_directory, filename)
        # Download the tarball
        print(f"Downloading {filename} from {full_url}...")
        try:
            download_tarball(full_url, local_save_path)
        except (requests.RequestException, OSError) as e:
            # Report the failure and continue with the remaining tarballs.
            print(f"Failed to download {filename}:", e)
            failed.append(filename)
            continue
        print(f"Downloaded {filename} to {local_save_path}")

    if failed:
        print(f"{len(failed)} download(s) failed: {', '.join(failed)}")

if __name__ == "__main__":
    main()


Downloading s67.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s67.tgz...
Downloaded s67.tgz to C:\Users\eitan\code repos\data\tarballs\s67.tgz
Downloading s68.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s68.tgz...
Downloaded s68.tgz to C:\Users\eitan\code repos\data\tarballs\s68.tgz
Downloading s69.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s69.tgz...
Downloaded s69.tgz to C:\Users\eitan\code repos\data\tarballs\s69.tgz
Downloading s70.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s70.tgz...
Downloaded s70.tgz to C:\Users\eitan\code repos\data\tarballs\s70.tgz
Downloading s71.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s71.tgz...
Downloaded s71.tgz to C:\Users\eitan\code repos\data\tarballs\s71.tgz
Downloading s72.tgz from https://tess.mit.edu/public/tesstransients/pages/../lc_bulk/s72.tgz...
Downloaded s72.tgz to C:\Users\eitan\code repos\data\tarballs\s72.tgz
Down

In [44]:
# Extracts the downloaded tarballs into a local directory, deleting the .png files and giving the remaining files a .txt extension

import os
import tarfile

def _is_within_directory(directory, target):
    """Return True if ``target`` resolves to ``directory`` or a path beneath it."""
    directory = os.path.realpath(directory)
    target = os.path.realpath(target)
    return target == directory or target.startswith(directory + os.sep)


def extract_tarball(tgz_file, extraction_dir, delete_png=False, add_txt_extension=False):
    """Extract a gzip-compressed tarball into ``extraction_dir``.

    Args:
        tgz_file: Path of the ``.tgz`` archive to extract.
        extraction_dir: Directory the archive members are extracted into.
        delete_png: When True, members whose name ends in ``.png`` are not
            kept: they are skipped during extraction and a copy already on
            disk at that location is removed.
        add_txt_extension: When True, every extracted regular file whose name
            does not end in ``.png`` is renamed so that it ends in ``.txt``
            (an existing extension is replaced, as ``os.path.splitext`` drops
            it). Directory members are never renamed. A file is left as-is
            when the ``.txt`` name already exists.

    Members whose path would land outside ``extraction_dir`` are skipped with
    a message. Where the running Python provides tarfile extraction filters,
    the ``'data'`` filter is applied as well. Archive errors are reported with
    ``print`` instead of being raised.
    """
    try:
        with tarfile.open(tgz_file, 'r:gz') as tar:
            members = []
            for member in tar.getmembers():
                member_path = os.path.join(extraction_dir, member.name)
                if not _is_within_directory(extraction_dir, member_path):
                    print(f"Skipping unsafe member {member.name} in {os.path.basename(tgz_file)}")
                    continue
                if delete_png and member.name.endswith('.png') and not member.isdir():
                    # Do not extract unwanted images; also drop a stale copy
                    # left behind by an earlier extraction.
                    if os.path.isfile(member_path):
                        os.remove(member_path)
                    continue
                members.append(member)

            extract_kwargs = {}
            if hasattr(tarfile, 'data_filter'):
                # Only available on Pythons that ship extraction filters.
                extract_kwargs['filter'] = 'data'
            tar.extractall(path=extraction_dir, members=members, **extract_kwargs)

            if add_txt_extension:
                # Add .txt extension to each kept file
                for member in members:
                    # Renaming a directory member would break the paths of
                    # every file below it, so only regular files qualify.
                    if not member.isfile() or member.name.endswith('.png'):
                        continue
                    new_name = os.path.splitext(member.name)[0] + '.txt'
                    existing_path = os.path.join(extraction_dir, member.name)
                    new_path = os.path.join(extraction_dir, new_name)
                    if os.path.isfile(existing_path) and not os.path.exists(new_path):
                        os.rename(existing_path, new_path)
        print(f"Extracted {os.path.basename(tgz_file)} to {extraction_dir}")
    except tarfile.TarError as e:
        # TarError is the base class of ReadError and of the filter errors.
        print(f"Error extracting {os.path.basename(tgz_file)}:", e)

def is_extracted(tgz_file, extraction_dir):
    """Return True if the data files of ``tgz_file`` are already on disk.

    Only regular-file members whose name does not end in ``.png`` are
    checked. A member counts as present when a file exists under
    ``extraction_dir`` at any of these locations:

    * its path inside the archive,
    * that path with the extension replaced by ``.txt``,
    * ``<top-level folder>/<basename>.txt``, i.e. the ``.txt`` file moved
      directly into the member's top-level folder.

    An archive without such members yields True. An archive that cannot be
    read as a tarball yields False.
    """
    try:
        with tarfile.open(tgz_file, 'r:gz') as tar:
            file_names = [
                member.name
                for member in tar.getmembers()
                if member.isfile() and not member.name.endswith('.png')
            ]
    except tarfile.TarError:
        # Unreadable archive: report "not extracted" instead of crashing here.
        return False

    for name in file_names:
        renamed = os.path.splitext(name)[0] + '.txt'
        candidates = [name, renamed]
        if '/' in name:
            # Tar member names use "/" regardless of platform.
            top_level = name.split('/', 1)[0]
            candidates.append(os.path.join(top_level, os.path.basename(renamed)))
        if not any(os.path.isfile(os.path.join(extraction_dir, candidate))
                   for candidate in candidates):
            return False
    return True

def main():
    """Extract every .tgz archive in the tarball folder that is not extracted yet."""
    # Folder holding the downloaded .tgz archives
    tarball_dir = r"C:\Users\eitan\code repos\data\tarballs"
    # Destination folder for the extracted contents
    extraction_dir = r"C:\Users\eitan\code repos\data\extracted tarballs"

    # Make sure the destination exists before anything is written into it
    if not os.path.exists(extraction_dir):
        os.makedirs(extraction_dir)

    for tgz_file in os.listdir(tarball_dir):
        # Anything that is not a .tgz archive is ignored
        if not tgz_file.endswith('.tgz'):
            continue

        archive_path = os.path.join(tarball_dir, tgz_file)

        # Skip archives whose contents are already present
        if is_extracted(archive_path, extraction_dir):
            print(f"{os.path.basename(tgz_file)} is already extracted in {extraction_dir}")
            continue

        # Extract, dropping the .png files and renaming the rest to .txt
        extract_tarball(archive_path, extraction_dir, delete_png=True, add_txt_extension=True)

if __name__ == "__main__":
    main()


Extracted s01.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s02.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s03.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s04.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s05.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s06.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s07.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s08.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s09.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s10.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s11.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s12.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s13.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extracted s14.tgz to C:\Users\eitan\code repos\data\extracted tarballs
Extrac

In [45]:
# Flattens each sector folder: moves every .txt file up into its sector directory, then deletes the subdirectories

import os
import shutil

def move_txt_files(source_dir, target_dir):
    """Move every ``.txt`` file found anywhere under ``source_dir`` into ``target_dir``.

    Files keep their base name; the directory structure below ``source_dir``
    is not reproduced in ``target_dir``.
    """
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        # Lazily pick out the .txt files of the directory being visited
        txt_names = (name for name in filenames if name.endswith('.txt'))
        for name in txt_names:
            shutil.move(os.path.join(current_dir, name), os.path.join(target_dir, name))

def delete_subdirectories(directory):
    """Delete every subdirectory of ``directory``, keeping the files directly in it.

    Removing the immediate child directories recursively also removes
    everything nested below them, so no walk over the whole tree is needed.
    Symbolic links are left untouched, since ``shutil.rmtree`` refuses to
    operate on them. Nothing happens when ``directory`` is not an existing
    directory.
    """
    if not os.path.isdir(directory):
        return

    # Collect the paths first so nothing is deleted while the scan is open.
    with os.scandir(directory) as entries:
        subdirectories = [
            entry.path for entry in entries
            if entry.is_dir(follow_symlinks=False)
        ]

    for folder_path in subdirectories:
        # Remove the subdirectory together with all of its contents
        shutil.rmtree(folder_path)

def organize_txt_files(extraction_dir):
    """Flatten every sector folder found directly inside ``extraction_dir``.

    First, for each sector folder, all ``.txt`` files below it are moved into
    the sector folder itself. Once that is done for every sector, the
    subdirectories of each sector folder are deleted.
    """
    # Full paths of the directories sitting directly in the extraction folder
    sector_dirs = []
    for entry in os.listdir(extraction_dir):
        candidate = os.path.join(extraction_dir, entry)
        if os.path.isdir(candidate):
            sector_dirs.append(candidate)

    # Phase 1: pull the .txt files up into their sector folder
    for sector_dir in sector_dirs:
        move_txt_files(sector_dir, sector_dir)

    # Phase 2: drop the subdirectories that are left over
    for sector_dir in sector_dirs:
        delete_subdirectories(sector_dir)

def main():
    """Flatten the sector folders of the extracted tarballs and report completion."""
    # Folder that holds one directory per extracted sector
    sectors_root = r"C:\Users\eitan\code repos\data\extracted tarballs"

    # Move the .txt files up into each sector folder and drop the subfolders
    organize_txt_files(sectors_root)

    print(".txt files moved back to their original sector folders. subdirectories deleted.")

if __name__ == "__main__":
    main()




.txt files moved back to their original sector folders. Subdirectories deleted.
