# Data Processing for Altered Dataset (Lower Thresholds)

In this notebook, we reproduce the dataset used in the paper Ghaderi et al. (2025), using different thresholds (see section 2 of the report). The purpose of this notebook is to organise the dataset. The dataset is Galaxy Zoo 2 [Hart et al. (2016), Willett et al. (2013)]. We do this only for galaxies (no stars and no artifacts). We use the following thresholds, as explained in section 2 of the report:

Whether the object is a galaxy or not:
+ t01_smooth_or_features_a03_star_or_artifact_fraction < 0.5

Whether the object presents the characteristics of a spiral galaxy:

+ 0.90 <= t01_smooth_or_features_a02_features_or_disk_fraction <= 0.95
+ 0.90 <= t02_edgeon_a05_no_fraction <= 0.95
+ 0.90 <= t04_spiral_a08_spiral_fraction <= 0.95

Whether the object presents the characteristics of an elliptical galaxy:

+ 0.85 <= t01_smooth_or_features_a01_smooth_fraction <= 0.90
+ 0.85 <= t07_rounded_a16_completely_round_fraction <= 0.90
+ 0.85 <= t06_odd_a15_no_fraction <= 0.90

Whether the object presents the characteristics of an "odd" galaxy (irregular, ring, lens, disturbed, merger, dust lane, and other galactic features [Ghaderi et al. 2025]):

+ 0.85 <= t06_odd_a14_yes_fraction <= 0.90

In [None]:
# pandas is needed by every data cell in this notebook; import it before the
# first use so the notebook survives "Restart Kernel -> Run All".
import pandas as pd

# Location of the Galaxy Zoo 2 vote-fraction catalogue (gzip-compressed CSV).
# NOTE(review): the /content/drive prefix presumably requires Google Drive to be
# mounted in Colab before this cell runs - confirm.
file_path_columns = "/content/drive/Shared drives/DLP Project/Project/GZ2_thresholds/gz2_hart16.csv.gz"

# read_csv infers gzip compression from the ".gz" suffix (compression="infer"
# is the default), so no explicit compression argument is required here.
data_columns = pd.read_csv(file_path_columns)

# Uncomment to preview the first rows of the catalogue.
#data_columns.head()

In [None]:
# The catalogue has already been parsed into `data_columns` by the loading cell
# above; reuse it instead of decompressing and parsing the same gzip file again.
# Nothing below mutates the frame, so sharing the object is safe.
df_columns = data_columns

# Inclusive (low, high) vote-fraction bands for each class, as listed in the
# introduction of this notebook.
STAR_OR_ARTIFACT_MAX = 0.5
SPIRAL_BAND = (0.90, 0.95)
ELLIPTICAL_BAND = (0.85, 0.90)
ODD_BAND = (0.85, 0.90)

SPIRAL_COLUMNS = [
    't01_smooth_or_features_a02_features_or_disk_fraction',
    't02_edgeon_a05_no_fraction',
    't04_spiral_a08_spiral_fraction',
]
ELLIPTICAL_COLUMNS = [
    't01_smooth_or_features_a01_smooth_fraction',
    't07_rounded_a16_completely_round_fraction',
    't06_odd_a15_no_fraction',
]
ODD_COLUMNS = ['t06_odd_a14_yes_fraction']


def select_in_band(frame, columns, band):
    """Return the rows of ``frame`` whose value lies in ``band`` for every column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    columns : list of str
        Non-empty list of column names that must all satisfy the band.
    band : tuple of (float, float)
        ``(low, high)`` limits; both ends are inclusive.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` (original index preserved) for which
        ``low <= frame[col] <= high`` holds for every ``col`` in ``columns``.
        Rows with a missing value in any of the columns are excluded.
    """
    low, high = band
    # Series.between is inclusive on both ends by default, matching the
    # explicit ">= low" and "<= high" comparisons it replaces.
    mask = frame[columns[0]].between(low, high)
    for column in columns[1:]:
        mask = mask & frame[column].between(low, high)
    return frame[mask]


# Removing non-galaxy samples (strict inequality: a fraction of exactly 0.5 is dropped)
df_galaxies = df_columns[
    df_columns['t01_smooth_or_features_a03_star_or_artifact_fraction'] < STAR_OR_ARTIFACT_MAX
]

# Apply the per-class thresholds. The three selections are made independently
# from df_galaxies, so a row may appear in more than one of them.
spiral_galaxies = select_in_band(df_galaxies, SPIRAL_COLUMNS, SPIRAL_BAND)
elliptical_galaxies = select_in_band(df_galaxies, ELLIPTICAL_COLUMNS, ELLIPTICAL_BAND)
odd_objects = select_in_band(df_galaxies, ODD_COLUMNS, ODD_BAND)

print(f"Total Spiral Galaxies: {len(spiral_galaxies)}")
print(f"Total Elliptical Galaxies: {len(elliptical_galaxies)}")
print(f"Total Odd Objects: {len(odd_objects)}")

In [None]:
# File name mapping
#
# Loads the CSV that the (currently disabled) extraction code in the next code
# cell joins against the catalogue: that code matches the catalogue's
# 'dr7objid' to this table's 'objid' and uses 'asset_id' to build the image
# file name. The presence of those two columns is not checked here.

file_path_map = "/content/drive/Shared drives/DLP Project/Project/GZ2_thresholds/gz2_filename_mapping.csv"
# Plain (uncompressed) CSV read with pandas defaults.
data_map = pd.read_csv(file_path_map)
# Uncomment to preview the first rows of the mapping table.
#data_map.head()

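Extraction with these thresholds has already been completed, so the extraction code in the next cell is commented out; only the path definitions at the top of that cell remain active.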

In [None]:
# Extracting the galaxies into different folders, depending on class
#
# Only the two path assignments below are live code. Everything after them is
# the disabled extraction workflow, kept for reference. Before re-enabling it,
# make sure `os`, `shutil`, `zipfile` and `tqdm` are imported, since the
# disabled code uses all four names.

# Paths to images and output folder
zip_path = "/content/drive/Shared drives/DLP Project/Project/GZ2_thresholds/Datazipped/images_gz2.zip"
output_folder = "/content/drive/Shared drives/DLP Project/Project/GZ2_thresholds/Data_85_90"

# One sub-folder of output_folder per class.
# spiral_folder = os.path.join(output_folder, 'spiral')
# elliptical_folder = os.path.join(output_folder, 'elliptical')
# odd_folder = os.path.join(output_folder, 'odd')

# exist_ok=True makes folder creation safe to repeat.
# os.makedirs(spiral_folder, exist_ok=True)
# os.makedirs(elliptical_folder, exist_ok=True)
# os.makedirs(odd_folder, exist_ok=True)

# # Read the asset_id mapping
# file_path_map = "/content/drive/Shared drives/DLP Project/Project/GZ2_thresholds/gz2_filename_mapping.csv"
# data_map = pd.read_csv(file_path_map)

# Fail early if the mapping table lacks either column used by the merge below.
# if 'objid' not in data_map.columns or 'asset_id' not in data_map.columns:
#     raise ValueError("Missing required columns: 'objid' or 'asset_id'")

# extract_images(df, class_folder): for every row of `df`, look up its asset_id
# and move the matching "images/<asset_id>.jpg" archive member into class_folder.
# def extract_images(df, class_folder):
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         # Merge the data to get the asset_id for the corresponding dr7objid
#         # (how='left' keeps catalogue rows that have no mapping; their asset_id is NaN)
#         merged_df = df.merge(data_map[['objid', 'asset_id']], left_on='dr7objid', right_on='objid', how='left')

#         if merged_df['asset_id'].isnull().any():
#             print("Warning: Some dr7objid values are missing asset_id mappings")

#         # Rows without a mapping are dropped here, so they are skipped silently after the warning.
#         image_names = [f"images/{int(asset_id)}.jpg" for asset_id in merged_df['asset_id'].dropna()]

#         for image_name in tqdm(image_names, desc=f"Extracting images to {class_folder}", unit="image"):
#             # NOTE(review): namelist() returns a list, so this membership test is a
#             # linear scan of the whole archive per image; building
#             # set(zip_ref.namelist()) once before the loop would avoid that.
#             if image_name in zip_ref.namelist():
#                 # extract() writes to output_folder/images/<id>.jpg; the file is then
#                 # moved into the class folder.
#                 # NOTE(review): shutil.move into an existing directory presumably raises
#                 # if a file of the same name is already there, so re-running over an
#                 # already-populated class folder would fail - confirm before re-enabling.
#                 zip_ref.extract(image_name, output_folder)
#                 shutil.move(os.path.join(output_folder, image_name), class_folder)
#             else:
#                 print(f"Image not found: {image_name}")

# extract_images(spiral_galaxies, spiral_folder)
# extract_images(elliptical_galaxies, elliptical_folder)
# extract_images(odd_objects, odd_folder)

In [None]:
import os


def count_files(folder_path):
    """Return the number of regular files directly inside ``folder_path``.

    Sub-directories are not counted and are not descended into.

    Parameters
    ----------
    folder_path : str
        Directory to inspect.

    Returns
    -------
    int
        Count of directory entries for which ``os.path.isfile`` is true.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the directory cannot be listed
        (for example ``FileNotFoundError`` if it does not exist).
    """
    return sum(
        1
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )


# The per-class folder names are only assigned inside the commented-out
# extraction code, so they are undefined on a fresh kernel. Derive them here
# from `output_folder`, using the same sub-folder names as that code.
spiral_folder = os.path.join(output_folder, 'spiral')
elliptical_folder = os.path.join(output_folder, 'elliptical')
odd_folder = os.path.join(output_folder, 'odd')

print(f"Spiral Galaxies: {count_files(spiral_folder)} images")
print(f"Elliptical Galaxies: {count_files(elliptical_folder)} images")
print(f"Odd Objects: {count_files(odd_folder)} images")