## Récupération des images cropped

In [26]:
import pandas as pd
import requests as req
import os.path
import numpy as np
from datetime import datetime

In [27]:
# Current working directory of the kernel; the directory constants defined
# further down (DIR_PATH, DATA_PATH, ...) are all built relative to it.
path = os.getcwd()

## Constantes

In [28]:
# Absolute path of the directory one level above 'path'.
DIR_PATH = os.path.abspath(os.path.join(path, os.pardir))

# '<DIR_PATH>/data'
DATA_PATH = os.path.join(DIR_PATH, 'data')

# '<DIR_PATH>/data/images_cropped'
DATA_IMAGES_CROP = os.path.join(DATA_PATH, 'images_cropped')

# '<DIR_PATH>/logs'
LOGS_IMAGES = os.path.join(DIR_PATH, 'logs')


In [29]:
# Load the star classification catalogue and keep only the columns used below.
#
# * The file is located through DATA_PATH so this cell agrees with the other
#   path constants instead of carrying its own hard-coded '../data' prefix.
# * The steps are chained and return new frames (no inplace=True, no column
#   assignment on a subset of another frame), so re-running the cell always
#   starts from the CSV and gives the same result.
# * drop_duplicates() keeps the original row labels; they are not reset.
# * apply(int) is kept on purpose. NOTE(review): the IDs displayed further
#   down exceed the signed 64-bit range, so astype(int) does not look like a
#   safe replacement - confirm before changing.
df = (
    pd.read_csv(os.path.join(DATA_PATH, 'stars_classification_with_images.csv'))
    .loc[:, ['obj_ID', 'alpha', 'delta', 'spec_obj_ID', 'class']]
    .drop_duplicates()
    .assign(spec_obj_ID=lambda frame: frame['spec_obj_ID'].apply(int))
)


In [30]:
# Sanity check: number of distinct values in the 'spec_obj_ID' column
# (displayed as the cell output).
df['spec_obj_ID'].nunique()

99999

In [31]:
# Remove rows that contain missing values. dropna() returns a new DataFrame,
# so the result must be assigned back; calling it without assignment leaves
# 'df' unchanged.
df = df.dropna()

# Independent copy of 'df' for experimentation; changes to 'df_test' do not
# affect 'df'.
df_test = df.copy()

# Display the distinct 'spec_obj_ID' values as an array of integers.
df_test['spec_obj_ID'].apply(int).unique()

array([ 5658976714552007680, 12462617271914000384,  6961443351364392960,
       ...,  3112007759562827776,  7601079570549462016,
        8343152349989327872], dtype=uint64)

In [32]:
def classification_data(df_spec_id, df_class):
    """Build the record describing one downloaded image.

    Args:
        df_spec_id: Identifier of the object; also used as the image file stem.
        df_class: Class label associated with the object.

    Returns:
        A dict with the keys 'spec_obj_ID', 'class' and 'filename', where
        'filename' is '<df_spec_id>.jpeg'.
    """
    # Return the literal directly rather than binding it to a local named
    # 'dict', which would shadow the builtin inside this function.
    return {
        'spec_obj_ID': df_spec_id,
        'class': df_class,
        'filename': f'{df_spec_id}.jpeg',
    }

In [33]:
# NOTE(review): this value is not referenced by any other cell visible in this
# notebook (the download code builds its paths from DATA_IMAGES_CROP) -
# confirm whether it is still needed.
images_location = '/images_cropped'

In [34]:
def append_to_log(owner, text):
    """Append one line of text to today's log file for the given owner.

    The file is '<LOGS_IMAGES>/<owner>-<YYYY-MM-DD>.log'. The log directory is
    created when it does not exist yet.

    Args:
        owner: Prefix of the log file name.
        text: Message to write; a trailing newline is added.
    """
    # Current date as 'YYYY-MM-DD'; one log file per owner per day.
    timestamp = datetime.today().strftime('%Y-%m-%d')
    log_file = f'{LOGS_IMAGES}/{owner}-{timestamp}.log'

    # exist_ok=True makes this a no-op when the directory is already there and
    # avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(LOGS_IMAGES, exist_ok=True)

    # Append mode creates the file on first use.
    with open(log_file, "a") as myfile:
        myfile.write(text + '\n')

In [35]:
def cropped_image(alpha, delta, spec_obj_ID):
    """Download the SkyServer image cutout for one object unless it is on disk.

    The image is stored as '<DATA_IMAGES_CROP>/<spec_obj_ID>.jpeg'. Every step
    (processing started, downloaded, already present, failed) is recorded via
    append_to_log.

    Args:
        alpha: Value sent as the 'ra' query parameter.
        delta: Value sent as the 'dec' query parameter.
        spec_obj_ID: Identifier used as the image file stem.

    Returns:
        The identifier formatted as a string, whether or not a download took
        place.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, or no answer within the timeout).
    """
    append_to_log('natalia', f'processed spec_obj_ID {spec_obj_ID}')
    image_path = f'{DATA_IMAGES_CROP}/{spec_obj_ID}.jpeg'

    # Create the target directory up front and then carry on. Creating it in
    # an if/else alongside the download would skip the image for the call
    # that happens to create the directory.
    os.makedirs(DATA_IMAGES_CROP, exist_ok=True)

    if os.path.exists(image_path):
        append_to_log('natalia', f'{spec_obj_ID}.jpeg already exists')
        return f'{spec_obj_ID}'

    baseurl = 'https://skyserver.sdss.org/dr18/SkyServerWS/ImgCutout/getjpeg?TaskName=Skyserver.Chart.Navi&scale=0.099031675&radius=0.2&format=json'
    # Without a timeout a stalled connection would block the caller forever.
    resp = req.get(baseurl, params={'ra': alpha, 'dec': delta}, timeout=60)

    if not resp.ok:
        # Do not save an error body under a .jpeg name: the file would then be
        # treated as "already exists" and never be fetched again.
        append_to_log('natalia', f'download failed for {spec_obj_ID}: HTTP {resp.status_code}')
        return f'{spec_obj_ID}'

    # The context manager closes the file even if the write fails.
    with open(image_path, 'wb') as image_file:
        image_file.write(resp.content)
    append_to_log('natalia', f"downloaded {spec_obj_ID}")
    return f'{spec_obj_ID}'

In [46]:
# Row labels (inclusive) of the slice of 'df' handled by this run.
FIRST_LABEL = 40001
LAST_LABEL = 79999

# Label-based slice: labels inside the range that are no longer present in
# 'df' (e.g. removed as duplicates) are skipped instead of raising KeyError.
batch = df.loc[FIRST_LABEL:LAST_LABEL]

# Collect plain dict records and build the DataFrame once; growing a frame
# with pd.concat on every iteration copies all previous rows each time.
records = []
try:
    for alpha, delta, spec_obj_id, obj_class in zip(
        batch['alpha'], batch['delta'], batch['spec_obj_ID'], batch['class']
    ):
        cropped_image(alpha, delta, spec_obj_id)
        records.append(classification_data(spec_obj_id, obj_class))
finally:
    # Built in 'finally' so that an interrupted run (e.g. KeyboardInterrupt)
    # still leaves the records gathered so far available in 'df_out'.
    df_out = pd.DataFrame(records, columns=['spec_obj_ID', 'class', 'filename'])

KeyboardInterrupt: 

In [44]:
# Display the records collected by the download loop (one row per processed object).
df_out

Unnamed: 0,spec_obj_ID,class,filename
0,10794100668908195840,GALAXY,10794100668908195840.jpeg
1,5180502509907892224,GALAXY,5180502509907892224.jpeg


## Tests

In [22]:
# Report whether the cropped-images directory is present; create it when it is missing.
if os.path.exists(DATA_IMAGES_CROP):
    print("exis")
else:
    print("not exists")
    os.makedirs(DATA_IMAGES_CROP)

exis


In [23]:
def missing_images_url(path):
    """Placeholder without an implementation; returns None for any argument.

    NOTE(review): judging by the name this is meant to produce download URLs
    for images that are missing - confirm the intended contract.
    """

In [24]:
def missing_images(paths):
    """Placeholder without an implementation; returns None for any argument.

    NOTE(review): judging by the name this is meant to report which images
    are missing - confirm the intended contract.
    """

In [25]:

# Collect the file stems of the images already present in DATA_IMAGES_CROP.
# Only entries ending in '.jpeg' are kept, so other directory entries (hidden
# files, sub-directories) do not inflate the count, and splitext() removes the
# extension only at the end of the name rather than anywhere inside it.
dir_img_crop_cnt = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(DATA_IMAGES_CROP)
    if entry.endswith('.jpeg')
]
print("Files and directories in '", DATA_IMAGES_CROP, "':")
# Print every collected stem, then how many there are.
print(dir_img_crop_cnt)
print(len(dir_img_crop_cnt))

Files and directories in ' /Users/nataliamenacho/code/NMenacho/to_infinity_and_beyond/To-infinity-and-beyond/data/images_cropped ':
['6961443351364392960', '5658976714552007680', '2751763212482406400', '5652161941432719360', '12462617271914000384', '7459284627188110336']
6


In [217]:
#len(list(df['obj_ID'])), len(set(list(df['obj_ID'])))