### Paths

In [2]:
source_image_folder = '../data/images_cropped_sample'
csv_path = '../data/stars_classification_with_images.csv'

### Imports

In [3]:
from os import listdir

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers


### Create DataFrame

In [4]:
# retrieve list of images in scrapped images directory
image_names = [f for f in listdir(source_image_folder)]

# create dataframe out of data stored in images names
image_names_df = pd.DataFrame(
    columns=[
        'obj_id',
        'alpha',
        'delta',
        'label',
        'redshift_value'
    ]
)
for image_name in image_names:
    image_row = image_name.replace('.jpg', '').split('_')
    obj_id = image_row[0]
    alpha = float(image_row[1])
    delta = float(image_row[2])
    label = image_row[3]
    redshift_value = float(image_row[4])
    row = { 'obj_id':obj_id, 'alpha':alpha, 'delta':delta, 'label': label, 'redshift_value': redshift_value }
    new_df = pd.DataFrame([row])
    image_names_df = pd.concat([image_names_df, new_df], axis=0, ignore_index=True)

# create dataframe out of original csv
orignal_csv_df = pd.read_csv(csv_path)

# join dataframes based on alpha (ra) & delta (dec)
merged_df = pd.merge(
    image_names_df,
    orignal_csv_df,
    how='left',
    on=['alpha', 'delta']
)

merged_df.head()

  image_names_df = pd.concat([image_names_df, new_df], axis=0, ignore_index=True)


Unnamed: 0,obj_id,alpha,delta,label,redshift_value,obj_ID,u,g,r,i,...,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID,image_url
0,1237664673256571815,206.152742,35.783247,GALAXY,1.010923,1.237665e+18,26.30457,25.8921,21.78114,20.54924,...,301,4,130,1.154746e+19,GALAXY,1.010923,10256,58193,843,https://dr12.sdss.org//sas/dr12/boss/photoObj/...
1,1237678597545591138,341.669732,2.683017,STAR,1.209123,1.237679e+18,21.36229,21.91844,21.26442,21.13737,...,301,4,207,1.27317e+19,QSO,1.209123,11308,58426,100,https://dr12.sdss.org//sas/dr12/boss/photoObj/...
2,1237661971714802594,167.435588,7.254149,GALAXY,0.666519,1.237662e+18,23.72051,23.21101,21.19941,19.86738,...,301,4,18,6.038282e+18,GALAXY,0.666519,5363,55956,295,https://dr12.sdss.org//sas/dr12/boss/photoObj/...
3,1237678661426872832,6.576645,4.687866,STAR,2.225737,1.237679e+18,22.62305,21.89907,21.93067,21.70387,...,301,3,110,4.973126e+18,QSO,2.225737,4417,55829,93,https://dr12.sdss.org//sas/dr12/boss/photoObj/...
4,1237680272035807907,331.937003,19.126116,STAR,0.000142,1.23768e+18,20.99358,20.40626,20.57264,20.55044,...,301,3,52,5.653178e+18,STAR,0.000142,5021,55863,124,https://dr12.sdss.org//sas/dr12/boss/photoObj/...


### Clean DataFrame

In [5]:

merged_df_clean = merged_df.drop(
    columns=[
        'obj_id',
        #'label',
        'obj_ID',
        'rerun_ID',
        'run_ID',
        'cam_col',
        'class',
        'redshift',
        'plate',
        'MJD',
        'fiber_ID',
        'image_url',
        'field_ID',
        'spec_obj_ID'
    ],
    axis=1
)
#     columns=[
#         'obj_Id',
#         # 'rerun_ID',
#         # 'cam_col',

#     ]
# )

merged_df_clean = merged_df_clean.rename(columns={'redshift_value': 'redshift'})

merged_df_clean.head(5)

merged_df_clean.dtypes


alpha       float64
delta       float64
label        object
redshift    float64
u           float64
g           float64
r           float64
i           float64
z           float64
dtype: object

### Preprocess

In [6]:
X = merged_df_clean.drop("redshift", axis=1)
y = merged_df_clean['redshift']
X

Unnamed: 0,alpha,delta,label,u,g,r,i,z
0,206.152742,35.783247,GALAXY,26.30457,25.89210,21.78114,20.54924,19.29786
1,341.669732,2.683017,STAR,21.36229,21.91844,21.26442,21.13737,21.11002
2,167.435588,7.254149,GALAXY,23.72051,23.21101,21.19941,19.86738,19.27868
3,6.576645,4.687866,STAR,22.62305,21.89907,21.93067,21.70387,21.40456
4,331.937003,19.126116,STAR,20.99358,20.40626,20.57264,20.55044,20.04577
...,...,...,...,...,...,...,...,...
5430,55.829432,9.764397,STAR,17.79224,16.47265,15.84970,15.59104,15.46414
5431,13.841103,-4.093512,GALAXY,21.97417,22.42726,21.87176,22.33464,21.17168
5432,357.320307,26.080041,STAR,23.24426,22.26608,20.75471,18.77681,17.68333
5433,16.548202,-0.616989,GALAXY,17.47933,15.95861,15.23440,14.83781,14.56005


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)

# Option 1
# num_transformer = Pipeline([
#     #('imputer', SimpleImputer(strategy="mean")),
#     ('standard_scaler', StandardScaler())
# ])
# cat_transformer = OneHotEncoder(handle_unknown='ignore')
# preprocessor = ColumnTransformer([
#     ('num_transformer', num_transformer, ['age', 'bmi']),
#     ('cat_transformer', cat_transformer, ['smoker', 'region'])
# ])

num_transformer = make_pipeline( SimpleImputer(),StandardScaler())#SimpleImputer(),
num_sel = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_sel = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_sel),
    (cat_transformer, cat_sel),
    remainder='passthrough'
)
preproc_basic

In [8]:

preproc_basic.fit(X_train)

X_train_scaled = preproc_basic.transform(X_train)
X_test_scaled = preproc_basic.transform(X_test)

In [9]:
X_train_scaled[0]

array([-0.50742813,  1.81559918, -1.26610157, -1.3841251 , -1.35061063,
       -1.3316821 , -1.26422029,  1.        ,  0.        ])

# Create model

In [10]:
model = Sequential()
model.add(layers.Dense(20, activation='relu', input_dim=9))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='relu'))
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae', 'mse', 'accuracy']
)

In [12]:
y_train.mean()

0.5960877172211989

In [15]:
model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=32,
    verbose=1,
    validation_split=0.3,
    #callbacks=[es],
)

Epoch 1/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0018 - loss: 0.3186 - mae: 0.2861 - mse: 0.3186 - val_accuracy: 0.0026 - val_loss: 0.2510 - val_mae: 0.2526 - val_mse: 0.2510
Epoch 2/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527us/step - accuracy: 0.0030 - loss: 0.2755 - mae: 0.2684 - mse: 0.2755 - val_accuracy: 0.0026 - val_loss: 0.2610 - val_mae: 0.2628 - val_mse: 0.2610
Epoch 3/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 516us/step - accuracy: 0.0038 - loss: 0.2838 - mae: 0.2703 - mse: 0.2838 - val_accuracy: 0.0026 - val_loss: 0.2573 - val_mae: 0.2583 - val_mse: 0.2573
Epoch 4/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499us/step - accuracy: 4.9341e-04 - loss: 0.2974 - mae: 0.2789 - mse: 0.2974 - val_accuracy: 0.0026 - val_loss: 0.2612 - val_mae: 0.2599 - val_mse: 0.2612
Epoch 5/100
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step - a

<keras.src.callbacks.history.History at 0x381ad6a40>