In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

# basic path manipulation tools
import os
from pathlib import Path
# Locate the project relative to the kernel's current working directory.
# NOTE(review): assumes the notebook is executed from a folder that sits directly
# below the project root (later cells read project_root / "scripts" and
# project_root / "data") — confirm if the notebook is ever moved.
notebook_path = Path().absolute()
project_root = notebook_path.parent

# 1. Bronze Layer

In [2]:
# Resolve absolute path to the script
extract_path = project_root / "scripts" / "data_extract.py"

# Run the script
%run "{extract_path}"



In [3]:
# Load the bronze-layer table from data/bronze/starB.csv and display it
dfB = pd.read_csv(project_root.joinpath('data', 'bronze', 'starB.csv'))
dfB

Unnamed: 0,recno,Version,FileName,EB,LAMOST,Bin,RAJ2000,DEJ2000,Date,SNg,...,e_FeH2,Rad,s_Rad,Mass,s_Mass,Lum,s_Lum,Dist,s_Dist,DR7
0,2251,LAMOST DR8,spec-57042-EG030739N012421V01_sp10-007.fits,,J025942.96+011122.1,0,44.929009,1.189498,474979440,137.30,...,0.0140,1.820,0.115,1.760,0.286,9.80948,1.17420,1171.0200,69.2050,1
1,2254,LAMOST DR8,spec-57042-EG030739N012421B01_sp14-220.fits,,J025947.35+020834.1,0,44.947296,2.142818,474983820,50.16,...,0.0750,2.440,,1.590,,13.91443,,3164.3701,365.5800,1
2,2273,LAMOST DR8,spec-57042-EG030739N012421V01_sp14-133.fits,,J030043.83+021812.4,0,45.182644,2.303460,474979440,62.52,...,,1.254,0.062,1.210,0.180,2.13966,0.11133,440.9360,7.7280,2
3,2332,LAMOST DR8,spec-57042-EG030739N012421V01_sp16-157.fits,,J030336.54+025459.5,0,45.902268,2.916538,474979440,121.08,...,0.0134,1.480,0.077,1.620,0.293,5.39942,0.39687,905.6660,28.0010,2
4,2151,LAMOST DR8,spec-57716-EG025338N015809B01_sp15-141.fits,,J025406.46+025537.7,0,43.526944,2.927140,533229360,57.75,...,,1.277,,1.280,,2.43731,,2151.9600,190.8400,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21630,18473,LAMOST DR8,spec-56358-HD144346S023019B01_sp13-035.fits,,J145332.99-020651.5,0,223.387466,-2.114333,415913460,50.91,...,0.1320,4.078,,1.550,,36.77406,,4210.2002,422.1100,2
21631,18498,LAMOST DR8,spec-57526-HD150542S015907V01_sp04-033.fits,,J150420.63-012544.0,0,226.085982,-1.428891,516818460,221.85,...,0.0110,1.663,0.075,1.720,0.277,7.78983,0.49770,717.9860,17.6355,2
21632,20746,LAMOST DR8,spec-56201-EG214025S065830V01_sp02-050.fits,,J213339.08-082740.4,0,323.412870,-8.461230,402328380,78.06,...,,1.705,0.095,1.590,0.266,6.76456,0.62333,400.5130,17.0200,1
21633,20552,LAMOST DR8,spec-56208-EG211454S070515B03_sp02-187.fits,,J210850.48-082114.5,0,317.210360,-8.354040,402932460,69.86,...,0.0410,1.801,0.080,1.661,0.293,8.43666,0.56499,311.7170,7.9110,1


Let's inspect the raw data. We begin with its column labels and data types; missing (NaN) values and summary statistics are examined once the data is in the silver layer.

In [4]:
# Column labels of the bronze table, exactly as they were read from the CSV
dfB.columns

Index(['recno', 'Version', 'FileName', 'EB', 'LAMOST', 'Bin', 'RAJ2000',
       'DEJ2000', 'Date', 'SNg', 'Class', 'subCl', 'SpT', 'Teff1', 'e_Teff1',
       'logg1', 'e_logg1', 'FeH1', 'e_FeH1', 'SepTESS', 'Vmag', 'e_Vmag',
       'Flag', 'Teff2', 's_Teff2', 'logg2', 's_logg2', 'FeH2', 'e_FeH2', 'Rad',
       's_Rad', 'Mass', 's_Mass', 'Lum', 's_Lum', 'Dist', 's_Dist', 'DR7'],
      dtype='object')

In [5]:
# Data type pandas inferred for each bronze column
dfB.dtypes

recno         int64
Version      object
FileName     object
EB           object
LAMOST       object
Bin           int64
RAJ2000     float64
DEJ2000     float64
Date          int64
SNg         float64
Class        object
subCl        object
SpT          object
Teff1       float64
e_Teff1     float64
logg1       float64
e_logg1     float64
FeH1        float64
e_FeH1      float64
SepTESS     float64
Vmag        float64
e_Vmag      float64
Flag         object
Teff2       float64
s_Teff2     float64
logg2       float64
s_logg2     float64
FeH2        float64
e_FeH2      float64
Rad         float64
s_Rad       float64
Mass        float64
s_Mass      float64
Lum         float64
s_Lum       float64
Dist        float64
s_Dist      float64
DR7           int64
dtype: object

In [6]:
# Column labels of the bronze table.
# NOTE(review): this is the same expression as cell In [4] and yields the same
# output — consider removing this cell or replacing it with a missing-value
# check such as dfB.isna().sum().
dfB.columns

Index(['recno', 'Version', 'FileName', 'EB', 'LAMOST', 'Bin', 'RAJ2000',
       'DEJ2000', 'Date', 'SNg', 'Class', 'subCl', 'SpT', 'Teff1', 'e_Teff1',
       'logg1', 'e_logg1', 'FeH1', 'e_FeH1', 'SepTESS', 'Vmag', 'e_Vmag',
       'Flag', 'Teff2', 's_Teff2', 'logg2', 's_logg2', 'FeH2', 'e_FeH2', 'Rad',
       's_Rad', 'Mass', 's_Mass', 'Lum', 's_Lum', 'Dist', 's_Dist', 'DR7'],
      dtype='object')

The dataset contains many columns that will not be useful for stellar classification, mainly uncertainty columns such as 'e_FeH1'. Furthermore, the column labels are not in snake_case format, which would be more convenient for later analysis. We now implement these considerations in the silver layer of our Data Lakehouse.

# 2. Silver Layer

In [7]:
# Resolve absolute path to the script
silver_path = project_root / "scripts" / "silver.py"

# Run the script
%run "{silver_path}"
dfS = pd.read_csv(project_root/'data'/'silver'/'starS.csv')
dfS

Unnamed: 0,lamost,date,recno,full_class,spt,effective_temperature_1,log_surface_gravity_1,metallicity_fe_h_1,visual_magnitude,effective_temperature_2,log_surface_gravity_2,metallicity_fe_h_2,radius,mass,luminosity,distance
0,J025942.96+011122.1,474979440,2251,F0,kA5hA8mA9,7569.0,3.860,0.220,12.872,7571.0,4.1632,0.2220,1.820,1.760,9.80948,1171.0200
1,J025947.35+020834.1,474983820,2254,A6IV,kA4hA9mA9,7124.0,4.187,-0.367,14.712,7137.0,3.8647,-0.3780,2.440,1.590,13.91443,3164.3701
2,J030043.83+021812.4,474979440,2273,A2V,kA3hA5mA7,,,,12.446,6234.0,4.3243,,1.254,1.210,2.13966,440.9360
3,J030336.54+025459.5,474979440,2332,A7V,kA3hA6mA7,7254.0,4.186,-0.467,12.935,7232.0,4.3071,-0.4705,1.480,1.620,5.39942,905.6660
4,J025406.46+025537.7,533229360,2151,A3V,kA2hA4mA7,,,,16.013,6383.0,4.3332,,1.277,1.280,2.43731,2151.9600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21630,J145332.99-020651.5,415913460,18473,A3IV,kA2hA4mA7,,,,14.328,7039.0,3.4076,-0.6900,4.078,1.550,36.77406,4210.2002
21631,J150420.63-012544.0,516818460,18498,A7V,kA3hA5mA7,7475.0,4.119,-0.343,11.774,7477.0,4.2316,-0.3390,1.663,1.720,7.78983,717.9860
21632,J213339.08-082740.4,402328380,20746,A7IV,kA7hF1mF1,,,,10.689,7129.0,4.1760,,1.705,1.590,6.76456,400.5130
21633,J210850.48-082114.5,402932460,20552,A7IV,kA7hF0mF1,7327.0,4.017,-0.043,9.879,7330.0,4.1473,-0.0530,1.801,1.661,8.43666,311.7170


The silver layer DataFrame now follows the standard snake_case naming convention, which is convenient for data analysis. We now turn to inspecting some of its characteristics.

In [8]:
# Number of rows first, then summary statistics for every column
# (include='all' covers the non-numeric columns as well)
print(dfS.shape[0])
dfS.describe(include='all')

21635


Unnamed: 0,lamost,date,recno,full_class,spt,effective_temperature_1,log_surface_gravity_1,metallicity_fe_h_1,visual_magnitude,effective_temperature_2,log_surface_gravity_2,metallicity_fe_h_2,radius,mass,luminosity,distance
count,21635,21635.0,21635.0,21635,21635,13613.0,13612.0,13613.0,20374.0,19758.0,18071.0,9811.0,19202.0,18071.0,18072.0,20119.0
unique,21635,,,26,978,,,,,,,,,,,
top,J212231.01-074710.8,,,F0,kA1hA2mA3,,,,,,,,,,,
freq,1,,,8304,1439,,,,,,,,,,,
mean,,504643400.0,10818.0,,,7345.678763,4.046966,-0.023128,13.333392,7351.207106,4.051486,-0.027439,2.292268,1.686554,14.242121,1641.953452
std,,70503770.0,6245.630873,,,388.551671,0.150621,0.253429,1.40843,673.292044,0.261838,0.251261,1.509712,0.224366,13.371515,1048.889306
min,,372861500.0,1.0,,,6000.0,1.098,-0.999,7.92,1001.0,3.1216,-0.9989,0.519,0.517,0.04979,94.3582
25%,,444058000.0,5409.5,,,7167.0,3.95675,-0.181,12.32025,7079.0,3.8939,-0.185,1.651,1.58,6.85438,866.492
50%,,499464700.0,10818.0,,,7322.0,4.047,-0.025,13.388,7326.0,4.0858,-0.029,1.961,1.66,10.03827,1368.23
75%,,548105500.0,16226.5,,,7509.0,4.134,0.158,14.45,7632.0,4.2325,0.155,2.477,1.79,16.478455,2143.17995


In [9]:
# Count of missing (NaN) entries in each silver column
dfS.isna().sum()

lamost                         0
date                           0
recno                          0
full_class                     0
spt                            0
effective_temperature_1     8022
log_surface_gravity_1       8023
metallicity_fe_h_1          8022
visual_magnitude            1261
effective_temperature_2     1877
log_surface_gravity_2       3564
metallicity_fe_h_2         11824
radius                      2433
mass                        3564
luminosity                  3563
distance                    1516
dtype: int64

As one may observe, the 'lamost' column holds a unique value for every row, being essentially an identifier of the object in the sky. It can therefore serve as the basis for a hard key in the gold layer. Furthermore, most of the columns contain missing values, which must be dealt with properly.

In [10]:
# Summary statistics restricted to rows that have no missing value in any column;
# dfS itself is left unchanged because the dropna() result is not assigned back
dfS.dropna().describe(include='all')

Unnamed: 0,lamost,date,recno,full_class,spt,effective_temperature_1,log_surface_gravity_1,metallicity_fe_h_1,visual_magnitude,effective_temperature_2,log_surface_gravity_2,metallicity_fe_h_2,radius,mass,luminosity,distance
count,7983,7983.0,7983.0,7983,7983,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0,7983.0
unique,7983,,,20,557,,,,,,,,,,,
top,J212231.01-074710.8,,,F0,kA7hF0mF0,,,,,,,,,,,
freq,1,,,4927,304,,,,,,,,,,,
mean,,480272700.0,10667.756608,,,7361.362395,4.036674,-0.02971,13.325144,7358.282601,4.088209,-0.031135,2.016956,1.674226,12.042555,1506.927511
std,,58772790.0,6169.545535,,,343.744705,0.135194,0.250396,1.279332,348.986952,0.247796,0.251797,0.637769,0.137566,9.328316,868.507923
min,,372861500.0,1.0,,,6009.0,2.02,-0.979,8.528,5629.0,3.1216,-0.981,0.676,1.0,0.82404,166.973
25%,,440267400.0,5246.5,,,7195.0,3.954,-0.186,12.48,7192.0,3.94305,-0.19,1.583,1.61,6.567575,867.7065
50%,,470858300.0,10624.0,,,7328.0,4.04,-0.034,13.389,7329.0,4.1236,-0.035,1.849,1.66,9.05367,1293.48
75%,,505188300.0,16021.5,,,7504.0,4.12,0.15,14.3115,7507.0,4.26155,0.1495,2.279,1.73,13.89999,1943.03


# 3. Gold Layer

In the gold layer, we combine the 'date' and 'lamost' columns into a single 'identifier' column, which serves as a hard key for our data. This feature carries information about both the object itself ('lamost') and its date, which is useful when updating the dataset. Apart from the missing values, which must be handled differently depending on the analysis, the dataset is now ready to be visualized and analyzed; we do so in the data_visualization.ipynb notebook.

In [16]:
# Resolve absolute path to the script
gold_path = project_root / "scripts" / "gold.py"

# Run the script
%run "{gold_path}"
dfG = pd.read_csv(project_root/'data'/'gold'/'starG.csv')
dfG

Unnamed: 0,identifier,class,subclass,luminosity_class,full_class,recno,spt,effective_temperature_1,log_surface_gravity_1,metallicity_fe_h_1,visual_magnitude,effective_temperature_2,log_surface_gravity_2,metallicity_fe_h_2,radius,mass,luminosity,distance,lamost,date
0,19850119_J025942.96+011122.1,F,F0,,F0,2251,kA5hA8mA9,7569.0,3.860,0.220,12.872,7571.0,4.1632,0.2220,1.820,1.760,9.80948,1171.0200,J025942.96+011122.1,1985-01-19 10:44:00
1,19850119_J025947.35+020834.1,A,A6,IV,A6IV,2254,kA4hA9mA9,7124.0,4.187,-0.367,14.712,7137.0,3.8647,-0.3780,2.440,1.590,13.91443,3164.3701,J025947.35+020834.1,1985-01-19 11:57:00
2,19850119_J030043.83+021812.4,A,A2,V,A2V,2273,kA3hA5mA7,,,,12.446,6234.0,4.3243,,1.254,1.210,2.13966,440.9360,J030043.83+021812.4,1985-01-19 10:44:00
3,19850119_J030336.54+025459.5,A,A7,V,A7V,2332,kA3hA6mA7,7254.0,4.186,-0.467,12.935,7232.0,4.3071,-0.4705,1.480,1.620,5.39942,905.6660,J030336.54+025459.5,1985-01-19 10:44:00
4,19861124_J025406.46+025537.7,A,A3,V,A3V,2151,kA2hA4mA7,,,,16.013,6383.0,4.3332,,1.277,1.280,2.43731,2151.9600,J025406.46+025537.7,1986-11-24 15:16:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21630,19830307_J145332.99-020651.5,A,A3,IV,A3IV,18473,kA2hA4mA7,,,,14.328,7039.0,3.4076,-0.6900,4.078,1.550,36.77406,4210.2002,J145332.99-020651.5,1983-03-07 19:31:00
21631,19860518_J150420.63-012544.0,A,A7,V,A7V,18498,kA3hA5mA7,7475.0,4.119,-0.343,11.774,7477.0,4.2316,-0.3390,1.663,1.720,7.78983,717.9860,J150420.63-012544.0,1986-05-18 16:41:00
21632,19821001_J213339.08-082740.4,A,A7,IV,A7IV,20746,kA7hF1mF1,,,,10.689,7129.0,4.1760,,1.705,1.590,6.76456,400.5130,J213339.08-082740.4,1982-10-01 13:53:00
21633,19821008_J210850.48-082114.5,A,A7,IV,A7IV,20552,kA7hF0mF1,7327.0,4.017,-0.043,9.879,7330.0,4.1473,-0.0530,1.801,1.661,8.43666,311.7170,J210850.48-082114.5,1982-10-08 13:41:00


In [12]:
# NOTE(review): this entire cell is commented out, so it executes nothing.
# It reads as a draft for merging new records into the gold table. As written,
# input_csv, new_csv and output_csv all resolve to the same file
# (gold_path / 'starG.csv'), so re-enabling it unchanged would concatenate the
# gold table with itself before de-duplicating. Point new_csv at the real
# incoming file before use, or delete the cell if the draft is abandoned.
# # SETUP
# import pandas as pd
# from pathlib import Path
# from datetime import datetime

# # PATHS (using your exact structure)
# project_root = Path().absolute().parent
# gold_path = project_root / 'data' / 'gold'
# gold_path.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
# input_csv = gold_path / 'starG.csv'  # Existing data
# new_csv = gold_path / 'starG.csv'  # New data to merge
# output_csv = gold_path / 'starG.csv'  # Final output

# # VALIDATION FUNCTION
# def validate_columns(df, required_cols):
#     missing = [col for col in required_cols if col not in df.columns]
#     if missing:
#         raise KeyError(f"Missing required columns: {missing}")

# # LOAD DATA WITH VALIDATION
# required_columns = ['identifier', 'date', 'lamost']
# try:
#     df_existing = pd.read_csv(input_csv, parse_dates=['date']) if input_csv.exists() else pd.DataFrame()
#     validate_columns(df_existing, required_columns) if not df_existing.empty else None
    
#     df_new = pd.read_csv(new_csv, parse_dates=['date'])
#     validate_columns(df_new, required_columns)

#     # MERGE STRATEGY
#     combined = pd.concat([df_existing, df_new])
    
#     # 1. Sort by date (newest first)
#     combined = combined.sort_values('date', ascending=False)
    
#     # 2. Remove duplicates - keeps first occurrence (most recent) for each identifier
#     combined = combined.drop_duplicates(subset=['identifier'], keep='first')
    
#     # 3. For repeated lamost IDs, keep most recent with complete identifier
#     combined = combined.sort_values(['lamost', 'date'], ascending=[True, False])
#     combined = combined.drop_duplicates(subset=['lamost'], keep='first')
    
#     # FILL MISSING VALUES (new data fills gaps in existing)
#     combined.update(df_new, overwrite=False)  # Only fills NaN values
    
#     # SAVE RESULT
#     combined.to_csv(output_csv, index=False)
#     print(f"✅ Successfully updated: {output_csv}")
    
# except KeyError as e:
#     print(f"❌ Column error: {e}")
#     print("Required columns: identifier, date, lamost")
# except Exception as e:
#     print(f"❌ Unexpected error: {e}")