In [2]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from astropy.coordinates import SkyCoord
from astropy import units as u


In [4]:
# Folder that holds the training spectra of each class.
train_dirs = {
    'agn': 'lamost_train_set/agn_data',
    'star': 'lamost_train_set/star_data',
    'bin': 'lamost_train_set/bin_data',
    'gal': 'lamost_train_set/gal_data',
}

# For every class, keep the names of the regular files found directly in its
# folder (sub-directories are skipped).
train_files = {}
for label, directory in train_dirs.items():
    train_files[label] = [
        entry for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Expose one list per class under the names used by the later cells.
id_train_agn = train_files['agn']
id_train_star = train_files['star']
id_train_bin = train_files['bin']
id_train_gal = train_files['gal']

# Report how many files each class contains.
print('Number of AGN spectra:', len(id_train_agn))
print('Number of STAR spectra:', len(id_train_star))
print('Number of BIN spectra:', len(id_train_bin))
print('Number of GAL spectra:', len(id_train_gal))




Number of AGN spectra: 35936
Number of STAR spectra: 86037
Number of BIN spectra: 40676
Number of GAL spectra: 1699


In [5]:
# Folder that holds the validation spectra of each class.
val_dirs = {
    'agn': 'lamost_val_set/agn_data',
    'star': 'lamost_val_set/star_data',
    'bin': 'lamost_val_set/bin_data',
    'gal': 'lamost_val_set/gal_data',
}

# For every class, keep the names of the regular files found directly in its
# folder (sub-directories are skipped).
val_files = {}
for label, directory in val_dirs.items():
    val_files[label] = [
        entry for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Expose one list per class under the names used by the later cells.
id_val_agn = val_files['agn']
id_val_star = val_files['star']
id_val_bin = val_files['bin']
id_val_gal = val_files['gal']

# Report how many files each class contains.
print('Number of AGN spectra:', len(id_val_agn))
print('Number of STAR spectra:', len(id_val_star))
print('Number of BIN spectra:', len(id_val_bin))
print('Number of GAL spectra:', len(id_val_gal))


Number of AGN spectra: 400
Number of STAR spectra: 400
Number of BIN spectra: 400
Number of GAL spectra: 400


## Note: the cells below had to be run twice, once for the validation set and once for the training set

In [6]:
# Load the LAMOST catalogue from a local CSV file (the file name suggests the
# DR9 v2.0 low-resolution release). The cells below read its 'obsid', 'ra'
# and 'dec' columns.
lamost_catalog = pd.read_csv("dr9_v2.0_LRS_catalogue.csv")  # Assuming CSV format for LAMOST catalog

# Starting off with Gal

In [17]:
# The file names of the validation GAL spectra are converted to integer obsids.
obsid_list = list(map(int, id_val_gal))

# Keep only the LAMOST catalogue rows whose obsid belongs to that list.
in_selection = lamost_catalog['obsid'].isin(obsid_list)
filtered_df = lamost_catalog.loc[in_selection]

# Show the sky positions of the selected LAMOST rows.
print(filtered_df[['ra', 'dec']])

################################################################################################################

# Load the GAL table that will be matched against the LAMOST positions.
gal_data = pd.read_pickle("Pickles/gal_data.pkl")

# Force both coordinates to numeric (anything unparsable becomes NaN), then
# discard the rows that end up without a usable position.
for coord in ('ra', 'dec'):
    gal_data[coord] = pd.to_numeric(gal_data[coord], errors='coerce')
gal_data = gal_data.dropna(subset=['ra', 'dec'])

# LAMOST side of the match: identifier plus position.
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build SkyCoord objects (degrees) for both tables.
gal_coords = SkyCoord(ra=gal_data['ra'].values * u.deg,
                      dec=gal_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg,
                         dec=ra_dec_values['dec'].values * u.deg)

# For every GAL row, find the nearest LAMOST row and its on-sky separation.
# NOTE(review): the match runs from the GAL rows to the LAMOST rows, so several
# GAL rows may pair with the same obsid - confirm duplicates are acceptable.
idx, d2d, _ = gal_coords.match_to_catalog_sky(lamost_coords)

# Accept a pair only when it is closer than the matching radius.
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Rows of each table that take part in an accepted pair, in the same order.
matched_gal = gal_data.iloc[matches]
matched_lamost = ra_dec_values.iloc[idx[matches]]

# Put both halves side by side; the indexes are reset so rows align by position.
gal_part = matched_gal.reset_index(drop=True)
lamost_part = matched_lamost.reset_index(drop=True)
matched_data = pd.concat([gal_part, lamost_part], axis=1)

#######################################################################################################################



                  ra        dec
1838      333.865510   0.776690
113126    138.383021  31.957768
237337     42.455140  -0.324820
285166    175.558822  28.841412
288652     20.021740  -0.404700
...              ...        ...
10491764  246.803116  29.998576
10567385  117.924140  53.057106
10595660  227.251731  31.032884
10669548  115.395570  23.501689
10681656  189.348344  55.694221

[400 rows x 2 columns]


In [19]:
# Columns kept in the cleaned GAL table.
gaia_columns = ['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax', 'parallax_error',
                'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
                'phot_g_mean_flux', 'phot_g_mean_flux_error',
                'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
                'phot_rp_mean_flux', 'phot_rp_mean_flux_error']

# matched_data is a side-by-side concat of the GAL table and the LAMOST
# positions, so 'ra' and 'dec' each appear twice. Keep only the first
# occurrence of every column name (the copy from the GAL table, which was
# placed first in the concat) instead of renaming columns by position, which
# fails as soon as the number of duplicated columns changes.
unique_columns = ~matched_data.columns.duplicated()
gal_gaia = matched_data.loc[:, unique_columns][gaia_columns]
print(gal_gaia.columns)

# Create the output folder if needed, then save the table.
os.makedirs("Pickles/vcleaned", exist_ok=True)
gal_gaia.to_pickle("Pickles/vcleaned/gal_gaia.pkl")



Index(['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax',
       'parallax_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
       'phot_g_mean_flux', 'phot_g_mean_flux_error', 'phot_bp_mean_flux',
       'phot_bp_mean_flux_error', 'phot_rp_mean_flux',
       'phot_rp_mean_flux_error'],
      dtype='object')


 # Doing the same for bin

In [25]:
# The file names of the training BIN spectra are converted to integer obsids.
obsid_list = list(map(int, id_train_bin))

# Keep only the LAMOST catalogue rows whose obsid belongs to that list.
in_selection = lamost_catalog['obsid'].isin(obsid_list)
filtered_df = lamost_catalog.loc[in_selection]

# Sky positions of the selected LAMOST rows.
ra_dec_values = filtered_df[['ra', 'dec']]
print(ra_dec_values)


                  ra        dec
778       333.964749   0.302551
1903      330.641667   1.239245
1942      330.369258   0.786806
9462       79.412996  30.833612
9983      330.698645  28.929285
...              ...        ...
10686771  278.870280  11.083686
10686845  277.704890  11.178101
10686858  277.555770  11.421151
10686863  277.662760  11.637670
10686918  276.793290  11.188747

[40676 rows x 2 columns]


In [26]:
# Load your GAL data and LAMOST catalog (assuming you have a local CSV or FITS file for LAMOST)
bin_data = pd.read_pickle("Pickles/bin_data.pkl")  # Loaded BIN data

# Ensure that RA and Dec columns are numeric and have units
bin_data['ra'] = pd.to_numeric(bin_data['ra'], errors='coerce')  # Coerce non-numeric to NaN
bin_data['dec'] = pd.to_numeric(bin_data['dec'], errors='coerce')

# Drop rows with NaN values in 'ra' and 'dec' columns
bin_data = bin_data.dropna(subset=['ra', 'dec'])
ra_dec_values = filtered_df[['obsid','ra', 'dec']]
# Convert GAL and LAMOST data to SkyCoord objects for crossmatching
bin_coords = SkyCoord(ra=bin_data['ra'].values*u.deg, dec=bin_data['dec'].values*u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values*u.deg, dec=ra_dec_values['dec'].values*u.deg)

# Crossmatch the GAL and LAMOST data
idx, d2d, _ = bin_coords.match_to_catalog_sky(lamost_coords)

# Define a matching radius
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Filter the matches
matched_bin = bin_data.iloc[matches]
matched_lamost = ra_dec_values.iloc[idx[matches]]

# Combine the matched data
matched_data = pd.concat([matched_bin.reset_index(drop=True), matched_lamost.reset_index(drop=True)], axis=1)




In [27]:
# Columns kept in the cleaned BIN table.
gaia_columns = ['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax', 'parallax_error',
                'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
                'phot_g_mean_flux', 'phot_g_mean_flux_error',
                'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
                'phot_rp_mean_flux', 'phot_rp_mean_flux_error']

# matched_data is a side-by-side concat of the BIN table and the LAMOST
# positions, so 'ra' and 'dec' each appear twice. Keep only the first
# occurrence of every column name (the copy from the BIN table, which was
# placed first in the concat) instead of renaming columns by position, which
# fails as soon as the number of duplicated columns changes.
unique_columns = ~matched_data.columns.duplicated()
bin_gaia = matched_data.loc[:, unique_columns][gaia_columns]
print(bin_gaia.columns)

# Create the output folder if needed, then save the table.
os.makedirs("Pickles/tcleaned", exist_ok=True)
bin_gaia.to_pickle("Pickles/tcleaned/bin_gaia.pkl")

Index(['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax',
       'parallax_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
       'phot_g_mean_flux', 'phot_g_mean_flux_error', 'phot_bp_mean_flux',
       'phot_bp_mean_flux_error', 'phot_rp_mean_flux',
       'phot_rp_mean_flux_error'],
      dtype='object')


# Doing the same for Stars

In [None]:
# The file names of the training STAR spectra are converted to integer obsids.
obsid_list = list(map(int, id_train_star))

# Keep only the LAMOST catalogue rows whose obsid belongs to that list.
in_selection = lamost_catalog['obsid'].isin(obsid_list)
filtered_df = lamost_catalog.loc[in_selection]

# Show the sky positions of the selected LAMOST rows.
print(filtered_df[['ra', 'dec']])

# Load the STAR table that will be matched against the LAMOST positions.
star_data = pd.read_pickle("Pickles/star_data.pkl")

# Force both coordinates to numeric (anything unparsable becomes NaN), then
# discard the rows that end up without a usable position.
for coord in ('ra', 'dec'):
    star_data[coord] = pd.to_numeric(star_data[coord], errors='coerce')
star_data = star_data.dropna(subset=['ra', 'dec'])

# LAMOST side of the match: identifier plus position.
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build SkyCoord objects (degrees) for both tables.
star_coords = SkyCoord(ra=star_data['ra'].values * u.deg,
                       dec=star_data['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg,
                         dec=ra_dec_values['dec'].values * u.deg)

# For every STAR row, find the nearest LAMOST row and its on-sky separation.
# NOTE(review): the match runs from the STAR rows to the LAMOST rows, so several
# STAR rows may pair with the same obsid - confirm duplicates are acceptable.
idx, d2d, _ = star_coords.match_to_catalog_sky(lamost_coords)

# Accept a pair only when it is closer than the matching radius.
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Rows of each table that take part in an accepted pair, in the same order.
matched_star = star_data.iloc[matches]
matched_lamost = ra_dec_values.iloc[idx[matches]]

# Put both halves side by side; the indexes are reset so rows align by position.
star_part = matched_star.reset_index(drop=True)
lamost_part = matched_lamost.reset_index(drop=True)
matched_data = pd.concat([star_part, lamost_part], axis=1)

######################################################################################################################


                  ra        dec
99        332.098588  -1.259508
135       332.087160  -2.039786
435       331.310760   0.145980
452       331.312840   0.914318
499       332.425814   0.696325
...              ...        ...
10686847  277.463350  11.244976
10686850  277.426030  11.074758
10686879  276.872710  11.871802
10686896  277.306960  11.241274
10686899  277.242780  11.204973

[86037 rows x 2 columns]


In [31]:
# Columns kept in the cleaned STAR table.
gaia_columns = ['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax', 'parallax_error',
                'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
                'phot_g_mean_flux', 'phot_g_mean_flux_error',
                'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
                'phot_rp_mean_flux', 'phot_rp_mean_flux_error']

# matched_data is a side-by-side concat of the STAR table and the LAMOST
# positions, so 'ra' and 'dec' each appear twice. Keep only the first
# occurrence of every column name (the copy from the STAR table, which was
# placed first in the concat) instead of renaming columns by position, which
# fails as soon as the number of duplicated columns changes.
# Note: this rebinds `star_data`, replacing the raw table loaded in the
# previous cell with the trimmed, matched one.
unique_columns = ~matched_data.columns.duplicated()
star_data = matched_data.loc[:, unique_columns][gaia_columns]
print(star_data.columns)

# Create the folder the pickle is actually written to. The original cell
# created "Pickles/vcleaned" while saving into "Pickles/tcleaned".
os.makedirs("Pickles/tcleaned", exist_ok=True)
star_data.to_pickle("Pickles/tcleaned/star_gaia.pkl")

Index(['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax',
       'parallax_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
       'phot_g_mean_flux', 'phot_g_mean_flux_error', 'phot_bp_mean_flux',
       'phot_bp_mean_flux_error', 'phot_rp_mean_flux',
       'phot_rp_mean_flux_error'],
      dtype='object')


# Doing the same for AGN

In [34]:
# The file names of the validation AGN spectra are converted to integer obsids.
obsid_list = list(map(int, id_val_agn))

# Keep only the LAMOST catalogue rows whose obsid belongs to that list.
in_selection = lamost_catalog['obsid'].isin(obsid_list)
filtered_df = lamost_catalog.loc[in_selection]

# Show the sky positions of the selected LAMOST rows.
print(filtered_df[['ra', 'dec']])

# Load the AGN table that will be matched against the LAMOST positions.
df = pd.read_pickle("Pickles/agn_data.pkl")

# Force both coordinates to numeric (anything unparsable becomes NaN), then
# discard the rows that end up without a usable position.
for coord in ('ra', 'dec'):
    df[coord] = pd.to_numeric(df[coord], errors='coerce')
df = df.dropna(subset=['ra', 'dec'])

# LAMOST side of the match: identifier plus position.
ra_dec_values = filtered_df[['obsid', 'ra', 'dec']]

# Build SkyCoord objects (degrees) for both tables.
agn_coords = SkyCoord(ra=df['ra'].values * u.deg,
                      dec=df['dec'].values * u.deg)
lamost_coords = SkyCoord(ra=ra_dec_values['ra'].values * u.deg,
                         dec=ra_dec_values['dec'].values * u.deg)

# For every AGN row, find the nearest LAMOST row and its on-sky separation.
# NOTE(review): the match runs from the AGN rows to the LAMOST rows, so several
# AGN rows may pair with the same obsid - confirm duplicates are acceptable.
idx, d2d, _ = agn_coords.match_to_catalog_sky(lamost_coords)

# Accept a pair only when it is closer than the matching radius.
match_radius = 1 * u.arcsec
matches = d2d < match_radius

# Rows of each table that take part in an accepted pair, in the same order.
matched_agn = df.iloc[matches]
matched_lamost = ra_dec_values.iloc[idx[matches]]

# Put both halves side by side; the indexes are reset so rows align by position.
agn_part = matched_agn.reset_index(drop=True)
lamost_part = matched_lamost.reset_index(drop=True)
matched_data = pd.concat([agn_part, lamost_part], axis=1)



                  ra        dec
279       330.366460  -0.800640
3899       43.798310   0.049600
176584    163.459516  28.013092
247848     25.947470   2.692070
262363     44.136010   5.782250
...              ...        ...
10594670  225.100220  31.664313
10608168  351.991470  -0.349688
10614698  345.211320  33.541183
10626703  332.806910  21.561966
10669837  136.487280  12.533718

[400 rows x 2 columns]


In [35]:
# Columns kept in the cleaned AGN table.
gaia_columns = ['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax', 'parallax_error',
                'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
                'phot_g_mean_flux', 'phot_g_mean_flux_error',
                'phot_bp_mean_flux', 'phot_bp_mean_flux_error',
                'phot_rp_mean_flux', 'phot_rp_mean_flux_error']

# matched_data is a side-by-side concat of the AGN table and the LAMOST
# positions, so 'ra' and 'dec' each appear twice. Keep only the first
# occurrence of every column name (the copy from the AGN table, which was
# placed first in the concat) instead of renaming columns by position, which
# fails as soon as the number of duplicated columns changes.
# Note: this rebinds `df`, replacing the raw table loaded in the previous cell.
unique_columns = ~matched_data.columns.duplicated()
df = matched_data.loc[:, unique_columns][gaia_columns]
print(df.columns)

# Create the output folder if needed (the original relied on another cell
# having created it), then save the table.
os.makedirs("Pickles/vcleaned", exist_ok=True)
df.to_pickle("Pickles/vcleaned/agn_gaia.pkl")

Index(['obsid', 'ra', 'dec', 'ra_error', 'dec_error', 'parallax',
       'parallax_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error',
       'phot_g_mean_flux', 'phot_g_mean_flux_error', 'phot_bp_mean_flux',
       'phot_bp_mean_flux_error', 'phot_rp_mean_flux',
       'phot_rp_mean_flux_error'],
      dtype='object')


: 

# Dealing with nans

In [3]:
def clean_nans(input_pkl, output_pkl):
    """Fill missing astrometry, drop rows that still contain NaN, and save the result.

    Parameters
    ----------
    input_pkl : str
        Path of the pickled DataFrame to clean. It must contain the columns
        'obsid', 'parallax', 'parallax_error', 'pmra', 'pmra_error', 'pmdec'
        and 'pmdec_error'.
    output_pkl : str
        Path the cleaned DataFrame is pickled to. Its parent folder is created
        when it does not exist yet.

    Returns
    -------
    pandas.Series
        The 'obsid' values of the rows that were dropped because they still
        held a NaN after the fill step.
    """
    table = pd.read_pickle(input_pkl)

    # Flag rows without a parallax BEFORE the fill below hides that information.
    table['flagnopllx'] = np.where(table['parallax'].isnull(), 1, 0)

    # Missing parallax / proper motion: value 0 with an error of 10
    # (10 is a large value but still normalizable).
    table = table.fillna({
        'parallax': 0, 'parallax_error': 10,
        'pmra': 0, 'pmra_error': 10,
        'pmdec': 0, 'pmdec_error': 10,
    })

    # Any row that still has a NaN in some other column is dropped.
    has_nan = table.isna().any(axis=1)
    na_free = table[~has_nan]
    only_na = table[has_nan]

    n_total = len(table)
    n_dropped = n_total - len(na_free)
    # Guard against an empty input table (the original divided by zero).
    pct_dropped = (n_dropped / n_total) * 100 if n_total else 0.0
    print('Number of rows now:', len(na_free))
    print('Number of rows dropped:', n_dropped, 'accounting for', pct_dropped, '%')

    # Save the NaN-free table. The original pickled the unfiltered frame, so
    # the "dropped" rows were still present in the output file.
    out_dir = os.path.dirname(output_pkl)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    na_free.to_pickle(output_pkl)
    return only_na['obsid']

# Clean every class table of the validation ("v") and training ("t") sets, in
# the order agn, bin, star, gal, keeping the dropped obsids of each run.
source_classes = ("agn", "bin", "star", "gal")

obsid_drop_vagn, obsid_drop_vbin, obsid_drop_vstar, obsid_drop_vgal = [
    clean_nans(f"Pickles/vcleaned/{name}_gaia.pkl", f"Pickles/vcleaned2/{name}_gaia.pkl")
    for name in source_classes
]

obsid_drop_tagn, obsid_drop_tbin, obsid_drop_tstar, obsid_drop_tgal = [
    clean_nans(f"Pickles/tcleaned/{name}_gaia.pkl", f"Pickles/tcleaned2/{name}_gaia.pkl")
    for name in source_classes
]

Number of rows now: 400
Number of rows dropped: 0 accounting for 0.0 %
Number of rows now: 399
Number of rows dropped: 1 accounting for 0.25 %
Number of rows now: 400
Number of rows dropped: 0 accounting for 0.0 %
Number of rows now: 399
Number of rows dropped: 1 accounting for 0.25 %
Number of rows now: 35931
Number of rows dropped: 5 accounting for 0.013913624220837043 %
Number of rows now: 40608
Number of rows dropped: 71 accounting for 0.1745372305120578 %
Number of rows now: 85928
Number of rows dropped: 157 accounting for 0.1823778823256084 %
Number of rows now: 1698
Number of rows dropped: 1 accounting for 0.05885815185403178 %


In [6]:
# Gather the obsids dropped from all eight tables into a single Series.
obsid_drop = pd.concat([obsid_drop_vagn, obsid_drop_vbin, obsid_drop_vstar, obsid_drop_vgal, obsid_drop_tagn, obsid_drop_tbin, obsid_drop_tstar, obsid_drop_tgal])

# Make sure the output folder exists before pickling; to_pickle does not create it.
os.makedirs("Pickles/drops", exist_ok=True)
obsid_drop.to_pickle("Pickles/drops/gaiaall.pkl")

### Note: the dropped rows are those lacking BP and RP photometric data

# Remove the LAMOST NaN rows from the Gaia data

In [15]:
import pandas as pd
import numpy as np

def clean_nans(input_pkl, output_pkl, lmst_drop=None):
    """Fill missing astrometry, drop NaN rows and LAMOST-rejected rows, and save.

    Parameters
    ----------
    input_pkl : str
        Path of the pickled DataFrame to clean. It must contain the columns
        'obsid', 'parallax', 'parallax_error', 'pmra', 'pmra_error', 'pmdec'
        and 'pmdec_error'.
    output_pkl : str
        Path the cleaned DataFrame is pickled to. Its parent folder is created
        when it does not exist yet.
    lmst_drop : list-like of obsid, optional
        obsids to remove in addition to the NaN rows. When omitted, no extra
        rows are removed, so the two-argument call style used earlier in this
        notebook keeps working after this redefinition.

    Returns
    -------
    pandas.Series
        The 'obsid' values of the rows dropped because they still held a NaN
        after the fill step (rows removed through `lmst_drop` are not included).
    """
    # Local import: this cell re-imports its own dependencies, so it may be run
    # in a kernel where the notebook's first import cell was not executed.
    import os

    df = pd.read_pickle(input_pkl)

    # Flag rows without a parallax BEFORE the fill below hides that information.
    df['flagnopllx'] = np.where(df['parallax'].isnull(), 1, 0)

    # Missing parallax / proper motion: value 0 with an error of 10.
    df = df.fillna({
        'parallax': 0, 'parallax_error': 10,
        'pmra': 0, 'pmra_error': 10,
        'pmdec': 0, 'pmdec_error': 10,
    })

    # Any row that still has a NaN in some other column is dropped.
    has_nan = df.isna().any(axis=1)
    na_free = df[~has_nan]
    only_na = df[has_nan]

    n_nan = len(df) - len(na_free)
    # Guard against an empty input table (the original divided by zero).
    pct_nan = (n_nan / len(df)) * 100 if len(df) else 0.0
    # Note: the "rows before" figure is the row count after the NaN drop,
    # i.e. before the LAMOST-based drop below.
    print('Number of rows before:', len(na_free), 'Number of rows dropped:', n_nan, 'accounting for', pct_nan, '%')

    # Drop rows where obsid is in lmst_drop
    if lmst_drop is None:
        lmst_drop = []
    kept = na_free[~na_free['obsid'].isin(lmst_drop)]
    n_lamost = len(na_free) - len(kept)
    pct_lamost = (n_lamost / len(na_free)) * 100 if len(na_free) else 0.0
    print('Number of rows dropped due to LAMOST:', n_lamost, 'accounting for', pct_lamost, '%')
    print('Number of rows now:', len(kept))

    # Save the cleaned table, creating the destination folder when needed.
    out_dir = os.path.dirname(output_pkl)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    kept.to_pickle(output_pkl)
    return only_na['obsid']

# obsids rejected on the LAMOST side; they are removed from every Gaia table below.
lmst_drop = pd.read_pickle('Pickles/drops/lamost.pkl')

# Clean every class table of the validation ("v") and training ("t") sets, in
# the order agn, bin, star, gal, keeping the NaN-dropped obsids of each run.
source_classes = ("agn", "bin", "star", "gal")

obsid_drop_vagn, obsid_drop_vbin, obsid_drop_vstar, obsid_drop_vgal = [
    clean_nans(f"Pickles/vcleaned/{name}_gaia.pkl", f"Pickles/vcleaned3/{name}_gaia.pkl", lmst_drop)
    for name in source_classes
]

obsid_drop_tagn, obsid_drop_tbin, obsid_drop_tstar, obsid_drop_tgal = [
    clean_nans(f"Pickles/tcleaned/{name}_gaia.pkl", f"Pickles/tcleaned3/{name}_gaia.pkl", lmst_drop)
    for name in source_classes
]


Number of rows before: 400 Number of rows dropped: 0 accounting for 0.0 %
Number of rows dropped due to LAMOST: 0 accounting for 0.0 %
Number of rows now: 400
Number of rows before: 399 Number of rows dropped: 1 accounting for 0.25 %
Number of rows dropped due to LAMOST: 0 accounting for 0.0 %
Number of rows now: 399
Number of rows before: 400 Number of rows dropped: 0 accounting for 0.0 %
Number of rows dropped due to LAMOST: 0 accounting for 0.0 %
Number of rows now: 400
Number of rows before: 399 Number of rows dropped: 1 accounting for 0.25 %
Number of rows dropped due to LAMOST: 0 accounting for 0.0 %
Number of rows now: 399
Number of rows before: 35931 Number of rows dropped: 5 accounting for 0.013913624220837043 %
Number of rows dropped due to LAMOST: 125 accounting for 0.3478890094904122 %
Number of rows now: 35806
Number of rows before: 40608 Number of rows dropped: 71 accounting for 0.1745372305120578 %
Number of rows dropped due to LAMOST: 122 accounting for 0.30043341213553

In [11]:
import gc
# Run a full garbage-collection pass; the call returns the number of
# unreachable objects it found, which the notebook shows as the cell output.
gc.collect()

31