# Preprocessing and Data Cleaning

**This notebook is just for reference, there's no need to run this.  
the clean data is saved at the end and uploaded in the main script.**

In [11]:
import pandas as pd
import numpy as np

In [36]:
upload = pd.read_csv('raw_data.csv').rename(str.lower, axis='columns')
upload.sample(10)

Unnamed: 0,kpmag,numax,e_numax,delnu,e_delnu,a,e_a,width,e_width,teff,...,e_nocorr,rgbcorm,e_rgbcorm,rgbcorr,e_rgbcorr,clcorm,e_clcorm,clcorr,e_clcorr,phase
358,10.98,36.18,0.86,3.937,0.015,68.9,3.3,14.4,1.9,4806,...,0.4,1.49,0.14,11.81,0.38,1.66,0.15,12.45,0.4,2
179,13.0,34.83,0.84,4.116,0.038,99.6,4.1,15.1,1.3,5202,...,0.43,1.46,0.15,11.68,0.44,1.44,0.15,11.62,0.43,2
2225,12.651,270.6,8.96,18.63,0.059,,,,,4944,...,0.16,1.42,0.15,4.18,0.16,1.49,0.16,4.29,0.16,1
3744,13.06,41.6,0.77,4.412,0.022,76.4,4.4,16.6,1.6,4802,...,0.34,1.45,0.12,10.88,0.32,1.62,0.13,11.47,0.34,2
13458,13.26,79.39,0.54,7.733,0.014,67.0,2.6,25.0,1.2,5134,...,0.16,1.33,0.08,7.41,0.16,1.35,0.08,7.46,0.16,1
5870,13.85,47.5,0.45,5.183,0.016,97.3,5.8,14.3,0.9,4894,...,0.22,1.2,0.08,9.2,0.21,1.33,0.09,9.68,0.23,1
14645,13.76,27.37,1.06,3.849,0.041,137.7,6.6,12.2,2.6,5096,...,0.5,0.89,0.12,10.36,0.51,0.89,0.12,10.31,0.5,2
11663,12.51,27.23,0.66,3.588,0.092,131.7,5.4,11.3,1.3,5190,...,0.72,1.21,0.17,12.06,0.73,1.18,0.16,11.88,0.72,2
148,11.51,81.28,1.06,6.955,0.045,37.6,1.5,30.4,1.9,4978,...,0.23,2.02,0.13,9.11,0.22,2.12,0.14,9.33,0.23,2
8069,12.83,49.07,0.66,4.899,0.022,74.7,3.0,17.4,0.9,4766,...,0.25,1.59,0.1,10.49,0.24,1.76,0.11,11.04,0.26,1


In [75]:
df = upload.copy()

## column removal

In [39]:
col_df = df.copy()

columns that start in *e_* describe the error in measurement, and are irrelevant for the calculation process.

In [40]:
col_df.drop(
    columns=df.columns[
        df.columns.str.contains('e_')],
    inplace=True)

col_df.columns

Index(['kpmag', 'numax', 'delnu', 'a', 'width', 'teff', 'log(g)', '[fe/h]',
       'nocorm', 'nocorr', 'rgbcorm', 'rgbcorr', 'clcorm', 'clcorr', 'phase'],
      dtype='object')

the mass and radius corrections can be averaged

In [41]:
col_df[col_df.columns[
        col_df.columns.str.contains('cor')]
          ].sample(3)

Unnamed: 0,nocorm,nocorr,rgbcorm,rgbcorr,clcorm,clcorr
3351,1.69,10.97,1.6,10.66,1.73,11.09
12080,1.14,3.89,1.14,3.89,1.15,3.91
15361,1.38,7.51,1.36,7.43,1.42,7.62


In [42]:
col_df['rad'] = col_df.nocorr + col_df.rgbcorr + col_df.clcorr
col_df['mass'] = col_df.nocorm + col_df.rgbcorm + col_df.clcorm

col_df.drop(
    columns=col_df.columns[
        col_df.columns.str.contains('cor')],
    inplace=True)

col_df.sample(10)

Unnamed: 0,kpmag,numax,delnu,a,width,teff,log(g),[fe/h],phase,rad,mass
7154,11.65,33.83,3.952,76.0,14.3,5302,2.459,-0.92,2,37.07,4.81
2342,13.56,75.07,7.056,56.0,25.4,4835,2.785,-0.35,1,24.04,4.29
14547,12.2,183.99,14.456,38.9,52.8,4888,3.177,-0.08,1,14.25,3.71
15149,13.11,12.62,1.875,221.4,5.1,4360,1.988,-0.25,1,53.62,3.4
1010,11.51,82.9,6.795,34.7,30.4,5062,2.838,0.12,2,30.11,7.59
11643,11.08,32.6,3.99,109.8,13.0,4893,2.425,-0.31,2,33.03,3.53
1418,11.63,38.92,4.165,88.0,16.6,4947,2.505,0.04,2,36.69,5.23
3044,12.94,34.92,3.938,91.2,13.9,5035,2.461,0.15,2,37.34,4.91
14131,13.06,28.57,3.882,146.5,12.5,5190,2.381,-0.26,2,32.08,3.01
9680,13.35,36.17,4.333,114.3,12.2,5152,2.482,-0.62,1,32.26,3.83


renaming and reordering columns

In [43]:
col_df.rename(
    {
        'kpmag': 'ap_mag',  # apparent magnitude (brightness)
        'numax': 'freq',  # maximum oscillation frequency
        'delnu': 'fr_sep',  # frequency separation of oscillation modes
        'a': 'amp',  # oscillation amplitude
        'width': 'pow_ex',  # power excess width
        'teff': 'temp',  # effective temperature
        'log(g)': 'grav', # surface gravity logarithm
        '[fe/h]': 'metal',  # metallicity
    },
axis='columns',
inplace=True)

In [44]:
col_df = col_df[['phase'] + list(col_df.columns.drop('phase'))]
col_df.columns

Index(['phase', 'ap_mag', 'freq', 'fr_sep', 'amp', 'pow_ex', 'temp', 'grav',
       'metal', 'rad', 'mass'],
      dtype='object')

In [80]:
df = col_df.copy()

## nulls

In [81]:
null_df = df.copy()

In [82]:
null_df.isnull().sum()

phase     0
ap_mag    0
freq      0
fr_sep    0
amp       0
pow_ex    0
temp      0
grav      0
metal     0
rad       0
mass      0
dtype: int64

that's not true, and the dtypes hold a hint:

In [83]:
null_df.dtypes

phase       int64
ap_mag    float64
freq      float64
fr_sep    float64
amp        object
pow_ex     object
temp        int64
grav      float64
metal     float64
rad       float64
mass      float64
dtype: object

apparently there are spaces in certain expressions and empty strings instead of nulls

In [84]:
print('vals with spaces:', null_df.amp.str.contains(' ').sum())
null_df['amp'] = df.amp.str.replace(' ', ''
                  ).replace('', np.nan
                           ).astype(float)
print('nulls post cleanup:', null_df.amp.isnull().sum())

vals with spaces: 9185
nulls post cleanup: 537


In [85]:
print('vals with spaces:', null_df.pow_ex.str.contains(' ').sum())
null_df['pow_ex'] = df.pow_ex.str.replace(' ', ''
                  ).replace('', np.nan
                           ).astype(float)
print('nulls post cleanup:', null_df.pow_ex.isnull().sum())

vals with spaces: 15387
nulls post cleanup: 537


In [101]:
null_df.dtypes

phase       int64
ap_mag    float64
freq      float64
fr_sep    float64
amp       float64
pow_ex    float64
temp        int64
grav      float64
metal     float64
rad       float64
mass      float64
dtype: object

In [111]:
tot_nulls = len(null_df[null_df.isnull().any(axis='columns')])

print(f'{tot_nulls/len(null_df):.2%}')

3.49%


totally negeligible

In [119]:
null_df.dropna(inplace=True)

and what about zeros?

In [120]:
null_df[null_df == 0].any(axis='columns').sum()

0

In [121]:
df = null_df.copy()

## export

In [122]:
export_df = df.copy()

change phase names for clarity

In [123]:
export_df.phase.replace({
    1: 'RGB', # Red Giant Branch
    2: 'HeB', # Helium Burning Phase
}, inplace=True)
export_df.phase.value_counts()

HeB    7703
RGB    7148
Name: phase, dtype: int64

In [124]:
export_df.to_csv('clean_data.csv', index=False)