# Preprocessing and Data Cleaning

**This notebook is just for reference, there's no need to run this.  
the clean data is saved at the end and uploaded in the main script.**

In [1]:
import pandas as pd
import numpy as np

In [2]:
upload = pd.read_csv('raw_data.csv').rename(str.lower, axis='columns')
upload.sample(10)

Unnamed: 0,kpmag,numax,e_numax,delnu,e_delnu,a,e_a,width,e_width,teff,...,e_nocorr,rgbcorm,e_rgbcorm,rgbcorr,e_rgbcorr,clcorm,e_clcorm,clcorr,e_clcorr,phase
3648,12.76,57.39,0.7,5.327,0.011,48.0,1.8,20.6,1.6,4891,...,0.23,1.94,0.11,10.67,0.22,2.11,0.12,11.11,0.23,1
13079,13.85,161.06,1.75,12.872,0.031,31.8,2.5,56.4,7.3,5243,...,0.13,1.49,0.1,5.47,0.13,1.53,0.1,5.55,0.13,1
1217,11.64,249.68,2.15,18.228,0.034,,,,,5151,...,0.08,1.31,0.07,4.14,0.08,1.38,0.07,4.25,0.08,1
2845,12.59,58.86,0.62,6.11,0.015,92.6,3.9,18.1,0.7,4635,...,0.17,1.03,0.06,7.77,0.16,1.16,0.06,8.25,0.17,1
11472,15.02,137.03,0.88,12.2,0.021,59.2,3.6,36.4,2.7,5178,...,0.11,1.13,0.07,5.19,0.11,1.13,0.07,5.19,0.11,1
10746,13.31,31.76,0.72,4.166,0.064,164.6,8.0,13.8,1.9,4875,...,0.43,0.96,0.11,10.09,0.44,0.93,0.1,9.92,0.43,2
11418,12.89,172.93,1.12,13.918,0.019,37.3,1.2,50.2,2.1,5223,...,0.12,1.36,0.09,5.05,0.12,1.37,0.09,5.07,0.12,1
12792,10.54,74.32,1.95,5.839,0.043,22.2,1.5,24.2,2.5,5145,...,0.42,3.77,0.36,12.89,0.45,3.72,0.36,12.79,0.45,2
6984,12.49,39.53,0.46,4.15,0.013,92.0,4.0,14.9,1.1,4964,...,0.3,1.78,0.12,12.25,0.3,1.92,0.13,12.74,0.31,1
12243,13.57,32.48,1.44,4.094,0.081,131.6,6.7,13.8,1.6,5127,...,0.68,1.17,0.19,10.87,0.69,1.16,0.19,10.83,0.68,2


In [3]:
df = upload.copy()

## column removal

In [4]:
col_df = df.copy()

columns that start in *e_* describe the error in measurement, and are irrelevant for the calculation process.

In [5]:
col_df.drop(
    columns=df.columns[
        df.columns.str.contains('e_')],
    inplace=True)

col_df.columns

Index(['kpmag', 'numax', 'delnu', 'a', 'width', 'teff', 'log(g)', '[fe/h]',
       'nocorm', 'nocorr', 'rgbcorm', 'rgbcorr', 'clcorm', 'clcorr', 'phase'],
      dtype='object')

the mass and radius corrections can be averaged

In [6]:
col_df[col_df.columns[
        col_df.columns.str.contains('cor')]
          ].sample(3)

Unnamed: 0,nocorm,nocorr,rgbcorm,rgbcorr,clcorm,clcorr
9220,1.58,5.49,1.57,5.47,1.6,5.53
6983,1.23,10.82,1.15,10.49,1.24,10.86
7872,1.29,9.61,1.18,9.2,1.3,9.64


In [7]:
col_df['rad'] = col_df.nocorr + col_df.rgbcorr + col_df.clcorr
col_df['mass'] = col_df.nocorm + col_df.rgbcorm + col_df.clcorm

col_df.drop(
    columns=col_df.columns[
        col_df.columns.str.contains('cor')],
    inplace=True)

col_df.sample(10)

Unnamed: 0,kpmag,numax,delnu,a,width,teff,log(g),[fe/h],phase,rad,mass
3650,13.62,33.1,4.057,125.2,13.4,5059,2.439,-0.08,2,33.44,3.74
12822,12.66,18.84,2.505,205.3,7.9,4392,2.164,-0.19,2,45.09,3.61
14788,13.39,45.9,4.98,74.9,17.6,4990,2.578,-0.09,1,30.41,4.26
12709,12.91,29.27,3.86,132.7,13.2,4835,2.376,-0.11,2,31.63,2.9
5734,9.27,37.03,3.859,80.3,15.9,4707,2.472,0.05,2,39.29,5.56
8449,11.04,170.22,13.901,43.0,46.4,4893,3.143,-0.05,1,14.29,3.45
14528,12.46,35.68,4.328,98.7,14.5,5244,2.48,-0.11,2,32.44,3.86
13813,14.951,211.58,16.769,,,5246,3.253,-0.59,1,12.79,3.56
478,11.1,31.32,3.328,77.3,14.2,4937,2.41,0.1,2,46.36,6.71
13228,13.23,38.36,4.215,93.8,14.4,4779,2.491,0.19,2,34.55,4.49


renaming and reordering columns

In [8]:
col_df.rename(
    {
        'kpmag': 'ap_mag',  # apparent magnitude (brightness)
        'numax': 'freq',  # maximum oscillation frequency
        'delnu': 'fr_sep',  # frequency separation of oscillation modes
        'a': 'amp',  # oscillation amplitude
        'width': 'pow_ex',  # power excess width
        'teff': 'temp',  # effective temperature
        'log(g)': 'grav', # surface gravity logarithm
        '[fe/h]': 'metal',  # metallicity
    },
axis='columns',
inplace=True)

In [9]:
col_df = col_df[['phase'] + list(col_df.columns.drop('phase'))]
col_df.columns

Index(['phase', 'ap_mag', 'freq', 'fr_sep', 'amp', 'pow_ex', 'temp', 'grav',
       'metal', 'rad', 'mass'],
      dtype='object')

In [10]:
df = col_df.copy()

## nulls

In [11]:
null_df = df.copy()

In [12]:
null_df.isnull().sum()

phase     0
ap_mag    0
freq      0
fr_sep    0
amp       0
pow_ex    0
temp      0
grav      0
metal     0
rad       0
mass      0
dtype: int64

that's not true, and the dtypes hold a hint:

In [13]:
null_df.dtypes

phase       int64
ap_mag    float64
freq      float64
fr_sep    float64
amp        object
pow_ex     object
temp        int64
grav      float64
metal     float64
rad       float64
mass      float64
dtype: object

apparently there are spaces in certain expressions and empty strings instead of nulls

In [14]:
print('vals with spaces:', null_df.amp.str.contains(' ').sum())
null_df['amp'] = df.amp.str.replace(' ', ''
                  ).replace('', np.nan
                           ).astype(float)
print('nulls post cleanup:', null_df.amp.isnull().sum())

vals with spaces: 9185
nulls post cleanup: 537


In [15]:
print('vals with spaces:', null_df.pow_ex.str.contains(' ').sum())
null_df['pow_ex'] = df.pow_ex.str.replace(' ', ''
                  ).replace('', np.nan
                           ).astype(float)
print('nulls post cleanup:', null_df.pow_ex.isnull().sum())

vals with spaces: 15387
nulls post cleanup: 537


In [16]:
null_df.dtypes

phase       int64
ap_mag    float64
freq      float64
fr_sep    float64
amp       float64
pow_ex    float64
temp        int64
grav      float64
metal     float64
rad       float64
mass      float64
dtype: object

and what about zeros?

In [17]:
null_df[null_df == 0].any(axis='columns').sum()

0

In [18]:
df = null_df.copy()

## export

In [19]:
export_df = df.copy()

change phase names for clarity

In [20]:
export_df.phase.replace({
    1: 'RGB', # Red Giant Branch
    2: 'HeB', # Helium Burning Phase
}, inplace=True)
export_df.phase.value_counts()

HeB    7703
RGB    7685
Name: phase, dtype: int64

In [21]:
export_df.to_csv('clean_data.csv', index=False)