In [None]:
# INSTALLING THE NECESSARY LIBRARIES
!pip install lightkurve
!pip install tslearn 
!pip install pytictoc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightkurve
  Downloading lightkurve-2.0.11-py3-none-any.whl (247 kB)
[K     |████████████████████████████████| 247 kB 13.7 MB/s 
Collecting astroquery>=0.3.10
  Downloading astroquery-0.4.6-py3-none-any.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 44.0 MB/s 
Collecting uncertainties>=3.1.4
  Downloading uncertainties-3.1.7-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.1 MB/s 
Collecting oktopus>=0.1.2
  Downloading oktopus-0.1.2.tar.gz (10 kB)
Collecting memoization>=0.3.1
  Downloading memoization-0.4.0.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 220 kB/s 
Collecting fbpca>=1.0
  Downloading fbpca-1.0.tar.gz (11 kB)
Collecting keyring>=4.0
  Downloading keyring-23.7.0-py3-none-any.whl (34 kB)
Collecting pyvo>=1.1
  Downloading pyvo-1.2.1-py3-none-any.whl (832 kB)
[K     |████████████████████████████████|

In [None]:
# IMPORTING NECESSARY LIBRARIES
import lightkurve as lk
import pandas as pd
import numpy as np
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesResampler
from pytictoc import TicToc

In [None]:
# IMPORT AND PROCESSING THE KEPLER ID DATA
# Numeric labels for the two dispositions that are kept.
LABEL_MAP = {'CONFIRMED': 1, 'FALSE POSITIVE': 0}

koi_table = pd.read_csv('finalData.csv')

# Rows whose disposition contains 'CANDIDATE' are removed. na=True treats a
# missing disposition as a match, so unlabeled rows are removed as well.
is_candidate = koi_table['koi_disposition'].str.contains('CANDIDATE', na=True)

finalData = (
    koi_table[~is_candidate]
    # Keeps only the first row per kepid. CONTROVERSIAL: multi-planet targets
    # collapse to a single entry; a (0, 1, 1+) planet-count classification
    # would need the raw, non-deduplicated rows.
    .drop_duplicates(subset=['kepid'])
    # Encode the label column only; a frame-wide replace would also rewrite
    # any other column that happens to hold the same strings.
    .assign(koi_disposition=lambda frame: frame['koi_disposition'].replace(LABEL_MAP))
)

# `data` is the same table with a fresh 0..n-1 index; `finalData` keeps the
# row labels of the CSV.
data = finalData.reset_index(drop=True)

In [None]:
# BUILDING THE DATASET: ONE FOLDED, BINNED "GLOBAL VIEW" PER KEPLER TARGET
# initial_X collects the binned flux values of each target as a plain list;
# initial_Y collects the matching koi_disposition value, in the same order.
initial_X = []
initial_Y = []
i = 0 # counts successfully processed targets (printed as a progress marker)

for kepid in data['kepid'].iloc[0:1000]:
  t = TicToc()
  t.tic()

  try:
    # FINDING THE PERIOD, T0 AND DURATION FOR PROCESSING
    # Done before the download so a failed catalogue lookup costs no network time.
    row_number = finalData[finalData['kepid'] == kepid].index[0]
    period = finalData.loc[row_number, 'koi_period']
    t0 = finalData.loc[row_number, 'koi_time0bk']
    duration_hours = finalData.loc[row_number, 'koi_duration']

    # DOWNLOADING THE DATA
    KIC = 'KIC ' + str(kepid)
    lcs = lk.search_lightcurve(KIC, author='kepler', cadence='long').download_all()
    if lcs is None:
      # Nothing was downloaded for this target; report it through the common handler.
      raise ValueError('no light curves downloaded')

    # PROCESSING THE LIGHTKURVE DATA
    lc_raw = lcs.stitch()
    lc_clean = lc_raw.remove_outliers(sigma=20, sigma_upper=4)
    # A first fold is used only to locate in-transit cadences, so that they can
    # be masked out while the light curve is flattened.
    temp_fold = lc_clean.fold(period, epoch_time=t0)
    fractional_duration = (duration_hours / 24.0) / period
    phase_mask = np.abs(temp_fold.phase.value) < (fractional_duration * 1.5)
    transit_mask = np.isin(lc_clean.time.value, temp_fold.time_original.value[phase_mask])
    lc_flat, trend_lc = lc_clean.flatten(return_trend=True, mask=transit_mask)
    lc_fold = lc_flat.fold(period, epoch_time=t0)

    # CREATING THE GLOBAL VIEW
    # Shift the normalized flux by -1, then rescale by the magnitude of the
    # minimum binned value: a negative minimum ends up at -1 after the "* 2.0 + 1".
    lc_global = lc_fold.bin(time_bin_size=0.005).normalize() - 1
    lc_global = (lc_global / np.abs(lc_global.flux.min()) ) * 2.0 + 1

    # CONVERTING TO PANDAS DF
    lc_global = lc_global.to_pandas()

    # CREATE LISTS OF X (TIMESERIES DATA) AND Y (LABEL)
    # Both appends happen together so the two lists stay index-aligned.
    initial_X.append(lc_global['flux'].tolist())
    initial_Y.append(finalData.loc[row_number, 'koi_disposition'])

    print(i)
    i += 1

    # toc() prints the elapsed time itself, so there is nothing further to print.
    t.toc()

  except Exception as err:
    # Best effort: report the failing target and move on to the next one.
    # Catching Exception instead of using a bare except keeps a keyboard /
    # kernel interrupt able to stop this long-running loop.
    print('Skipping KIC ' + str(kepid) + ': ' + repr(err))

0
Elapsed time is 28.245444 seconds.
None
1
Elapsed time is 17.261830 seconds.
None
2
Elapsed time is 20.406396 seconds.
None
3
Elapsed time is 20.366598 seconds.
None
4
Elapsed time is 22.578056 seconds.
None
5
Elapsed time is 22.468539 seconds.
None
6
Elapsed time is 18.985899 seconds.
None
7
Elapsed time is 22.182596 seconds.
None
8
Elapsed time is 23.347758 seconds.
None
9
Elapsed time is 22.461542 seconds.
None
10
Elapsed time is 20.737374 seconds.
None
11
Elapsed time is 28.140155 seconds.
None
12
Elapsed time is 24.298293 seconds.
None
13
Elapsed time is 22.404589 seconds.
None


In [None]:
# CLEANING THE COLLECTED CURVES AND BRINGING THEM TO A COMMON LENGTH
# Curves and labels are paired by position, so the two lists must match in length.
assert len(initial_X) == len(initial_Y), 'initial_X and initial_Y are out of step'

# Drop NaN flux values from every curve (NaN is the only value for which x != x).
initial_X = [[value for value in inner_list if value == value] for inner_list in initial_X]

# Curves left empty by the NaN filter are discarded together with their labels.
# The filtered labels go straight into `y`; initial_Y itself is not modified,
# so running this cell again yields the same result.
empty_idx = [idx for idx, curve in enumerate(initial_X) if not curve]
initial_X_2 = [curve for curve in initial_X if curve]
y = np.array([label for curve, label in zip(initial_X, initial_Y) if curve])

# Stack the variable-length curves into one array and resample every curve to
# the length of the second axis of that array.
time_series_X = to_time_series_dataset(initial_X_2)

resampled_time_series_X = TimeSeriesResampler(sz=time_series_X.shape[1]).fit_transform(time_series_X)

# Remove only the trailing axis: a plain np.squeeze would also collapse the
# sample axis when a single curve is left, leaving a 1-D array.
new = np.squeeze(resampled_time_series_X, axis=-1)

In [None]:
# ASSEMBLING THE FINAL TABLE
# One column per resampled time step, named "0", "1", ... as strings.
n_steps = new.shape[1]
numbers_list = list(map(str, range(n_steps)))

# Feature columns first, then the label vector as the last column.
df = pd.DataFrame(new, columns=numbers_list)
df['target'] = y

In [None]:
# SAVING THE DATASET TO GOOGLE DRIVE
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
path = '/content/drive/My Drive/GSOC/WEEK 7/output0_1000.csv'

# Give pandas the path and encoding instead of a pre-opened text handle, so
# pandas opens (and closes) the file with its own newline handling.
df.to_csv(path, encoding='utf-8-sig')