<hr style="border:1px solid black"> </hr>

# Imports

In [1]:
# For Data Import
import glob
import os

# For Data Wrangling and Visualizing
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# For Reading FITS Data From NASA
from astropy.io import fits
import matplotlib.pyplot as plt

# For Processing the huge amount of Light Curve Data
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial
import pandas.testing as pdt
from __future__ import print_function
# import swifter
# import mapply

import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from multiprocessing import Pool, RLock, freeze_support
from random import random
from threading import RLock as TRLock
from tqdm.notebook import tqdm, trange, tqdm_notebook
from tqdm.contrib.concurrent import process_map
# from progressbar import progressbar
import time 
tqdm.pandas()

<hr style="border:.3px solid grey"> </hr>

## Load in FITS Light Curve URLs
Here we will load in and clean up the wget links for each FITS lightcurve file for each K2 campaign. Each link represents a time series of the flux values observed. The imported "astropy.io" can read these links and return the useful data for model access.

NB: 
- Campaign 9 is notably missing, as that data has not been made publicly available through the Mikulski Archive. This unfortunately will reduce the number of "confirmed/candidate" target light curve data points to pick from.
- Campaigns 10a and 10b & 11a and 11b will be merged into Campaign 10 and Campaign 11, respectively, as no distinction is made in the K2 Targets

#### Dataframe from All Light Curve Files
Read in and concatenate all files in the "wgets" folder using glob to parametrically generate a list of file path names. 

In [2]:
light_curves = pd.concat([pd.read_fwf(file, header=None, names=[0, 1, "Link"]) for file in glob.glob(os.path.join(".\Data_Dump\wgets\Light_Curves\\" , "*.txt"))], ignore_index=True).drop([0,1], axis=1)

#### Build a Campaign, EPIC ID, and Cadence Column for Merging and Filtering Later
A lot of unique identifying information is extracted from each of the FITS link strings.

In [3]:
light_curves["Campaign"] = light_curves["Link"].str.extract("(?:s/c)(\d{1,2})").astype(int)

In [4]:
light_curves["EPIC"] = light_curves["Link"].str.extract("(?:ktwo)(\d+)").astype(int)

In [5]:
light_curves["Cadence"] = light_curves["Link"].str[-8:-5]

#### Filter out Short Cadence data

In [6]:
light_curves = light_curves.loc[light_curves.Cadence == "llc"]

#### Set the EPIC IDs and Campaign No.s as a multi-index
While neither are unique identifiers on their own, the combination of the two IDs is unique and will be usefull for merging the links to the K2 Targets data frame 

In [7]:
light_curves = light_curves.set_index(["EPIC", "Campaign"])

<hr style="border:.3px solid grey"> </hr>

## Load in Kepler Survey Disposition Labels

In [2]:
labels_k1 = pd.concat([pd.read_csv(file, header=0, names=["KepID", "2MASS", "Label"], skiprows=1) 
                       for file in glob.glob(os.path.join(".\Data_Dump\Kepler_Labels\\", "*.txt"))], ignore_index=True)

#### Split the Label column into individual classes and Drop

In [3]:
labels_k1["K1_False_Positive"] = np.where(labels_k1["Label"].str.contains("False_Positive", case=False), 1, 0)
labels_k1["K1_Confirmed"] = np.where(labels_k1["Label"].str.contains("Exoplanet", case=False), 1, 0)
labels_k1["K1_Candidate"] = np.where(labels_k1["Label"].str.contains("Planetary_candidate", case=False), 1, 0)
labels_k1["K1_Binary"] = np.where(labels_k1["Label"].str.contains("Eclipsing_binary", case=False), 1, 0)
labels_k1["K1_Giant"] = np.where(labels_k1["Label"].str.contains("Red_giant", case=False), 1, 0)

In [4]:
labels_k1 = labels_k1.drop(["Label"], axis=1)

<hr style="border:.3px solid grey"> </hr>

## Load in K2 Survey Disposition Labels

In [5]:
names_candidate_data = ['loc_rowid', 'EPIC', '2MASS', 'EPIC.Name', 'pl_name',
       'k2c_refdisp', 'k2c_reflink', 'Label', 'Campaign']
columns_candidate_data = ['EPIC', 'Label', 'Campaign', 'EPIC.Name']
labels_k2 = pd.read_csv("./Data_Dump/K2-Candidates.txt", header=0, names=names_candidate_data, 
                        usecols=columns_candidate_data)

In [6]:
labels_k2["K2_False_Positive"] = np.where(labels_k2["Label"].str.contains("False Positive", case=False), 1, 0)
labels_k2["K2_Confirmed"] = np.where(labels_k2["Label"].str.contains("confirmed", case=False), 1, 0)
labels_k2["K2_Candidate"] = np.where(labels_k2["Label"].str.contains("candidate", case=False), 1, 0)

In [7]:
labels_k2 = labels_k2.drop(["Label"], axis=1)

In [8]:
labels_k2 = labels_k2[labels_k2["Campaign"].notna()]

In [9]:
labels_k2["EPIC"] = labels_k2["EPIC"].str.extract("(\d+)").astype(int)
labels_k2["Campaign"] = labels_k2["Campaign"].astype(int)

In [10]:
labels_k2["Event"] = labels_k2["EPIC.Name"].str[-2:].astype(int)

In [None]:
candidates = pd.DataFrame({'EPIC': [x for x in labels_k2.EPIC.unique()], 
                           'count': [np.sort(labels_k2.loc[labels_k2.EPIC==x].Event.unique())[-1] for x in labels_k2.EPIC.unique()]}).set_index(['EPIC'])

In [None]:
sns.histplot(candidates)
print(candidates.value_counts(normalize=True)*100)
labels_k2.value_counts(subset=['K2_False_Positive', 'K2_Confirmed', 'K2_Candidate'], normalize=True)*100

In [13]:
labels_k2 = labels_k2.set_index(['EPIC', 'Campaign'])

In [None]:
labels_k2.head()

In [14]:
labels_k2.xs(206358352)

Unnamed: 0_level_0,EPIC.Name,K2_False_Positive,K2_Confirmed,K2_Candidate,Event
Campaign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,EPIC 206358352.01,0,0,1,1


In [None]:
# pd.DataFrame({'index' : (x for x in merge_labels_k2.index.unique)})
# merge_k2 = pd.DataFrame({'index' : (x for x in labels_k2.index.unique())})
# merge_k2[['EPIC', 'Campaign']] = pd.DataFrame(merge_k2['index'].tolist(), index=merge_k2.index)
# merge_k2.drop(['index'], axis=1, inplace=True)
# merge_k2.set_index(['EPIC', 'Campaign'], inplace=True)
# merge_k2[['Confirmed', 'Candidate', 'False_Positive']] = labels_k2[x] for x in merge_k2.index

<hr style="border:.3px solid grey"> </hr>

## Load in Target Data
Here we will load in the ~400,000 targets described in the K2 Target Dataset and filter out non-stellar targets. Each row describes an object in the Ecliptic Plane Input Catalogue *(EPIC)* flagged for Transit Analysis by the Guest Observer Program. Targets observed across multiple campaigns will appear as multiple rows. For the purposes of this project the calibration and test field targets (Campaigns E and 0 respectively) will be dropped as these are by non-stellar targets such as asteroids, local planets, galaxies, or photometric artifacts to be calibrated out of future observation campaigns. Additionally, we will be removing targets with Object Type "Extended" and "Null", as these represent non-stellar observations part of the extended mission.

Edit: Targets from Campaign 11 will also be dropped due to the ~14000 duplicate light curve observations and lack of any flagged transit events. Reviewing the press on the particular campaign, the observations were focused on the core of the Milky Way galaxy and not orbital transit candidates. 

#### Create a dataframe of every EPIC Target across every Campaign

In [8]:
names_target_data = ["EPIC", "2MASS", "Campaign", "Obj_Type", "rastr",
                     "decstr","k2_propid","Distance","k2_disterr1","k2_disterr2",
                     "k2_teff","k2_tefferr1","k2_tefferr2","Stellar_Radius","k2_raderr1",
                     "k2_raderr2","Stellar_Mass","k2_masserr1","k2_masserr2","k2_kepmag",
                     "k2_kepmagerr","k2_kepmagflag","k2_vjmag","k2_vjmagerr","k2_kmag","k2_kmagerr"]
columns_target_data = ["EPIC", "2MASS", "Campaign", "Obj_Type"]
target_data = pd.read_csv("./Data_Dump/K2-Targets.txt", header=0,names=names_target_data, usecols=columns_target_data)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


#### Filter Unwanted Targets and Fixing DTypes

In [9]:
target_data=target_data[~target_data["Campaign"].isin(["E", "11", 11])]
target_data["Campaign"] = target_data["Campaign"].astype(int)
target_data=target_data[~target_data["EPIC"].isin([";"])]

In [10]:
target_data["EPIC"]=target_data["EPIC"].astype(int)

#### Set multi-index to unique EPIC ID and Campaign Descriptor (similar to Light Curve Dataframe index structure)

In [11]:
target_data=target_data.set_index(["EPIC", "Campaign"])

## Merge Light Curve FITS links to each Target
**Note:** Not all K2 EPIC targets ended up having usable observation data. These rows will be dropped.

In [12]:
target_light_curves = target_data.merge(light_curves, left_index=True, right_index=True)

In [None]:
(target_light_curves.isnull().sum()/target_light_curves.shape[0]*100).to_frame(name='Percent Null')

In [None]:
target_light_curves

In [13]:
def get_flux_data (link):
    global pbar
    pbar.update(1)
    with fits.open(link, mode="readonly", cache=False, lazy_load_hdus=True) as hdulist:
        pdcsap_fluxes = list(hdulist[1].data['PDCSAP_FLUX'])
        return pdcsap_fluxes

In [14]:
target_light_curves_dict = {key : target_light_curves.xs(key, level=1) for key in set(target_light_curves.index.get_level_values('Campaign'))}

In [None]:
for x in list(target_light_curves_dict.keys()):
    with tqdm(total=len(target_light_curves_dict[x]['Link'])) as pbar:
        target_light_curves_dict[x]['Flux']=target_light_curves_dict[x].apply(lambda row: get_flux_data(row.Link), axis=1)
#         target_light_curves_dict[x]['Flux']=np.vectorize(get_flux_data)(target_light_curves_dict[x]['Link'])
    import gc
    gc.collect()

  0%|          | 0/7699 [00:00<?, ?it/s]

In [None]:
np.vectorize(get_flux_data)(target_light_curves_dict[1].loc[201150515, 'Link'])

In [None]:
import gc
gc.collect()

In [None]:
#!/usr/bin/env python
import psutil
# gives a single float value
psutil.cpu_percent()
# gives an object with many fields
psutil.virtual_memory()
# you can convert that object to a dictionary 
dict(psutil.virtual_memory()._asdict())

In [None]:
list(set(len(target_light_curves_dict[y].iloc[x].Flux_Len) for x in range(target_light_curves_dict[0].shape[0])) for y in range(5))

In [None]:
list(next((i for i, z in enumerate(target_light_curves_dict[1].iloc[x].Flux_Len[::-1]) if z != 0), None) for x in range(target_light_curves_dict[1].shape[0]))

In [None]:
list(np.count_nonzero(target_light_curves_dict[0].iloc[x].Flux_Len)+next((i for i, z in enumerate(target_light_curves_dict[0].iloc[x].Flux_Len) if z != 0), None)-len(target_light_curves_dict[0].iloc[x].Flux_Len) for x in range(target_light_curves_dict[0].shape[0]))

In [None]:
list(np.count_nonzero(target_light_curves_dict[1].iloc[x].Flux_Len)-len(target_light_curves_dict[1].iloc[x].Flux_Len) for x in range(target_light_curves_dict[1].shape[0]))

In [None]:
def parallelize(data, func, num_of_processes=os.cpu_count()):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

In [None]:
print(os.cpu_count()-1)

In [None]:
for x in tqdm_notebook(list(target_light_curves_dict.keys()), desc="Overall"):
    target_light_curves_dict[x]['Flux'] = 0

In [None]:
target_light_curves_dict[0]

In [None]:
for df in tqdm_notebook(target_light_curves_dict.items(), desc="Overall"):
    print(type(df[1]))
    df[1].Flux = parallelize_on_rows(df[1].Link, get_flux_data)

In [None]:
for df in target_light_curves_dict.items():
    print(type(df[1]))

In [None]:
target_light_curves_dict.items()

In [None]:
parallelize_on_rows(target_light_curves['Link'], get_flux_data)

In [None]:
def parallelize_dataframe(df, func):
    num_processes = mp.cpu_count()-2
    df_split = np.array_split(df, num_processes)
    with mp.Pool(num_processes) as p:
        df = pd.concat(p.map(func, df_split))
    return df

def parallelize_function(df):
    df['Flux'] = df['Link'].apply(get_flux_data, axis=1)
    return df

In [None]:
target_light_curves = tqdm(parallelize_dataframe(target_light_curves, parallelize_function), total=target_light_curves.shape[0])

In [None]:
target_light_curves['Flux'] = 

In [None]:

if __name__ == '__main__':
    def get_flux_data (row):
        link = row.Link
        with fits.open(link, mode="readonly") as hdulist:
            pdcsap_fluxes = hdulist[1].data['PDCSAP_FLUX']
            return {row.index : pdcsap_fluxes}
    r = process_map(get_flux_data, target_light_curves, max_workers=10, chunksize=1)

In [None]:
len(set(target_light_curves.index.get_level_values(1)))

In [None]:
Edit -> nbextensions config