In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Global seaborn look for the notebook: dark grid background, pastel colour cycle.
sns.set_style(style="darkgrid")
sns.set_palette(palette="pastel")

In [3]:
# Base directory that every relative path in this notebook is built from.
PATH = "./"

In [6]:
# CSV inputs, all resolved relative to PATH.
# The two crossmatch files share a parent directory, so build that prefix once.
_des_sdss_overlap_dir = PATH + '../../src/des/des_sdss_overlap/'

high_certainty_path = PATH + '../../data/sdss-galaxyzoo/high_certainty/merged_dataset.csv'
high_prob_crossmatch_path = (
    _des_sdss_overlap_dir + 'high_prob_overlap/high_prob_crossmatch_merged.csv'
)
full_overlap_crossmatch_path = (
    _des_sdss_overlap_dir + 'full_overlap/full_overlap_crossmatch_merged.csv'
)

In [7]:
# Read the three CSV inputs; the order of the paths matches the order of the names.
high_certainty_df, crossmatch_df, full_overlap_crossmatch_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        high_certainty_path,
        high_prob_crossmatch_path,
        full_overlap_crossmatch_path,
    )
]

In [8]:
# Number of rows in the high-certainty table.
high_certainty_df.shape[0]

40620

In [9]:
# Number of rows in the high-probability crossmatch table.
crossmatch_df.shape[0]

1366

# Training + Validation Set   

In [10]:
from sklearn.model_selection import train_test_split

#### 1. Exclude the objects in crossmatch_df from high_certainty_df (matched on OBJID)

In [11]:
# Flag every row whose OBJID also appears as an SDSS_OBJID in the crossmatch table,
# then keep only the rows that are NOT flagged.
in_crossmatch = high_certainty_df['OBJID'].isin(crossmatch_df['SDSS_OBJID'])
train_df_tmp = high_certainty_df.loc[~in_crossmatch]

In [12]:
# Sanity check on the exclusion step: the number of rows removed from
# high_certainty_df must equal the number of crossmatch rows. A mismatch means the
# SDSS_OBJID values are not in one-to-one correspondence with OBJID rows
# (duplicates, or ids that do not occur in high_certainty_df).
n_removed = len(high_certainty_df) - len(train_df_tmp)
assert n_removed == len(crossmatch_df), (
    'Removed {} rows from high_certainty_df but crossmatch_df has {} rows'.format(
        n_removed, len(crossmatch_df)
    )
)

#### 2. Split the remaining into training and validation sets 

In [13]:
# Hold out a fixed number of rows for validation; the fixed seed keeps the
# shuffled split reproducible between runs.
train_df, val_df = train_test_split(
    train_df_tmp,
    test_size=1000,  # an integer test_size is an absolute number of rows
    shuffle=True,
    random_state=42,
)

In [14]:
# Replace the shuffled row labels left over from the split with a fresh 0..n-1
# index (the old labels are dropped, not kept as a column). Reassigning instead of
# using inplace=True avoids mutating the objects returned by train_test_split.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [15]:
# Report how many rows ended up in each split.
n_train = len(train_df)
n_val = len(val_df)
print('Training set size: ', n_train)
print('Validation set size: ', n_val)

Training set size:  38254
Validation set size:  1000


In [16]:
# Class balance of the training split. A row is counted as a spiral when
# P_CS_DEBIASED is strictly greater than P_EL_DEBIASED and as an elliptical when it
# is strictly smaller; rows where the two are equal fall in neither count.
train_spiral_mask = train_df['P_CS_DEBIASED'] > train_df['P_EL_DEBIASED']
train_elliptical_mask = train_df['P_CS_DEBIASED'] < train_df['P_EL_DEBIASED']
print('Spirals in training set: ', int(train_spiral_mask.sum()))
print('Ellipticals in training set: ', int(train_elliptical_mask.sum()))

Spirals in training set:  19258
Ellipticals in training set:  18996


In [17]:
# Class balance of the validation split. A row is counted as a spiral when
# P_CS_DEBIASED is strictly greater than P_EL_DEBIASED and as an elliptical when it
# is strictly smaller; rows where the two are equal fall in neither count.
val_spiral_mask = val_df['P_CS_DEBIASED'] > val_df['P_EL_DEBIASED']
val_elliptical_mask = val_df['P_CS_DEBIASED'] < val_df['P_EL_DEBIASED']
print('Spirals in validation set: ', int(val_spiral_mask.sum()))
print('Ellipticals in validation set: ', int(val_elliptical_mask.sum()))

Spirals in validation set:  479
Ellipticals in validation set:  521


#### 3. Save the dataframes to deeplearning/data as CSV files

In [18]:
# Write every dataset to its CSV file under ../data (relative to PATH).
# to_csv is called with its defaults, so each frame's index is written as the
# first, unnamed column.
# NOTE(review): if downstream readers do not expect that extra column, pass
# index=False here — confirm against the code that loads these files.
for frame, csv_path in (
    (train_df, PATH + '../data/training_set.csv'),
    (val_df, PATH + '../data/validation_set.csv'),
    (crossmatch_df, PATH + '../data/high_prob_crossmatch_test_set.csv'),
    (full_overlap_crossmatch_df, PATH + '../data/full_overlap_crossmatch_test_set.csv'),
):
    frame.to_csv(csv_path)