# Search for the Goldilocks Zone <br>
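We will first apply our model to planets that have no measured planetary radius, predict a radius for each of them, and look for planets in the 0.25 - 3 Earth radius range (note: the filter used below keeps every positive prediction under 3 Earth radii). 
To check whether the model works, we will then run it on our own data, which does have measured radii, and compare the predictions with the original planetary radius values to see how accurate it is. We are going with the Lasso model.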

In [1]:
import numpy as np
import pandas as pd
import pickle

### Use Lasso Model on other data

In [2]:
# Load the full exoplanet table from disk. The context manager closes the file
# handle as soon as the load finishes instead of leaving it open for the whole
# kernel session.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("exoplanet_data.pickle", "rb") as pickle_in:
    exoplanet_data_full = pickle.load(pickle_in)

In [3]:
def _load_model(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed once the
    object has been read.
    NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# The three previously saved regression models; only the Lasso model is used
# for the predictions in this notebook.
loaded_linear = _load_model('Linear_model.sav')
loaded_lasso = _load_model('Lasso_model.sav')
loaded_ridge = _load_model('Ridge_model.sav')

In [4]:
# Keep only the planets that have no recorded radius.
# These columns are discarded first, so missing values in them do not cause
# rows to be thrown away by dropna() further down.
unused_columns = ['MASS_SINI', 'SEMI_MAJOR_AXIS', 'OMEGA', 'TIME_PERIAPSE', 'INCLINATION', 'STAR_AGE']
trimmed = exoplanet_data_full.drop(columns=unused_columns)

# True where PLANETARY_RADIUS is missing.
radius_filter = trimmed['PLANETARY_RADIUS'].isnull()

all_data = (
    trimmed[radius_filter]
    .drop(columns='PLANETARY_RADIUS')   # all-null after the filter, so no longer useful
    .dropna()                           # every remaining column must be populated
    .assign(STMdivPM=lambda frame: frame['STAR_MASS'] / frame['MASS'])  # STAR_MASS / MASS ratio
)

In [5]:
#Other filters
# Trim extreme values: for each column in turn, keep only the rows strictly
# below that column's 99.9th percentile. The cutoff is recomputed on the
# already-trimmed frame at every step, matching the original sequential filters.
#
# The quantile is taken on the single column rather than on the whole frame:
# all_data still contains text columns (NAME, DETECTION_TYPE) at this point, and
# DataFrame.quantile raises on those in pandas >= 2.0, where numeric_only
# defaults to False.
for column in ['ORBITAL_PERIOD', 'ECCENTRICITY', 'STAR_RADIUS']:
    cutoff = all_data[column].quantile(.999)
    filter_outlier = all_data[column] < cutoff
    all_data = all_data[filter_outlier]

In [6]:
#formatting dataframe to merge with earth radius
# Numeric-only view of the data, passed to the model's predict() later on.
no_categorical = all_data.drop(columns = ['NAME','DETECTION_TYPE'])

# Same rows with a fresh 0..n-1 index, so they line up positionally with the
# prediction frame when the two are concatenated. drop=True discards the old
# index directly instead of materialising it as a column and deleting it again,
# and keeps working if the index happens to be named.
all_data2 = all_data.reset_index(drop=True)

In [7]:
# Preview the first rows of the re-indexed frame.
all_data2.head()

Unnamed: 0,NAME,MASS,ORBITAL_PERIOD,ECCENTRICITY,DETECTION_TYPE,STAR_MASS,STAR_TEMP,STAR_RADIUS,STAR_METALLICITY,STMdivPM
0,11 Com b,19.4,326.03,0.231,Radial Velocity,2.7,4742.0,19.0,0.35,0.139175
1,11 UMi b,10.5,516.22,0.08,Radial Velocity,1.8,4340.0,24.08,0.04,0.171429
2,14 And b,5.33,185.84,0.0,Radial Velocity,2.2,4813.0,11.0,0.24,0.412758
3,14 Her b,4.64,1773.4,0.369,Radial Velocity,0.9,5311.0,0.708,0.43,0.193966
4,16 Cyg B b,1.68,799.5,0.689,Radial Velocity,1.01,5766.0,0.98,0.08,0.60119


In [8]:
#Store predicted Radius values data without planetary radius
# NOTE(review): the 11.2 factor presumably converts Jupiter radii to Earth
# radii -- confirm the unit the model was trained on.
earth_radii = pd.DataFrame(loaded_lasso.predict(no_categorical) * 11.2, columns = ['EARTH_RADIUS'])

#merge this data with dataframe with names
# Both frames carry a default 0..n-1 index, so concat pairs them row by row.
goldilocks_data = pd.concat([all_data2, earth_radii], axis = 1)
cols = ['NAME', 'EARTH_RADIUS','MASS', 'ORBITAL_PERIOD', 'ECCENTRICITY', 
        'DETECTION_TYPE','STAR_MASS', 'STAR_TEMP', 'STAR_RADIUS', 'STAR_METALLICITY', 'STMdivPM']
goldilocks_data = goldilocks_data[cols]

# Inspect the merged frame before filtering. display() (provided by
# IPython/Jupyter) is needed because a bare expression in the middle of a cell
# is not rendered.
display(goldilocks_data.head(20))
goldilocks_data.info()

#filter out the goldilocks zone planets!
# Keep predictions strictly between 0 and 3.
goldilocks_data = goldilocks_data[(goldilocks_data['EARTH_RADIUS']<3) & (goldilocks_data['EARTH_RADIUS'] > 0)]

In [13]:
# Preview the first rows that passed the radius filter.
goldilocks_data.head()

Unnamed: 0,NAME,EARTH_RADIUS,MASS,ORBITAL_PERIOD,ECCENTRICITY,DETECTION_TYPE,STAR_MASS,STAR_TEMP,STAR_RADIUS,STAR_METALLICITY,STMdivPM
32,BD+49 828 b,0.82649,1.6,2590.0,0.35,Radial Velocity,1.52,4943.0,7.6,0.19,0.95
33,BD-11 4672 b,2.464223,0.53,1667.0,0.05,Radial Velocity,0.571,4475.0,0.52,0.48,1.077358
37,GJ 1132 c,2.484371,0.00831,8.929,0.27,Radial Velocity,0.181,3270.0,0.207,0.12,21.780987
39,GJ 15A b,1.959595,0.00953,11.4407,0.094,Radial Velocity,0.375,3567.0,0.3863,0.32,39.349423
41,GJ 179 c,2.711242,0.0154,3.4798,0.04,Radial Velocity,0.357,3370.0,0.38,0.3,23.181818


In [14]:
# Summarise the filtered frame (row count, columns, dtypes).
goldilocks_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 32 to 512
Data columns (total 11 columns):
NAME                42 non-null object
EARTH_RADIUS        42 non-null float64
MASS                42 non-null float64
ORBITAL_PERIOD      42 non-null float64
ECCENTRICITY        42 non-null float64
DETECTION_TYPE      42 non-null object
STAR_MASS           42 non-null float64
STAR_TEMP           42 non-null float64
STAR_RADIUS         42 non-null float64
STAR_METALLICITY    42 non-null float64
STMdivPM            42 non-null float64
dtypes: float64(9), object(2)
memory usage: 3.9+ KB


**Success! 42 planets without measured planetary radii are predicted to have habitable radii according to our Lasso model with alpha = 1.** <br>
Let's see whether our model can at least accurately identify which planets are in the goldilocks zone for the Transit data.

### Checking Model's Categorization on Original Data

In [28]:
# Load the data set that still contains the measured PLANETARY_RADIUS values.
# The context manager closes the file handle once the load finishes.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("final_data.pkl", "rb") as pickle_in:
    final_data = pickle.load(pickle_in)

In [29]:
# Copy of final_data without its two text columns (NAME, DETECTION_TYPE).
validation_data = final_data.drop(['NAME', 'DETECTION_TYPE'], axis = 1)

In [30]:
#cleaning dataframe to merge with earth radius
# Everything except the measured radius, with a fresh 0..n-1 index so the rows
# line up positionally with the prediction frame built in the next cell.
# drop=True discards the old index directly instead of turning it into an
# 'index' column that then has to be removed again.
X2 = final_data.drop('PLANETARY_RADIUS', axis = 1).reset_index(drop=True)

In [31]:
#Store predicted Radius values data without planetary radius
# `X` was never defined anywhere in this notebook, so this cell failed on a
# fresh kernel. Build the feature matrix explicitly from X2, using the same
# columns, in the same order, as the `no_categorical` frame the model was
# applied to earlier. Taking it from X2 also guarantees the predictions share
# X2's row order for the concat below.
# NOTE(review): confirm these are the features the Lasso model was trained on.
feature_cols = ['MASS', 'ORBITAL_PERIOD', 'ECCENTRICITY', 'STAR_MASS',
                'STAR_TEMP', 'STAR_RADIUS', 'STAR_METALLICITY', 'STMdivPM']
X = X2[feature_cols]

# NOTE(review): the 11.2 factor presumably converts Jupiter radii to Earth
# radii -- confirm the unit the model was trained on.
earth_radii = pd.DataFrame(loaded_lasso.predict(X) * 11.2, columns = ['EARTH_RADIUS'])

#merge this data with dataframe with names
test_goldilocks = pd.concat([X2, earth_radii], axis = 1)

cols = ['NAME', 'EARTH_RADIUS','MASS', 'ORBITAL_PERIOD', 'ECCENTRICITY', 'DETECTION_TYPE','STAR_MASS', 'STAR_TEMP', 'STAR_RADIUS', 'STAR_METALLICITY', 'STMdivPM']
test_goldilocks = test_goldilocks[cols]

In [32]:
# Narrow the predictions down to the goldilocks candidates: rows whose
# EARTH_RADIUS is strictly greater than 0 and strictly less than 3.
predicted_radius = test_goldilocks['EARTH_RADIUS']
in_zone = (predicted_radius < 3) & (predicted_radius > 0)
test_goldilocks = test_goldilocks[in_zone]

# Summary of what is left (row count, columns, dtypes).
test_goldilocks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 41 to 358
Data columns (total 11 columns):
NAME                12 non-null object
EARTH_RADIUS        12 non-null float64
MASS                12 non-null float64
ORBITAL_PERIOD      12 non-null float64
ECCENTRICITY        12 non-null float64
DETECTION_TYPE      12 non-null object
STAR_MASS           12 non-null float64
STAR_TEMP           12 non-null float64
STAR_RADIUS         12 non-null float64
STAR_METALLICITY    12 non-null float64
STMdivPM            12 non-null float64
dtypes: float64(9), object(2)
memory usage: 1.1+ KB


In [33]:
# Show every planet the model placed inside the radius range.
test_goldilocks

Unnamed: 0,NAME,EARTH_RADIUS,MASS,ORBITAL_PERIOD,ECCENTRICITY,DETECTION_TYPE,STAR_MASS,STAR_TEMP,STAR_RADIUS,STAR_METALLICITY,STMdivPM
41,GJ 1132 b,1.418803,0.00522,1.628931,0.0,Primary Transit,0.181,3270.0,0.207,0.12,34.67433
42,GJ 1214 b,2.93096,0.020325,1.580405,0.27,Primary Transit,0.15,3026.0,0.216,0.39,7.379954
47,GJ 9827 d,2.907854,0.012,6.201472,0.0,Primary Transit,0.659,4255.0,0.651,0.28,54.916667
238,Kepler-100 d,2.600794,0.0094,35.33313,0.38,Primary Transit,1.109,5825.0,1.5131,0.02,117.978723
243,Kepler-109 c,0.088739,0.00698,21.22262,0.03,Primary Transit,1.069,5952.0,1.339,0.08,153.151862
292,Kepler-445 d,2.576241,0.011,8.15275,0.0,Primary Transit,0.18,3157.0,0.21,0.19,16.363636
294,Kepler-446 c,2.570031,0.009,3.036179,0.0,Primary Transit,0.22,3359.0,0.24,0.3,24.444444
295,Kepler-446 d,2.769543,0.01,5.148921,0.0,Primary Transit,0.22,3359.0,0.24,0.3,22.0
321,LHS 1140 c,1.720046,0.00569,3.777931,0.0,Primary Transit,0.146,3131.0,0.186,0.24,25.659051
352,TOI-270 d,2.706902,0.017,11.38014,0.0,Primary Transit,0.4,3386.0,0.38,0.17,23.529412


In [34]:
# Ground truth: build EARTH_RADIUS from the measured PLANETARY_RADIUS with the
# same 11.2 scale factor applied to the predictions, then keep the rows that
# fall strictly between 0 and 3. final_data itself is left untouched
# (assign returns a new frame).
cols = ['NAME', 'EARTH_RADIUS','MASS', 'ORBITAL_PERIOD', 'ECCENTRICITY', 'DETECTION_TYPE','STAR_MASS', 'STAR_TEMP', 'STAR_RADIUS', 'STAR_METALLICITY', 'STMdivPM']
actual_goldilocks = (
    final_data
    .assign(EARTH_RADIUS=final_data['PLANETARY_RADIUS'] * 11.2)
    .drop('PLANETARY_RADIUS', axis = 1)
    [cols]
)
measured_radius = actual_goldilocks['EARTH_RADIUS']
actual_goldilocks = actual_goldilocks[(measured_radius < 3) & (measured_radius > 0)]

In [35]:
# Summarise the planets whose measured radius falls in the range.
actual_goldilocks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75 entries, 52 to 3808
Data columns (total 11 columns):
NAME                75 non-null object
EARTH_RADIUS        75 non-null float64
MASS                75 non-null float64
ORBITAL_PERIOD      75 non-null float64
ECCENTRICITY        75 non-null float64
DETECTION_TYPE      75 non-null object
STAR_MASS           75 non-null float64
STAR_TEMP           75 non-null float64
STAR_RADIUS         75 non-null float64
STAR_METALLICITY    75 non-null float64
STMdivPM            75 non-null float64
dtypes: float64(9), object(2)
memory usage: 7.0+ KB


In [36]:
# Names of the predicted goldilocks planets that also appear among the planets
# whose measured radius is in range. The comparison is by NAME because the two
# frames carry different indexes.
# A vectorised isin() replaces the iterrows() loop, which rescanned
# actual_goldilocks for every row and relied on positional `row[0]` access on a
# label-indexed Series (deprecated in pandas 2.x).
name_is_confirmed = test_goldilocks['NAME'].isin(actual_goldilocks['NAME'])
matchlist = test_goldilocks.loc[name_is_confirmed, 'NAME'].tolist()
matchlist

['GJ 1132 b',
 'GJ 1214 b',
 'GJ 9827 d',
 'Kepler-100 d',
 'Kepler-109 c',
 'Kepler-445 d',
 'Kepler-446 c',
 'Kepler-446 d',
 'LHS 1140 c',
 'TOI-270 d',
 'TRAPPIST-1 c',
 'TRAPPIST-1 g']

In [37]:
# Number of predicted planets confirmed by the measured radii.
len(matchlist)

12

### Success! Sort of <br>
Our model identified 12 planets as being in the goldilocks zone, and all 12 were actually in the zone! However, that is only 12 of the 75 planets that really are in the zone. So, no false positives at least?