# Due to the poor performance of models trying to predict the discovery method out of 10 methods and difficulties in balancing the dataset, I will change the scope of the question
## Now instead of trying to classify 10 different discovery methods, we will just apply a logistic regression to determine if a given exoplanet was found through TRANSIT (the majority discovery method class)
## This is in response to the heavily imbalanced classes. By combining all discovery methods except the majority class, training a good model and applying SMOTE will be more straightforward, and we can still get insights on the features that have the most weight in an exoplanet being predicted to have been discovered by "transit" or not

In [1]:
import pandas as pd
composite_preprocessed = pd.read_csv('Composite_preprocessed_NO_MV.csv')
pd.options.display.max_columns=None
pd.options.display.max_rows=None
composite_preprocessed.head()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Circumbinary Flag,Discovery Year,Detected by Radial Velocity Variations,Detected by Pulsar Timing Variations,Detected by Pulsation Timing Variations,Detected by Transits,Detected by Astrometric Variations,Detected by Orbital Brightness Modulations,Detected by Microlensing,Detected by Eclipse Timing Variations,Detected by Imaging,Detected by Disk Kinematics,Controversial Flag,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements
0,2,1,0,0,2007,1,0,0,0,0,0,0,0,0,0,0,78.28058,264.13775,18.33392,177.4179,1,2,0,0,0
1,1,1,0,0,2009,1,0,0,0,0,0,0,0,0,0,0,41.04437,108.719,74.95821,141.64699,1,1,0,0,0
2,1,1,0,0,2008,1,0,0,0,0,0,0,0,0,0,0,-21.05141,106.41269,38.22901,11.95935,1,1,0,0,0
3,1,2,0,0,2002,1,0,0,0,0,0,0,0,0,0,0,46.94447,69.16849,62.87885,223.24717,1,4,1,0,0
4,3,1,0,0,1996,1,0,0,0,0,0,0,0,0,0,0,13.20446,83.33558,69.46803,321.21176,1,4,3,0,0


In [2]:
target_dummies = ['Detected by Radial Velocity Variations',
           'Detected by Pulsar Timing Variations',
           'Detected by Pulsation Timing Variations',
           'Detected by Transits',	
           'Detected by Astrometric Variations',
           'Detected by Orbital Brightness Modulations',
           'Detected by Microlensing',
           'Detected by Eclipse Timing Variations',
           'Detected by Imaging',
           'Detected by Disk Kinematics']

In [3]:
# drop rows with more than 1 discovery method for more straightforward training

composite_preprocessed = composite_preprocessed[composite_preprocessed[target_dummies].sum(axis=1) <=1]
composite_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4511 entries, 0 to 5601
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Number of Stars                                   4511 non-null   int64  
 1   Number of Planets                                 4511 non-null   int64  
 2   Number of Moons                                   4511 non-null   int64  
 3   Circumbinary Flag                                 4511 non-null   int64  
 4   Discovery Year                                    4511 non-null   int64  
 5   Detected by Radial Velocity Variations            4511 non-null   int64  
 6   Detected by Pulsar Timing Variations              4511 non-null   int64  
 7   Detected by Pulsation Timing Variations           4511 non-null   int64  
 8   Detected by Transits                              4511 non-null   int64  
 9   Detected by Astrometric 

In [4]:
# get an idea of the observation count for each discovery method
composite_preprocessed[target_dummies].sum(axis=0)

Detected by Radial Velocity Variations         982
Detected by Pulsar Timing Variations             7
Detected by Pulsation Timing Variations          2
Detected by Transits                          3218
Detected by Astrometric Variations               2
Detected by Orbital Brightness Modulations       5
Detected by Microlensing                       210
Detected by Eclipse Timing Variations           17
Detected by Imaging                             56
Detected by Disk Kinematics                      1
dtype: int64

In [5]:
# since we are applying a logistic regression, we have to combine every discovery method besides "transits" 
# we can do this by dropping every 'Detected by...' column except for transits

dummies_to_drop = ['Detected by Radial Velocity Variations',
           'Detected by Pulsar Timing Variations',
           'Detected by Pulsation Timing Variations',
           'Detected by Astrometric Variations',
           'Detected by Orbital Brightness Modulations',
           'Detected by Microlensing',
           'Detected by Eclipse Timing Variations',
           'Detected by Imaging',
           'Detected by Disk Kinematics']

targets= composite_preprocessed['Detected by Transits']
# tells us instances of 1, total observations
targets.sum(), targets.shape[0]

(3218, 4511)

In [6]:
composite_dummies_dropped = composite_preprocessed.drop(dummies_to_drop, axis=1)
composite_dummies_dropped.head()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Circumbinary Flag,Discovery Year,Detected by Transits,Controversial Flag,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements
0,2,1,0,0,2007,0,0,78.28058,264.13775,18.33392,177.4179,1,2,0,0,0
1,1,1,0,0,2009,0,0,41.04437,108.719,74.95821,141.64699,1,1,0,0,0
2,1,1,0,0,2008,0,0,-21.05141,106.41269,38.22901,11.95935,1,1,0,0,0
3,1,2,0,0,2002,0,0,46.94447,69.16849,62.87885,223.24717,1,4,1,0,0
4,3,1,0,0,1996,0,0,13.20446,83.33558,69.46803,321.21176,1,4,3,0,0


In [7]:
# check if targets are balanced
targets.sum() / targets.shape[0]

0.7133673243183329

## 71% of all target observations are 1 (discovered by transits)
## to efficiently train a logistic regression model the ratio has to be closer to 50/50
## so we apply SMOTE (Sample minority over sampling technique)

In [8]:
from imblearn.over_sampling import SMOTE

# the features are all the columns except our target column
features = composite_dummies_dropped.drop(['Detected by Transits'], axis=1)
smote = SMOTE()
x_resampled, y_resampled = smote.fit_resample(features, targets)

In [9]:
y_resampled.sum() / y_resampled.shape[0]

0.5

### The new ratio implies the synthetic sampling technique worked

In [10]:
# we see a higher non-null count suggesting samples were synthesized
x_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6436 entries, 0 to 6435
Data columns (total 15 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Number of Stars                                   6436 non-null   int64  
 1   Number of Planets                                 6436 non-null   int64  
 2   Number of Moons                                   6436 non-null   int64  
 3   Circumbinary Flag                                 6436 non-null   int64  
 4   Discovery Year                                    6436 non-null   int64  
 5   Controversial Flag                                6436 non-null   int64  
 6   Galactic Latitude [deg]                           6436 non-null   float64
 7   Galactic Longitude [deg]                          6436 non-null   float64
 8   Ecliptic Latitude [deg]                           6436 non-null   float64
 9   Ecliptic Longitude 

# Select features for logistic Regression

In [11]:
unscaled_features = x_resampled
unscaled_features.columns.values

array(['Number of Stars', 'Number of Planets', 'Number of Moons',
       'Circumbinary Flag', 'Discovery Year', 'Controversial Flag',
       'Galactic Latitude [deg]', 'Galactic Longitude [deg]',
       'Ecliptic Latitude [deg]', 'Ecliptic Longitude [deg]',
       'Number of Photometry Time Series',
       'Number of Radial Velocity Time Series',
       'Number of Stellar Spectra Measurements',
       'Number of Emission Spectroscopy Measurements',
       'Number of Transmission Spectroscopy Measurements'], dtype=object)

# Standardize

In [12]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
# the standard scaler subtracts the mean and divides by standard deviation for every feature
scaler.fit(unscaled_features)
# mean, stdev are stored in the scaler object now

In [13]:
# apply the standardization
scaled_features = scaler.transform(unscaled_features)

In [14]:
# shape[0] for both features and targets must match
scaled_features.shape, y_resampled.shape

((6436, 15), (6436,))

# Train Test Split

In [15]:
# addresses overfitting
# shuffling is necessary to remove dependencies that come from order of data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(scaled_features, y_resampled, train_size = 0.8, random_state = 42)

x_train.shape, y_train.shape

((5148, 15), (5148,))

# Fitting the Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logreg = LogisticRegression()
logreg.fit(x_train,y_train)

In [17]:
logreg.score(x_train, y_train)

0.8591686091686092

## 85% of the model's outputs match the targets!
## Still important to manually calculate the accuraccy

In [18]:
model_outputs = logreg.predict(x_train)
# view the predicted class labels of the regression
model_outputs

array([0, 0, 1, ..., 0, 0, 0])

In [19]:
model_outputs == y_train

915      True
5904     True
2083     True
2480     True
509      True
1770     True
4596     True
2997     True
3955     True
748      True
3182     True
3480     True
1335     True
1499     True
2226     True
5409     True
1859     True
6074     True
2233     True
471      True
3520     True
3773     True
1509     True
4195     True
6076     True
6421     True
896      True
6159     True
251     False
1209    False
3043     True
2447     True
5784     True
5845     True
1128    False
4906    False
4075     True
5144     True
5492     True
57      False
1322     True
5170     True
1514     True
6392     True
292     False
1142    False
6267     True
1632     True
1170    False
2118     True
1362    False
5531     True
1411    False
2306     True
1086    False
3428     True
1172    False
5914     True
5168     True
3759     True
1973     True
4038     True
3984     True
4852     True
1554     True
3487     True
613      True
2080     True
2436     True
4478     True
3406     True
6202  

In [20]:
# sum to measure correct predictions
import numpy as np
np.sum([model_outputs == y_train])

4423

In [21]:
model_outputs.shape[0]

5148

In [22]:
np.sum([model_outputs == y_train]) / model_outputs.shape[0]

0.8591686091686092

### Same accuracy

# Analyze Summary Table with coefficients (weights) and intercept (bias)

In [23]:
logreg.intercept_, logreg.coef_

(array([-2.06321446]),
 array([[ 0.11483949,  0.58417679,  0.        , -0.29802854,  0.05115432,
         -0.28848394, -0.12928241,  0.37356237,  1.72199175,  0.39930874,
         -3.95660873, -2.78576379, -2.88315256,  0.05420113,  0.45827713]]))

In [24]:
# match up coefs with features
feature_name = unscaled_features.columns.values
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(logreg.coef_)
# transpose bc by default np arrays are rows and not columnss

# insert intercept as 0th index 
summary_table.index += 1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-2.063214
1,Number of Stars,0.114839
2,Number of Planets,0.584177
3,Number of Moons,0.0
4,Circumbinary Flag,-0.298029
5,Discovery Year,0.051154
6,Controversial Flag,-0.288484
7,Galactic Latitude [deg],-0.129282
8,Galactic Longitude [deg],0.373562
9,Ecliptic Latitude [deg],1.721992


In [25]:
# the closer the coefficient is to 0, the less impact (less weight on the model) 
# now we calculate log odds 

summary_table['odds_ratio'] = np.exp(summary_table['Coefficients'])
summary_ordered = summary_table.sort_values('odds_ratio', ascending=False)
summary_ordered

Unnamed: 0,Feature Name,Coefficients,odds_ratio
9,Ecliptic Latitude [deg],1.721992,5.595663
2,Number of Planets,0.584177,1.793514
15,Number of Transmission Spectroscopy Measurements,0.458277,1.581347
10,Ecliptic Longitude [deg],0.399309,1.490794
8,Galactic Longitude [deg],0.373562,1.452901
1,Number of Stars,0.114839,1.121693
14,Number of Emission Spectroscopy Measurements,0.054201,1.055697
5,Discovery Year,0.051154,1.052485
3,Number of Moons,0.0,1.0
7,Galactic Latitude [deg],-0.129282,0.878726


## A feature is not important if:
## coef ~ 0 ; odds ratio ~ 1

### Based on this information, we can conclude:
#### Number of moons, Discovery Year, Number of Emission Spectroscopy Measurements are not significant to the model