# Logistic Regression Including All Variables

In [1]:
import pandas as pd
composite_preprocessed = pd.read_csv('Composite_preprocessed_NO_MV_BALANCED.csv')
composite_preprocessed.head()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements,Circumbinary Flag,Controversial Flag,Discovery Year,Detected by Transits
0,3.094076,-0.666894,0.0,2.424559,1.234306,-0.145901,-0.624689,0.613405,1.731519,-0.2616,-0.012466,-0.044364,0,0,2007,0
1,-0.256668,-0.666894,0.0,1.18672,-0.283545,1.148661,-0.992906,0.613405,0.729626,-0.2616,-0.012466,-0.044364,0,0,2009,0
2,-0.256668,-0.666894,0.0,-0.877523,-0.306068,0.308947,-2.327878,0.613405,0.729626,-0.2616,-0.012466,-0.044364,0,0,2008,0
3,-0.256668,0.216988,0.0,1.382856,-0.669803,0.872499,-0.152934,0.613405,3.735304,0.591749,-0.012466,-0.044364,0,0,2002,0
4,6.44482,-0.666894,0.0,0.261241,-0.531444,1.023143,0.855489,0.613405,3.735304,2.298449,-0.012466,-0.044364,0,0,1996,0


# Train Test Split

In [2]:
# we are trying to predict whether an exoplanet has been detected by transits
targets = composite_preprocessed['Detected by Transits']
# training features are all variables except the targets
features = composite_preprocessed.drop(['Detected by Transits'], axis=1)

In [3]:
# Splitting dataset into training and testing addresses overfitting
# shuffling is necessary to remove dependencies that come from order of data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, targets, train_size = 0.8, random_state = 42)

x_train.shape, y_train.shape

((5148, 15), (5148,))

# Fitting the Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logreg = LogisticRegression()
logreg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
logreg.score(x_train, y_train)

0.8512043512043512

### 85% of the model's training outputs match the targets!
### Still important to manually calculate the accuracy

In [6]:
model_outputs = logreg.predict(x_train)
# view the predicted class labels of the regression
model_outputs

array([0, 1, 1, ..., 0, 0, 0])

In [7]:
model_outputs == y_train

915      True
5904    False
2083     True
2480     True
509      True
        ...  
3772     True
5191     True
5226     True
5390     True
860      True
Name: Detected by Transits, Length: 5148, dtype: bool

In [8]:
# sum to measure correct predictions
import numpy as np
# number correct / total number model predictions
np.sum([model_outputs == y_train]) / model_outputs.shape[0]

0.8512043512043512

### Same accuracy!

# Analyze Summary Table with coefficients (weights) and intercept (bias)

In [9]:
logreg.intercept_, logreg.coef_

(array([-0.00526204]),
 array([[-1.42124490e-01,  4.72468078e-01,  0.00000000e+00,
         -2.59846887e-01,  5.41492684e-01,  1.95242277e+00,
          3.45094625e-01, -2.69072288e+00, -1.63784458e+00,
         -1.77303369e+00,  8.01769322e-02,  2.77559367e-01,
         -1.53029290e-01, -1.09417269e-01, -5.71383407e-04]]))

### Match up coefs with features

In [10]:
feature_name = features.columns.values
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_name)

### Transpose bc by default np arrays are rows and not columns

In [11]:
summary_table['Coefficients'] = np.transpose(logreg.coef_)

### Insert intercept as 0th index ; Move all indices up by 1 

In [12]:
summary_table.index += 1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-0.005262
1,Number of Stars,-0.142124
2,Number of Planets,0.472468
3,Number of Moons,0.0
4,Galactic Latitude [deg],-0.259847
5,Galactic Longitude [deg],0.541493
6,Ecliptic Latitude [deg],1.952423
7,Ecliptic Longitude [deg],0.345095
8,Number of Photometry Time Series,-2.690723
9,Number of Radial Velocity Time Series,-1.637845


## The closer the coefficient is to 0, the less impact (less weight on the model) 
## Now we calculate log odds 
## Log odds tell us the strength of the relationship between the feature and the outcome (detected by transits T/F)

In [13]:
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficients'])
summary_ordered = summary_table.sort_values('Odds_ratio', ascending=False)
summary_ordered

Unnamed: 0,Feature Name,Coefficients,Odds_ratio
6,Ecliptic Latitude [deg],1.952423,7.045737
5,Galactic Longitude [deg],0.541493,1.71857
2,Number of Planets,0.472468,1.603948
7,Ecliptic Longitude [deg],0.345095,1.412124
12,Number of Transmission Spectroscopy Measurements,0.277559,1.319904
11,Number of Emission Spectroscopy Measurements,0.080177,1.083479
3,Number of Moons,0.0,1.0
15,Discovery Year,-0.000571,0.999429
0,Intercept,-0.005262,0.994752
14,Controversial Flag,-0.109417,0.896356


## A feature is not important if:
## Coef ~ 0 (means this coefficient, even at its highest value will have close to 0 effect on the outcome);
## Odds ratio ~ 1 (means this coefficient, even at its highest value will only have minimal effect on the odds of the outcome)

## Based on the summary table, and after checking p-values, we can exclude:
### Number of Moons; Discovery Year; 