# XGBoost Model with All Features after initial preprocessing

In [1]:
import pandas as pd
composite_preprocessed = pd.read_csv('Composite_preprocessed_NO_MV_BALANCED.csv')
composite_preprocessed.head()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements,Circumbinary Flag,Controversial Flag,Discovery Year,Detected by Transits
0,3.094076,-0.666894,0.0,2.424559,1.234306,-0.145901,-0.624689,0.613405,1.731519,-0.2616,-0.012466,-0.044364,0,0,2007,0
1,-0.256668,-0.666894,0.0,1.18672,-0.283545,1.148661,-0.992906,0.613405,0.729626,-0.2616,-0.012466,-0.044364,0,0,2009,0
2,-0.256668,-0.666894,0.0,-0.877523,-0.306068,0.308947,-2.327878,0.613405,0.729626,-0.2616,-0.012466,-0.044364,0,0,2008,0
3,-0.256668,0.216988,0.0,1.382856,-0.669803,0.872499,-0.152934,0.613405,3.735304,0.591749,-0.012466,-0.044364,0,0,2002,0
4,6.44482,-0.666894,0.0,0.261241,-0.531444,1.023143,0.855489,0.613405,3.735304,2.298449,-0.012466,-0.044364,0,0,1996,0


# Train Test Split

In [2]:
# we are trying to predict whether an exoplanet has been detected by transits
targets = composite_preprocessed['Detected by Transits']
# training features are all variables except the targets
features = composite_preprocessed.drop(['Detected by Transits'], axis=1)

## Observe Feature Correlations

In [3]:
features.corr()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements,Circumbinary Flag,Controversial Flag,Discovery Year
Number of Stars,1.0,0.109074,,-0.044988,0.104236,-0.087528,-0.06945,0.057734,0.132503,0.077689,-0.0032,-0.002096,0.244685,0.090915,-0.147575
Number of Planets,0.109074,1.0,,-0.055511,0.008589,0.02668,-0.023926,-0.002505,0.08247,0.186263,0.057797,0.076409,-0.012547,0.061726,-0.072274
Number of Moons,,,,,,,,,,,,,,,
Galactic Latitude [deg],-0.044988,-0.055511,,1.0,-0.019213,0.463063,0.174632,-0.090369,-0.047292,-0.058085,-0.025691,-0.008918,0.00907,-0.016991,0.03182
Galactic Longitude [deg],0.104236,0.008589,,-0.019213,1.0,-0.657044,-0.408779,0.131333,0.113682,0.054698,-0.008283,-0.026088,0.025643,0.020641,-0.106558
Ecliptic Latitude [deg],-0.087528,0.02668,,0.463063,-0.657044,1.0,0.498442,-0.222283,-0.13997,-0.104345,-0.006864,0.025212,-0.028239,-0.013646,0.081113
Ecliptic Longitude [deg],-0.06945,-0.023926,,0.174632,-0.408779,0.498442,1.0,-0.119612,-0.191094,-0.193637,0.013811,0.031078,0.001673,0.000296,0.058041
Number of Photometry Time Series,0.057734,-0.002505,,-0.090369,0.131333,-0.222283,-0.119612,1.0,0.17439,0.182466,-0.004172,-0.014848,-0.022719,0.008973,-0.252983
Number of Radial Velocity Time Series,0.132503,0.08247,,-0.047292,0.113682,-0.13997,-0.191094,0.17439,1.0,0.492154,-0.003394,-0.007912,-0.019551,-0.006469,-0.498031
Number of Stellar Spectra Measurements,0.077689,0.186263,,-0.058085,0.054698,-0.104345,-0.193637,0.182466,0.492154,1.0,-0.003261,-0.011606,-0.019001,0.020205,-0.244466


## Split data

In [4]:
# Splitting dataset into training and testing addresses overfitting
# shuffling is necessary to remove dependencies that come from order of data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, targets, train_size = 0.8, random_state = 42)

x_train.shape, y_train.shape

((5148, 15), (5148,))

# Fitting the XGBoost Model

In [5]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

xgb_model_all = xgb.XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')
xgb_model_all.fit(x_train, y_train)



ValueError: feature_names must be string, and may not contain [, ] or <

## Predict on the test set

In [None]:
y_pred_xgb_all = xgb_model_all.predict(x_test)

In [None]:
('Accuracy:', accuracy_score(y_test, y_pred_xgb_all))
('Classification Report:', classification_report(y_test, y_pred_xgb_all))

## ROC AUC score 

In [None]:
y_prob_xgb_all = xgb_model_all.predict_proba(x_test)[:,1]
roc_auc_xgb_all = roc_auc_score(y_test, y_prob_xgb_all)

('ROC AUC Score All Features:', roc_auc_xgb_all)

## Plot Feature Importance

In [None]:
xgb.plot_importance(xgb_model_all)
plt.show()