In [1]:
import pandas as pd

# XGB Model

## Load the preprocessed data

In [2]:
composite_preprocessed = pd.read_csv('Composite_preprocessed_NO_MV.csv')
pd.options.display.max_columns=None
pd.options.display.max_rows=None
composite_preprocessed.head()

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Circumbinary Flag,Discovery Year,Detected by Radial Velocity Variations,Detected by Pulsar Timing Variations,Detected by Pulsation Timing Variations,Detected by Transits,Detected by Astrometric Variations,Detected by Orbital Brightness Modulations,Detected by Microlensing,Detected by Eclipse Timing Variations,Detected by Imaging,Detected by Disk Kinematics,Controversial Flag,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements
0,2,1,0,0,2007,1,0,0,0,0,0,0,0,0,0,0,78.28058,264.13775,18.33392,177.4179,1,2,0,0,0
1,1,1,0,0,2009,1,0,0,0,0,0,0,0,0,0,0,41.04437,108.719,74.95821,141.64699,1,1,0,0,0
2,1,1,0,0,2008,1,0,0,0,0,0,0,0,0,0,0,-21.05141,106.41269,38.22901,11.95935,1,1,0,0,0
3,1,2,0,0,2002,1,0,0,0,0,0,0,0,0,0,0,46.94447,69.16849,62.87885,223.24717,1,4,1,0,0
4,3,1,0,0,1996,1,0,0,0,0,0,0,0,0,0,0,13.20446,83.33558,69.46803,321.21176,1,4,3,0,0


### We expect that half of these features will not have merit in predicting discovery method 
### Feature Analysis from the XGB model trained on the imbalanced, < 7% MV dataset told us these are the 5 features with most weight in the model:

### TESS Magnitude
### Ks (2MASS) Magnitude
### Orbital Period Limit Flag
### H (2MASS) Magnitude
### Planet Radius (Earth Radius) 


In [3]:
target_dummies = ['Detected by Radial Velocity Variations',
           'Detected by Pulsar Timing Variations',
           'Detected by Pulsation Timing Variations',
           'Detected by Transits',	
           'Detected by Astrometric Variations',
           'Detected by Orbital Brightness Modulations',
           'Detected by Microlensing',
           'Detected by Eclipse Timing Variations',
           'Detected by Imaging',
           'Detected by Disk Kinematics']

In [4]:
features = composite_preprocessed.drop(target_dummies, axis=1)
targets = composite_preprocessed[target_dummies]

### for xgb boost, feature columns cannot include [, ] or <
### so we will replace these characters

In [5]:
features.columns = features.columns.str.replace('[', '(').str.replace(']',')')

## Split data into training and testing

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features,targets, train_size=0.8, random_state=42)
x_train.shape, y_train.shape

((4481, 15), (4481, 10))

## Apply SMOTE to balance the classes

In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# we only balance the training data ; test data will remain unbalanced to get an unbiased estimate of the model's performance

ValueError: Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported.

### Import and train the model

In [None]:
# in this case we will use Extreme Gradient Boosting because it is flexible, tuneable, high performing and robust against missing values
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [None]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(x_train_resampled, y_train_resampled)

## Model prediction and accuracy

In [None]:
y_pred = xgb_classifier.predict(x_test)
accuracy= accuracy_score(y_test,y_pred)
accuracy 

In [None]:
clf_rep = classification_report(y_test,y_pred, zero_division=1, digits=4)
print(clf_rep)

## XGB Classifier Model Evaluation (weighted average values accounts for class imbalances)
### Accuracy: 94.55%
### Precision: 97.54%
### Recall: 97.54% 
### F1: 97.40%
### Support: Represents the number of actual occurrences of each class in the dataset (provides context on the size of each class)
### Classes with higher support values typically have more influence on the overall evaluation metrics, while classes with lower support values may have less impact, especially in imbalanced datasets

### Overall, while the model shows good performance for most classes, it struggles to correctly predict class 1, resulting in poor precision, recall, and F1-score for that class.

# Extract and Plot Feature Importance

In [None]:
import matplotlib.pyplot as plt

feature_importance = xgb_classifier.feature_importances_
feature_names = x_train.columns

# set it up as a dataframe
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Weight': feature_importance})

# sort by importance
feature_importance_df = feature_importance_df.sort_values(by= 'Weight', ascending=False)

feature_importance_df

In [None]:
# Plot features by importance ; top 40 to fit in graph
n = 40
plt.figure(figsize=(10,8))
plt.barh(feature_importance_df['Feature'][:n], feature_importance_df['Weight'][:n])
plt.xlabel('Feature')
plt.ylabel('Weight')
plt.title('Feature vs Weight')
plt.show()

## The Feature Importance Plot suggests that the top 5 features with most weight in the xgb model and dataset with MV are:
### TESS Magnitude
### Ks (2MASS) Magnitude
### Orbital Period Limit Flag
### H (2MASS) Magnitude
### Planet Radius (Earth Radius) 