# Preparing dataset

Preparation of dataset the same way as [here](https://github.com/JanMatusiak/exoplanets/blob/5380110f75580a08a29b010a21c8d1a51bb5e2a4/kepler.ipynb). For more detailed information about Specific steps go there.

In [218]:
import pandas as pd
url = 'datasets/KOI.csv'
kepler_df = pd.read_csv(url, comment = '#')
kepler_df.head(5)

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [219]:
kepler_df.shape

(9564, 49)

### Missing values

In [220]:
kepler_df.isnull().sum().sort_values(ascending=False)

koi_teq_err1         9564
koi_teq_err2         9564
kepler_name          6817
koi_score            1510
koi_steff_err2        483
koi_srad_err1         468
koi_steff_err1        468
koi_slogg_err2        468
koi_slogg_err1        468
koi_srad_err2         468
koi_time0bk_err1      454
koi_period_err1       454
koi_period_err2       454
koi_time0bk_err2      454
koi_impact_err1       454
koi_depth_err1        454
koi_depth_err2        454
koi_duration_err1     454
koi_duration_err2     454
koi_impact_err2       454
koi_model_snr         363
koi_srad              363
koi_slogg             363
koi_steff             363
koi_depth             363
koi_prad_err1         363
koi_prad              363
koi_impact            363
koi_prad_err2         363
koi_teq               363
koi_tce_plnt_num      346
koi_tce_delivname     346
koi_insol_err2        321
koi_insol             321
koi_insol_err1        321
koi_kepmag              1
koi_fpflag_ec           0
koi_time0bk             0
koi_fpflag_n

### Colums that can be dropped

In [221]:
kepler_clean_df = kepler_df.drop(kepler_df.filter(regex="(_err1|_err2)$").columns, axis=1)
subset = ['kepoi_name', 'kepler_name', 'koi_pdisposition', 'koi_score', 'koi_tce_delivname', 'koi_teq', 'koi_time0bk']
kepler_clean_df = kepler_clean_df.drop(subset, axis=1)
kepler_clean_df.isnull().sum().sort_values(ascending=False)

koi_model_snr       363
koi_prad            363
koi_depth           363
koi_impact          363
koi_srad            363
koi_slogg           363
koi_steff           363
koi_tce_plnt_num    346
koi_insol           321
koi_kepmag            1
koi_disposition       0
kepid                 0
koi_fpflag_ss         0
koi_fpflag_nt         0
koi_fpflag_ec         0
koi_period            0
koi_duration          0
koi_fpflag_co         0
ra                    0
dec                   0
dtype: int64

### Get rid of candidates to not confuse model

In [222]:
candidates_df = kepler_clean_df[kepler_clean_df["koi_disposition"].str.strip().str.upper() == "CANDIDATE"].copy()

labeled_df = kepler_clean_df[kepler_clean_df["koi_disposition"].str.strip().str.upper().isin(["CONFIRMED","FALSE POSITIVE"])].copy()

labeled_df["label"] = (
    labeled_df["koi_disposition"]
    .str.strip().str.upper()
    .map({"CONFIRMED": 1, "FALSE POSITIVE": 0})
)

labeled_df.drop(["koi_disposition"], axis=1, inplace=True)
labeled_df.head(5)


Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_impact,koi_duration,koi_depth,koi_prad,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag,label
0,10797460,0,0,0,0,9.488036,0.146,2.9575,615.8,2.26,93.59,35.8,1.0,5455.0,4.467,0.927,291.93423,48.141651,15.347,1
1,10797460,0,0,0,0,54.418383,0.586,4.507,874.8,2.83,9.11,25.8,2.0,5455.0,4.467,0.927,291.93423,48.141651,15.347,1
3,10848459,0,1,0,0,1.736952,1.276,2.40641,8079.2,33.46,891.96,505.6,1.0,5805.0,4.564,0.791,285.53461,48.28521,15.597,0
4,10854555,0,0,0,0,2.525592,0.701,1.6545,603.3,2.75,926.16,40.9,1.0,6031.0,4.438,1.046,288.75488,48.2262,15.509,1
5,10872983,0,0,0,0,11.094321,0.538,4.5945,1517.5,3.9,114.81,66.5,1.0,6046.0,4.486,0.972,296.28613,48.22467,15.714,1


### Missing values

In [223]:
labeled_df.isnull().sum().sort_values(ascending=False)

koi_tce_plnt_num    286
koi_model_snr       259
koi_impact          259
koi_prad            259
koi_srad            259
koi_slogg           259
koi_steff           259
koi_depth           259
koi_insol           219
koi_kepmag            1
koi_fpflag_nt         0
kepid                 0
koi_fpflag_co         0
koi_fpflag_ss         0
koi_duration          0
koi_fpflag_ec         0
koi_period            0
ra                    0
dec                   0
label                 0
dtype: int64

In [224]:
labeled_df.dropna(inplace=True, axis=0)
labeled_df.isnull().sum()

kepid               0
koi_fpflag_nt       0
koi_fpflag_ss       0
koi_fpflag_co       0
koi_fpflag_ec       0
koi_period          0
koi_impact          0
koi_duration        0
koi_depth           0
koi_prad            0
koi_insol           0
koi_model_snr       0
koi_tce_plnt_num    0
koi_steff           0
koi_slogg           0
koi_srad            0
ra                  0
dec                 0
koi_kepmag          0
label               0
dtype: int64

Dataset is ready for analysis.

# Binary classification Model

### Division into features and labels

In [225]:
import numpy as np

X = labeled_df.drop("label", axis = 1).values   # features
y = labeled_df["label"].values                  # labels

### Standardization of features

In [226]:
import sklearn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### Conversion to tensors

In [227]:
X_scaled.shape, y.shape


((7115, 19), (7115,))

In [228]:
type(X_scaled), X_scaled.dtype

(numpy.ndarray, dtype('float64'))

In [229]:
# Turn data into tensors
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
X_scaled = torch.from_numpy(X_scaled).type(torch.float32)
y = torch.from_numpy(y).type(torch.float32)


In [230]:
type(X_scaled), X_scaled.dtype, y.dtype

(torch.Tensor, torch.float32, torch.float32)

### train / test split

In [231]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

(5692, 1423, 5692, 1423)

In [232]:
type(X_train), X_train.dtype

(torch.Tensor, torch.float32)

In [233]:
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

### Construct Model Class

In [234]:
class ExoplanetsV0(nn.Module):
    def __init__(self, input_features, output_features, hidden_units = 8):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features = input_features, out_features = hidden_units),
            nn.ReLU(),
            nn.Linear(in_features = hidden_units, out_features = hidden_units),
            nn.ReLU(),
            nn.Linear(in_features = hidden_units, out_features = output_features),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

# Create an instance of model
model_0 = ExoplanetsV0(input_features = 19, output_features = 1, hidden_units = 8).to(device)
model_0

ExoplanetsV0(
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [235]:
X_train.shape, y_train.shape

(torch.Size([5692, 19]), torch.Size([5692]))

### Loss Function and Optimizer

In [236]:
# Create a loss function for binary classification
loss_fn = nn.BCEWithLogitsLoss()

# Create an optimizer for binary classification
optimizer = torch.optim.SGD(params = model_0.parameters(), lr = 0.1)

In [237]:
# Calculate accuracy
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc

# Train Model

Model outputs are raw **Logits**

We are converting logits into prediction probabilities by passing them to some kind of activation function (int this case `nn.Sigmoid()`) Then we can convert our model's prediction probabilities to **prediction labels** by taking the `argmaX_scaled()`

In [238]:
model_0.eval()
with torch.inference_mode():
    y_logits = model_0(X_test.to(device))
y_logits[:5]

tensor([[0.5656],
        [0.5634],
        [0.5516],
        [0.5428],
        [0.5601]])

In [239]:
y_pred_probs = torch.sigmoid(y_logits)
y_pred_probs

tensor([[0.6377],
        [0.6372],
        [0.6345],
        ...,
        [0.6397],
        [0.6336],
        [0.6372]])

In [240]:
torch.round(y_pred_probs)

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])

In [241]:
# find predicted labels
y_preds = torch.round(y_pred_probs)

# in full
y_pred_labels = torch.round(torch.sigmoid(model_0(X_test.to(device))))

# check for equality
print(torch.eq(y_preds.squeeze(), y_pred_labels.squeeze()))

# get rid of extra dimension
y_preds.squeeze()

tensor([True, True, True,  ..., True, True, True])


tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [242]:
y_test[:5]

tensor([0., 0., 0., 0., 0.])

### Building a training loop and testing loop

In [243]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Set the number of epochs
epochs = 20

# Put data to target device
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass
    y_logits = model_0(X_train).squeeze()
    y_pred = torch.round(torch.sigmoid(y_logits)) # turn logits into pred probs and into pred labels

    # 2. Calculate the loss / cost
    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_true = y_train, y_pred = y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Backpropagation algorithm
    loss.backward()

    # 5. Gradient descent algorithm
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test).squeeze()
        test_pred = torch.round(torch.sigmoid(test_logits))

        # 2. Calculate test loss/acc
        test_loss = loss_fn(test_logits, y_test)
        test_acc = accuracy_fn(y_true=y_test, y_pred=test_pred)

    # Print out what's happenin'
    if epoch % 4 == 0:
        print(f"Epoch {epoch} | loss: {loss:.5f}, accuracy: {acc:.2f}%, Test loss: {test_loss:.5f}, Test accuracy: {test_acc:.2f}%")

Epoch 0 | loss: 0.79813, accuracy: 38.16%, Test loss: 0.79017, Test accuracy: 39.49%
Epoch 4 | loss: 0.79601, accuracy: 38.16%, Test loss: 0.78817, Test accuracy: 39.49%
Epoch 8 | loss: 0.79394, accuracy: 38.16%, Test loss: 0.78620, Test accuracy: 39.49%
Epoch 12 | loss: 0.79190, accuracy: 38.16%, Test loss: 0.78428, Test accuracy: 39.49%
Epoch 16 | loss: 0.78991, accuracy: 38.16%, Test loss: 0.78240, Test accuracy: 39.49%
