In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.cuda.amp import autocast, GradScaler
from torchvision import datasets, transforms, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [13]:
class TOIDataset(Dataset):
  """In-memory tabular dataset pairing a feature matrix with class labels.

  Args:
    x: array-like of shape (n_samples, n_features); stored as a float32 tensor.
    y: labels as a pandas Series, NumPy array or list; stored as an int64 tensor.

  Raises:
    ValueError: if `x` and `y` do not hold the same number of samples.
  """

  def __init__(self, x, y):
    self.x = torch.tensor(x, dtype=torch.float32)
    # np.asarray unwraps a pandas Series (same data as `.values`) and also
    # accepts plain arrays/lists, which have no `.values` attribute.
    self.y = torch.tensor(np.asarray(y), dtype=torch.long)
    if len(self.x) != len(self.y):
      raise ValueError(
          f"x and y must have the same number of samples, got {len(self.x)} and {len(self.y)}")

  def __len__(self):
    """Return the number of samples."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the (features, label) pair stored at position `idx`."""
    return self.x[idx], self.y[idx]

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
# Echo the selected device so the cell output records where the model will run.
device

device(type='cpu')

In [16]:
# Load the combined catalogue and drop every row that has a missing value.
data = pd.read_csv('/content/new_combined.csv').dropna()

# `disp` is the target column; pop() removes it from `data`, so `X` holds features only.
y = data.pop('disp').astype('float64')
X = data

# 5% of the rows are held out as the final test split ...
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                    test_size=0.05,
                                    random_state=1)

# ... and 10% of the remainder becomes the validation split.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                    test_size=0.1,
                                    random_state=1)

# Keep the raw (un-transformed) hold-out rows for the per-sample demos further down.
X_final_test = X_test
y_final_test = y_test

features_num = [
    'ra', 'dec', 'orbper', 'duration', 'depth', 'rad', 'steff',
    'logg', 'srad'
]
features_cat = []

transformer_num = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# `fill_value` is only honoured with strategy="constant"; "median" cannot be
# computed for string categories and would fail once `features_cat` is non-empty.
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# Fit the preprocessing statistics on the training split only, then reuse
# them unchanged for the validation and test splits.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)


In [17]:
# Row/column counts of the preprocessed train, validation and test matrices.
X_train.shape, X_val.shape, X_test.shape

((13572, 9), (1509, 9), (794, 9))

In [18]:
# Number of distinct label values found in the training split.
num_classes = len(y_train.unique())
num_classes

2

In [19]:
# Distinct label values present in the training split.
y_train.unique()

array([1., 0.])

In [20]:
# Wrap the preprocessed train/validation matrices and their labels as torch datasets.
train_dataset_comb = TOIDataset(X_train, y_train)
val_dataset_comb = TOIDataset(X_val, y_val)

In [21]:
# Batches of 128 samples; only the training loader reshuffles its data.
train_loader = DataLoader(train_dataset_comb, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset_comb, batch_size=128)

In [22]:
# Recomputes the number of distinct training labels; the CNN1D_LSTM class
# defined in the next cell captures this value as its default `num_classes`.
num_classes = len(y_train.unique())
num_classes

2

In [23]:
# {'lr': 0.0006056915778868701, 'batch_size': 128, 'out_channels1': 64, 'out_channels2': 512, 'kernel_size': 3, 'dropout': 0.21777348000182786, 'num_layers': 5, 'hidden_size': 512}
class CNN1D_LSTM(nn.Module):
    """1-D convolutional classifier over a fixed-length feature vector.

    Each sample of shape (input_features,) is treated as a single-channel
    sequence, passed through two Conv1d -> ReLU -> BatchNorm1d -> MaxPool1d
    blocks, flattened, and classified by a two-layer fully connected head.

    Args:
        num_classes: number of output logits (defaults to the module-level
            `num_classes` at class-definition time).
        input_features: length of each input feature vector.
        out_channels1: channels produced by the first convolution block.
        out_channels2: channels produced by the second convolution block.
        kernel_size: convolution kernel width; padding is `kernel_size // 2`.
        dropout: dropout probability in the fully connected head.
        num_layers: number of layers of the (currently unused) LSTM.
        hidden_size: hidden size of the (currently unused) LSTM.
    """

    def __init__(self, num_classes=num_classes, input_features=9, out_channels1=64, out_channels2=512,
                 kernel_size=3, dropout=0.21777348000182786, num_layers=5, hidden_size=512):
        super().__init__()
        # NOTE(review): this LSTM is constructed but never called in forward().
        # It is kept so that state_dict keys remain compatible with previously
        # saved checkpoints; confirm whether it should be wired in or removed.
        self.lstm = nn.LSTM(input_size=input_features, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.conv_block1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=out_channels1, kernel_size=kernel_size, padding=kernel_size//2),
            nn.ReLU(),
            nn.BatchNorm1d(out_channels1),
            nn.MaxPool1d(2)
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(in_channels=out_channels1, out_channels=out_channels2, kernel_size=kernel_size, padding=kernel_size//2),
            nn.ReLU(),
            nn.BatchNorm1d(out_channels2),
            nn.MaxPool1d(2)
        )

        # Probe the flattened feature size with a dummy sample. The blocks are
        # switched to eval mode for the probe so BatchNorm does not fold the
        # all-zero dummy batch into its running statistics.
        self.conv_block1.eval()
        self.conv_block2.eval()
        with torch.no_grad():
            dummy = torch.zeros(1, 1, input_features)
            dummy = self.conv_block1(dummy)
            dummy = self.conv_block2(dummy)
            flattened_size = dummy.numel()
        self.conv_block1.train()
        self.conv_block2.train()

        self.fc_block = nn.Sequential(
            nn.Linear(flattened_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Map a (batch, input_features) tensor to (batch, num_classes) logits."""
        x = x.unsqueeze(1)  # add the channel axis expected by Conv1d
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = torch.flatten(x, 1)
        x = self.fc_block(x)
        return x

In [24]:
class EarlyStopping:
    """Signal that training should stop once the monitored loss stops improving.

    Args:
        patience: number of consecutive non-improving calls tolerated before
            `early_stop` is set to True.
        min_delta: the loss must drop by more than this amount below the best
            value seen so far to count as an improvement.

    Attributes are plain state: `counter` (current non-improving streak),
    `best_loss` (lowest loss accepted so far) and `early_stop` (stop flag).
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update the stop flag."""
        if self.best_loss is None:
            # First observation becomes the baseline.
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            # Any epoch that is not a real improvement counts against patience,
            # including an improvement of exactly `min_delta` (e.g. a perfectly
            # flat loss with min_delta=0), which previously matched no branch.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [25]:
# Model, loss, optimiser and training-control helpers.
model = CNN1D_LSTM().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0006056915778868701, weight_decay=1e-4)
# The training loop steps this scheduler with the validation *loss*, so the
# monitored quantity must decrease to improve: mode='min'. With mode='max' the
# learning rate would be cut while the loss was still getting better.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
early_stopping = EarlyStopping(patience=10, min_delta=0.001)

In [26]:
# Per-epoch history, consumed by the plotting cell below.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
num_epochs = 50

# CUDA mixed precision is only meaningful on a GPU. With enabled=False both
# GradScaler and autocast become pass-throughs, so the same loop runs on CPU.
use_amp = device.type == 'cuda'
scaler = GradScaler(enabled=use_amp)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    train_correct = 0
    train_total = 0

    train_loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for frames, labels in train_loop:
        frames, labels = frames.to(device), labels.to(device)
        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            outputs = model(frames)
            loss = criterion(outputs, labels)

        # Scale the loss before backward and let the scaler drive the optimiser step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

        train_loop.set_postfix(loss=loss.item(), accuracy=100 * train_correct / train_total)

    # Loss is averaged over batches, accuracy over samples.
    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * train_correct / train_total

    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    torch.cuda.empty_cache()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for frames, labels in val_loader:
            frames, labels = frames.to(device), labels.to(device)
            with autocast(enabled=use_amp):
                outputs = model(frames)
                loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    # The scheduler monitors the validation loss.
    scheduler.step(val_loss)
    lr = scheduler.get_last_lr()  # current learning rate(s), kept for inspection

    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

    early_stopping(val_loss)
    if early_stopping.early_stop:
        print('Early stopping')
        break

  scaler = GradScaler()
  with autocast():
Epoch 1/50:  87%|████████▋ | 93/107 [00:04<00:00, 19.20it/s, accuracy=69.8, loss=0.507]


KeyboardInterrupt: 

In [None]:
# Learning curves: loss in the left panel, accuracy in the right panel.
plt.figure(figsize=(12, 5))

curve_panels = (
    (train_losses, 'Train Loss', val_losses, 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    (train_accuracies, 'Train Accuracy', val_accuracies, 'Validation Accuracy',
     'Accuracy (%)', 'Training and Validation Accuracy'),
)
for panel_no, (train_curve, train_label, val_curve, val_label, y_label, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(train_curve, label=train_label)
    plt.plot(val_curve, label=val_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# model_path = "CNN1D_LSTM.pth"

# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict()
# }, model_path)
# print(f"Модель збережена у {model_path}")

In [None]:
# Hold-out evaluation of the network on the preprocessed test matrix.
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    outputs = model(X_test_tensor)
    # The highest logit per row is the predicted class.
    y_pred = outputs.argmax(dim=1).cpu().numpy()

nn_report = classification_report(y_test, y_pred)
nn_f1_binary = f1_score(y_test, y_pred)
nn_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(nn_report)
print(f"F1-score: {nn_f1_binary}, {nn_f1_weighted}")

acc: 0.7909319899244333 F1-score: 0.8316430020283976

In [None]:
!pip install optuna

In [None]:
# import optuna.visualization as vis
# import optuna

# def train_and_validate_model(model, train_loader, val_loader, num_epochs, learning_rate, patience):
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
#     scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)
#     early_stopping = EarlyStopping(patience=patience, min_delta=0.001)

#     train_losses = []
#     val_losses = []
#     train_accuracies = []
#     val_accuracies = []
#     val_f1_scores = []

#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0
#         train_correct = 0
#         train_total = 0

#         train_loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
#         for frames, labels in train_loop:
#             frames, labels = frames.to(device), labels.to(device)
#             optimizer.zero_grad()

#             outputs = model(frames)
#             loss = criterion(outputs, labels)

#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item()

#             _, predicted = torch.max(outputs, 1)
#             train_total += labels.size(0)
#             train_correct += (predicted == labels).sum().item()

#             train_loop.set_postfix(loss=loss.item(), accuracy=100 * train_correct / train_total)

#         train_loss = running_loss / len(train_loader)
#         train_accuracy = 100 * train_correct / train_total

#         train_losses.append(train_loss)
#         train_accuracies.append(train_accuracy)

#         model.eval()
#         val_loss = 0.0
#         val_correct = 0
#         val_total = 0
#         all_preds = []
#         all_labels = []
#         with torch.no_grad():
#             for frames, labels in val_loader:
#                 frames, labels = frames.to(device), labels.to(device)
#                 outputs = model(frames)
#                 loss = criterion(outputs, labels)
#                 val_loss += loss.item()
#                 _, predicted = torch.max(outputs, 1)
#                 val_total += labels.size(0)
#                 val_correct += (predicted == labels).sum().item()
#                 all_preds.extend(predicted.cpu().numpy())
#                 all_labels.extend(labels.cpu().numpy())

#         val_loss /= len(val_loader)
#         val_accuracy = 100 * val_correct / val_total
#         val_f1 = f1_score(all_labels, all_preds, average='weighted')

#         val_losses.append(val_loss)
#         val_accuracies.append(val_accuracy)
#         val_f1_scores.append(val_f1)

#         scheduler.step(val_f1)

#         print(f"Epoch {epoch+1}/{num_epochs}: "
#               f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}% | "
#               f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%, Val F1: {val_f1:.4f}")

#         early_stopping(val_f1)
#         if early_stopping.early_stop:
#             print('Early stopping')
#             break

#     history = {
#         'train_loss': train_losses,
#         'val_loss': val_losses,
#         'train_accuracy': train_accuracies,
#         'val_accuracy': val_accuracies,
#         'val_f1': val_f1_scores
#     }
#     return history

# def objective(trial):
#     lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
#     batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512, 1024])
#     out_channels1 = trial.suggest_categorical("out_channels1", [64, 128, 256, 512])
#     out_channels2 = trial.suggest_categorical("out_channels2", [128, 256, 512, 1024])
#     kernel_size = trial.suggest_int("kernel_size", 3, 7, step=2)
#     dropout = trial.suggest_float("dropout", 0.1, 0.5)
#     num_layer = trial.suggest_categorical("num_layers", [3, 4, 5, 6, 7])
#     hidden_size = trial.suggest_categorical("hidden_size", [128, 256, 512, 1024])


#     model = CNN1D_LSTM(num_classes=2, input_features=9,
#                   out_channels1=out_channels1, out_channels2=out_channels2,
#                   kernel_size=kernel_size, dropout=dropout, num_layers=num_layer, hidden_size=hidden_size).to(device)

#     history = train_and_validate_model(
#         model=model,
#         train_loader=DataLoader(train_dataset_comb, batch_size=batch_size, shuffle=True),
#         val_loader=DataLoader(val_dataset_comb, batch_size=batch_size, shuffle=False),
#         num_epochs=20,
#         learning_rate=lr,
#         patience=5
#     )

#     best_val_f1 = max(history['val_f1'])
#     return best_val_f1

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=40)

# print("Best parameters:", study.best_params)
# print("Best val_f1:", study.best_value)

# vis.plot_optimization_history(study).show()
# vis.plot_param_importances(study).show()

In [None]:
# import numpy as np
# import torch
# from torch.utils.data import TensorDataset, DataLoader
# from sklearn.model_selection import KFold


# X_non_test = np.concatenate((X_train, X_val), axis=0)
# y_non_test = np.concatenate((y_train, y_val), axis=0)

# y_non_test = y_non_test.astype(np.int64)

# k_folds = 5
# kf = KFold(n_splits=k_folds, shuffle=True, random_state=1)

# fold_val_losses = []
# fold_val_accuracies = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X_non_test)):
#     print(f"Fold {fold + 1}/{k_folds}")

#     X_fold_train = X_non_test[train_idx]
#     y_fold_train = y_non_test[train_idx]
#     X_fold_val = X_non_test[val_idx]
#     y_fold_val = y_non_test[val_idx]

#     train_dataset = TensorDataset(torch.from_numpy(X_fold_train).float(), torch.from_numpy(y_fold_train).long())
#     val_dataset = TensorDataset(torch.from_numpy(X_fold_val).float(), torch.from_numpy(y_fold_val).long())

#     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

#     model = CNN1D_LSTM()
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
#     scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)
#     early_stopping = EarlyStopping(patience=10, min_delta=0.001)
#     scaler = GradScaler()
#     num_epochs = 50

#     train_losses = []
#     val_losses = []
#     train_accuracies = []
#     val_accuracies = []

#     for epoch in range(num_epochs):
#         model.train()
#     running_loss = 0.0
#     train_correct = 0
#     train_total = 0

#     train_loop = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
#     for frames, labels in train_loop:
#         frames, labels = frames.to(device), labels.to(device)
#         optimizer.zero_grad()

#         with autocast():
#             outputs = model(frames)
#             loss = criterion(outputs, labels)

#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()

#         running_loss += loss.item()

#         _, predicted = torch.max(outputs, 1)
#         train_total += labels.size(0)
#         train_correct += (predicted == labels).sum().item()

#         train_loop.set_postfix(loss=loss.item(), accuracy=100 * train_correct / train_total)

#     train_loss = running_loss / len(train_loader)
#     train_accuracy = 100 * train_correct / train_total

#     train_losses.append(train_loss)
#     train_accuracies.append(train_accuracy)

#     torch.cuda.empty_cache()

#     model.eval()
#     val_loss = 0.0
#     val_correct = 0
#     val_total = 0
#     with torch.no_grad():
#         for frames, labels in val_loader:
#             frames, labels = frames.to(device), labels.to(device)
#             with autocast():
#                 outputs = model(frames)
#                 loss = criterion(outputs, labels)
#             val_loss += loss.item()
#             _, predicted = torch.max(outputs, 1)
#             val_total += labels.size(0)
#             val_correct += (predicted == labels).sum().item()

#     val_loss /= len(val_loader)
#     val_accuracy = 100 * val_correct / val_total

#     val_losses.append(val_loss)
#     val_accuracies.append(val_accuracy)


#     scheduler.step(val_loss)
#     lr = scheduler.get_last_lr()

#     print(f"Epoch {epoch+1}/{num_epochs}: "
#           f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}% | "
#           f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

#     early_stopping(val_loss)
#     if early_stopping.early_stop:
#       print('Early stopping')
#       break

#     fold_val_losses.append(val_losses[-1])
#     fold_val_accuracies.append(val_accuracies[-1])

# avg_val_loss = np.mean(fold_val_losses)
# avg_val_accuracy = np.mean(fold_val_accuracies)

# print(f"Average Val Loss across {k_folds} folds: {avg_val_loss:.4f}")
# print(f"Average Val Accuracy across {k_folds} folds: {avg_val_accuracy:.2f}%")

In [None]:
# Peek at the first raw hold-out row (saved before the preprocessor was applied).
X_final_test[:1]

In [None]:
# Labels of the hold-out split, in the same row order as X_final_test.
y_final_test

In [None]:
# # {'lr': 0.0006056915778868701, 'batch_size': 128, 'out_channels1': 64, 'out_channels2': 512, 'kernel_size': 3, 'dropout': 0.21777348000182786, 'num_layers': 5, 'hidden_size': 512}
# class CNN1D_LSTM(nn.Module):
#     def __init__(self, num_classes=2, input_features=9, out_channels1=64, out_channels2=512,
#                  kernel_size=3, dropout=0.21777348000182786, num_layers=5, hidden_size=512):
#         super(CNN1D_LSTM, self).__init__()
#         self.lstm = nn.LSTM(input_size=input_features, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

#         self.conv_block1 = nn.Sequential(
#             nn.Conv1d(in_channels=1, out_channels=out_channels1, kernel_size=kernel_size, padding=kernel_size//2),
#             nn.ReLU(),
#             nn.BatchNorm1d(out_channels1),
#             nn.MaxPool1d(2)
#         )
#         self.conv_block2 = nn.Sequential(
#             nn.Conv1d(in_channels=out_channels1, out_channels=out_channels2, kernel_size=kernel_size, padding=kernel_size//2),
#             nn.ReLU(),
#             nn.BatchNorm1d(out_channels2),
#             nn.MaxPool1d(2)
#         )
#         with torch.no_grad():
#             dummy = torch.zeros(1, 1, input_features)
#             dummy = self.conv_block1(dummy)
#             dummy = self.conv_block2(dummy)
#             flattened_size = dummy.numel()

#         self.fc_block = nn.Sequential(
#             nn.Linear(flattened_size, 128),
#             nn.ReLU(),
#             nn.Dropout(dropout),
#             nn.Linear(128, num_classes)
#         )

#     def forward(self, x):
#           x = x.unsqueeze(1)
#           x = self.conv_block1(x)
#           x = self.conv_block2(x)
#           x = torch.flatten(x, 1)
#           x = self.fc_block(x)
#           return x

# model = CNN1D_LSTM()

# features_num = [
#     'ra', 'dec', 'orbper', 'duration', 'depth', 'rad', 'steff',
#     'logg', 'srad'
# ]
# features_cat = []

# transformer_num = make_pipeline(
#     SimpleImputer(strategy="median"),
#     StandardScaler(),
# )

# transformer_cat = make_pipeline(
#     SimpleImputer(strategy="median", fill_value="NA"),
#     OneHotEncoder(handle_unknown='ignore'),
# )

# preprocessor = make_column_transformer(
#     (transformer_num, features_num),
#     (transformer_cat, features_cat),
# )

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Z = pd.read_csv('/content/new_combined.csv')
# y_f = Z.pop('disp')
# # Z = preprocessor.transform(Z)

# state_dict = torch.load('/content/CNN1D_LSTM.pth')

# model.load_state_dict(state_dict['model_state_dict'])
# model = model.to(device)

# sample_idx = range(len(Z))
# sample_idx

# for i in sample_idx:
#     sample_data = Z.iloc[i]
#     # true_label = y_final_test.iloc[i]

#     sample_processed = preprocessor.transform(pd.DataFrame(sample_data).T)
#     sample_tensor = torch.tensor(sample_processed, dtype=torch.float32).to(device)

#     with torch.no_grad():
#         output = model(sample_tensor)
#         _, predicted = torch.max(output, 1)
#         predicted_label = predicted.item()

#     # print(f"True class: {true_label}")
#     print(f"Predicted class: {predicted_label}")
#     logits = output

#     probs = torch.softmax(logits, dim=1)
#     print("Ймовірності:", probs.cpu().numpy())

#     predicted_class = torch.argmax(logits, dim=1).item()
#     print("Кінцевий результат (клас):", predicted_class, '\n')

In [None]:
# Spot-check the network on a few raw hold-out rows.
sample_idx = [1, 2, 3, 4, 5]

# Single-row inference must run in eval mode (dropout off, BatchNorm using its
# running statistics); do not rely on an earlier cell having set it.
model.eval()

for i in sample_idx:
    sample_data = X_final_test.iloc[i]
    true_label = y_final_test.iloc[i]

    # iloc[[i]] yields a one-row DataFrame that keeps the original column dtypes.
    sample_processed = preprocessor.transform(X_final_test.iloc[[i]])
    sample_tensor = torch.tensor(sample_processed, dtype=torch.float32).to(device)

    with torch.no_grad():
        output = model(sample_tensor)
        probs = torch.softmax(output, dim=1)
        # argmax over the logits gives the predicted class.
        predicted_label = torch.argmax(output, dim=1).item()

    print("Інформація про об'єкт:")
    print(sample_data, '\n')
    print(f"True class: {true_label}")
    print(f"Predicted class: {predicted_label}")
    print("Ймовірності:", probs.cpu().numpy())
    print("Кінцевий результат (клас):", predicted_label, '\n')

# **MACHINE LEARNING**

In [None]:
!pip install catboost

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from scipy import stats
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import StackingClassifier


In [None]:
import warnings

# Silence every warning raised from this point on. This is a blanket filter:
# deprecation and convergence warnings from the libraries below are hidden too.
warnings.filterwarnings("ignore")

In [None]:
# RBF-kernel support vector classifier with fixed hyper-parameters.
svm_clf = SVC(C=10, gamma=0.5, kernel='rbf')

# params = {"C":(0.1, 0.5, 1, 2, 5, 10),
#           "gamma":(0.01, 0.1, 0.5, 1),}

# svm_cv = GridSearchCV(svm_clf, params, n_jobs=-1, cv=5, verbose=1, scoring="accuracy")
# svm_cv.fit(X_train, y_train)
# best_params = svm_cv.best_params_
# print(f"Best params: {best_params}")

# svm_clf = SVC(**best_params)

svm_clf.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = svm_clf.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred)
svm_f1_binary = f1_score(y_test, y_pred)
svm_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(svm_accuracy)
print(svm_report)
print(f"F1-score: {svm_f1_binary}, {svm_f1_weighted}")

In [None]:
# Single decision tree with a fixed seed.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = tree_clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
tree_f1_binary = f1_score(y_test, y_pred)
tree_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(tree_accuracy)
print(tree_report)
print(f"F1-score: {tree_f1_binary}, {tree_f1_weighted}")

In [None]:
# Result of an earlier search, kept for reference (not the settings used below):
# Best parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 0.5, 'max_depth': None, 'criterion': 'entropy', 'class_weight': 'balanced', 'bootstrap': True}
rf_clf = RandomForestClassifier(random_state=42, n_estimators=50)

# param_dist = {
#         'n_estimators': [50, 100, 200],
#         'max_depth': [None, 10, 20],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4],
#         'max_features': ['sqrt', 'log2'],
#         'bootstrap': [True, False]
#     }

# rf_clf = RandomForestClassifier()
# rf_clf = RandomizedSearchCV(
#     rf_clf, param_dist, n_iter=15,
#     scoring='accuracy', n_jobs=-1, verbose=1
# )

# rf_clf.fit(X_train, y_train)
# best_params = rf_clf.best_params_
# print(f"Best paramters: {best_params}")

# rf_clf = RandomForestClassifier(**best_params)

rf_clf.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
rf_f1_binary = f1_score(y_test, y_pred)
rf_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(rf_accuracy)
print(rf_report)
print(f"F1-score: {rf_f1_binary}, {rf_f1_weighted}")

In [27]:
# Result of an earlier search, kept for reference (not the settings used below):
# {'learning_rate': np.float64(0.1530530058619579), 'max_depth': 9, 'n_estimators': 227} 0.8229166666666666
#
# `use_label_encoder` is dropped: the installed XGBoost ignores it and only
# prints a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_clf = XGBClassifier(learning_rate=0.0666175730453532, max_depth=6, n_estimators=394)

# param_grid = dict(
#     n_estimators=stats.randint(10, 1000),
#     max_depth=stats.randint(1, 10),
#     learning_rate=stats.uniform(0, 1)
# )

# xgb_clf = XGBClassifier(use_label_encoder=False)
# xgb_cv = RandomizedSearchCV(
#     xgb_clf, param_grid, n_iter=45,
#     scoring='accuracy', n_jobs=-1, verbose=1
# )
# xgb_cv.fit(X_train, y_train)
# best_params = xgb_cv.best_params_
# print(f"Best paramters: {best_params}")

# xgb_clf = XGBClassifier(**best_params)

xgb_clf.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = xgb_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"F1-score: {f1_score(y_test, y_pred)}, {f1_score(y_test, y_pred,average='weighted')}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.7846347607052897
              precision    recall  f1-score   support

         0.0       0.79      0.67      0.72       335
         1.0       0.78      0.87      0.82       459

    accuracy                           0.78       794
   macro avg       0.79      0.77      0.77       794
weighted avg       0.79      0.78      0.78       794

F1-score: 0.8235294117647058, 0.7814301933320582


In [None]:
# LightGBM gradient-boosted trees with fixed hyper-parameters and seed.
lgbm = LGBMClassifier(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
)

lgbm.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = lgbm.predict(X_test)
lgbm_accuracy = accuracy_score(y_test, y_pred)
lgbm_report = classification_report(y_test, y_pred)
lgbm_f1_binary = f1_score(y_test, y_pred)
lgbm_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(lgbm_accuracy)
print(lgbm_report)
print(f"F1-score: {lgbm_f1_binary}, {lgbm_f1_weighted}")

In [None]:
# CatBoost gradient-boosted trees with fixed hyper-parameters and seed; training log silenced.
catboost = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.1,
    random_seed=42,
    verbose=False,
)

catboost.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = catboost.predict(X_test)
catboost_accuracy = accuracy_score(y_test, y_pred)
catboost_report = classification_report(y_test, y_pred)
catboost_f1_binary = f1_score(y_test, y_pred)
catboost_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(catboost_accuracy)
print(catboost_report)
print(f"F1-score: {catboost_f1_binary}, {catboost_f1_weighted}")

In [None]:
# Extremely randomised trees with fixed hyper-parameters and seed.
extratrees = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=5,
    random_state=42,
)

extratrees.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = extratrees.predict(X_test)
extratrees_accuracy = accuracy_score(y_test, y_pred)
extratrees_report = classification_report(y_test, y_pred)
extratrees_f1_binary = f1_score(y_test, y_pred)
extratrees_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(extratrees_accuracy)
print(extratrees_report)
print(f"F1-score: {extratrees_f1_binary}, {extratrees_f1_weighted}")

In [None]:
# scikit-learn gradient boosting with fixed hyper-parameters (no seed is set).
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
)

gb.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred)
gb_report = classification_report(y_test, y_pred)
gb_f1_binary = f1_score(y_test, y_pred)
gb_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(gb_accuracy)
print(gb_report)
print(f"F1-score: {gb_f1_binary}, {gb_f1_weighted}")

In [None]:
# 5-fold cross-validation on the raw frame `X`. The estimator is wrapped in a
# pipeline with the preprocessor so that every fold gets the same feature
# selection and scaling as the hold-out evaluation, fitted on that fold's
# training part only. cross_val_score clones the pipeline, so the already
# fitted `preprocessor` and `gb` objects are left untouched.
print(cross_val_score(make_pipeline(preprocessor, gb), X, y, cv=5).mean())

In [None]:
# Base learners for a hard-voting (majority vote) ensemble.
rf = RandomForestClassifier(n_estimators=50, random_state=42)
# `use_label_encoder` is dropped: the installed XGBoost ignores it and only
# prints a "Parameters ... are not used" warning.
xgb = XGBClassifier(learning_rate=0.0666175730453532, max_depth=6, n_estimators=394, random_state=42)
svm_clf = SVC(kernel = 'rbf', C = 10, gamma = 0.5, random_state=42)


voting_clf = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('svm_clf', svm_clf)
        ], voting='hard'
    )
voting_clf.fit(X_train, y_train)

# Hold-out metrics: accuracy, per-class report, then binary and weighted F1.
y_pred = voting_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"F1-score: {f1_score(y_test, y_pred)}, {f1_score(y_test, y_pred,average='weighted')}")

In [None]:
# 5-fold cross-validation of each model on the raw frame `X`, one mean accuracy
# per line in the order rf, xgb, catboost, lgbm. Each estimator is wrapped in a
# pipeline with the preprocessor so every fold gets the same feature selection
# and scaling as the hold-out evaluation, fitted on that fold's training part
# only (cross_val_score clones the pipeline; the fitted originals are untouched).
for cv_estimator in (rf, xgb, catboost, lgbm):
    print(cross_val_score(make_pipeline(preprocessor, cv_estimator), X, y, cv=5).mean())

In [None]:
# 5-fold cross-validation of the voting ensemble on the raw frame `X`. The
# ensemble contains an RBF SVM, which is sensitive to feature scale, so it is
# wrapped in a pipeline with the preprocessor: each fold is then standardised
# exactly like the hold-out evaluation, using that fold's training part only.
print(cross_val_score(make_pipeline(preprocessor, voting_clf), X, y, cv=5).mean())

In [None]:
import matplotlib.pyplot as plt

# Column 1 of predict_proba is the score for the positive class.
xgb_probs = xgb_clf.predict_proba(X_test)[:, 1]

# ROC curve points and the area under the curve for the XGBoost model.
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_probs)
xgb_auc = roc_auc_score(y_test, xgb_probs)

plt.figure(figsize=(12, 8))
plt.plot(xgb_fpr, xgb_tpr, label='(AUC = %0.3f)' % xgb_auc, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Plot')
plt.legend()
plt.show()

In [None]:
# Confusion matrix of the hold-out labels against `y_pred`, drawn as an annotated heatmap.
# NOTE(review): `y_pred` is whatever the most recently executed cell assigned —
# TODO confirm which model this heatmap is meant to describe, or recompute the
# predictions in this cell.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Spot-check the XGBoost model on the first ten raw hold-out rows.
sample_idx = list(range(10))

for i in sample_idx:
    sample_data = X_final_test.iloc[i]
    true_label = y_final_test.iloc[i]

    # Turn the row back into a one-row frame so the preprocessor can select columns by name.
    sample_processed = preprocessor.transform(sample_data.to_frame().T)

    predicted_label = xgb_clf.predict(sample_processed)[0]
    probabilities = xgb_clf.predict_proba(sample_processed)[0]

    print("Information:")
    print(sample_data, '\n')
    # print(f"True class: {true_label}")
    verdict = "This is not exoplanet" if predicted_label == 0 else "This is exoplanet"
    print(verdict)

    # print(f"Probabilities: {probabilities}")
    print(f"Class: {predicted_label}", '\n')
    print("-" * 50)

In [None]:
import joblib
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer


# Stand-alone inference: load the catalogue and a saved model, then classify every row.
df = pd.read_csv('/content/new_combined.csv').dropna()
# SECURITY NOTE: joblib.load unpickles arbitrary Python objects; only load
# model files that come from a trusted source.
loaded_model = joblib.load('/content/model_xgb_final.pkl')
df = df.drop('disp',axis=1)

features_num = [
    'ra', 'dec', 'orbper', 'duration', 'depth', 'rad', 'steff',
    'logg', 'srad'
]
features_cat = []

transformer_num = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# `fill_value` is only honoured with strategy="constant"; "median" cannot be
# computed for string categories and would fail once `features_cat` is non-empty.
transformer_cat = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown='ignore'),
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

# NOTE(review): this re-fits the scaler on the full catalogue. If the saved
# model was trained on features scaled with training-split statistics, the
# inputs here are scaled slightly differently — TODO confirm, and preferably
# persist and load the fitted preprocessor together with the model.
preprocessor.fit(df)

# Transform and score the whole frame once instead of once per row; the
# per-row results are identical because scaling and prediction are row-wise.
processed_rows = preprocessor.transform(df)
predicted_labels = loaded_model.predict(processed_rows)
predicted_probabilities = loaded_model.predict_proba(processed_rows)

sample_idx = range(len(df))

for i in sample_idx:
    sample_data = df.iloc[i]
    predicted_label = predicted_labels[i]
    probabilities = predicted_probabilities[i]

    print("Information:")
    print(sample_data, '\n')
    if predicted_label == 0:
        print("This is not exoplanet")
    else:
        print("This is exoplanet")

    # print(f"Probabilities: {probabilities}")
    print(f"Class: {predicted_label}", '\n')
    print("-" * 50)