In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [8]:
%%capture
!pip install xgboost
!pip install catboost
!pip install optuna

In [9]:
pd.set_option("display.max_columns", None)

# TESS (TOI)

In [10]:
df = pd.read_csv('./data_filtered/TOI_2025.10.03_08.53.59.csv')
print(df.shape)
df = df[df["tfopwg_disp"].isin(["PC", "FP", "CP"])]
df["tfopwg_disp"] = df["tfopwg_disp"].replace({
    "PC": "CANDIDATE",
    "FP": "FALSE POSITIVE",
    "CP": "CONFIRMED",
})
print(df.shape)

(7703, 66)
(6560, 66)


In [11]:
df['tfopwg_disp'].value_counts()

tfopwg_disp
CANDIDATE         4679
FALSE POSITIVE    1197
CONFIRMED          684
Name: count, dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6560 entries, 0 to 7702
Data columns (total 66 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   loc_rowid        6560 non-null   int64  
 1   toi              6560 non-null   float64
 2   tid              6560 non-null   int64  
 3   tfopwg_disp      6560 non-null   object 
 4   rastr            6560 non-null   object 
 5   ra               6560 non-null   float64
 6   decstr           6560 non-null   object 
 7   dec              6560 non-null   float64
 8   st_pmra          6446 non-null   float64
 9   st_pmraerr1      6446 non-null   float64
 10  st_pmraerr2      6446 non-null   float64
 11  st_pmralim       6446 non-null   float64
 12  st_pmdec         6446 non-null   float64
 13  st_pmdecerr1     6446 non-null   float64
 14  st_pmdecerr2     6446 non-null   float64
 15  st_pmdeclim      6446 non-null   float64
 16  pl_tranmid       6560 non-null   float64
 17  pl_tranmiderr1   65

In [13]:
# df.drop(['loc_rowid','tid','toi','rastr','decstr','pl_insolerr1','pl_insolerr2',
#          'pl_insollim','pl_eqterr1','pl_eqterr2','pl_eqtlim','toi_created',
#          'rowupdate',
# "st_logglim",
# "st_pmralim",
# "st_tefflim",
# "pl_radelim",
# "pl_trandeplim",
# "st_pmdeclim",
# "st_distlim",
# "pl_tranmidlim",
# "pl_orbperlim",
# "pl_trandurhlim",
# "st_tmaglim",
# "st_tmagerr2",
# "st_radlim"
# ],axis=1,inplace=True)

df.drop(['loc_rowid','tid','toi','rastr','decstr','pl_insolerr1','pl_insolerr2',
         'pl_insollim','pl_eqterr1','pl_eqterr2','pl_eqtlim','toi_created',
         'rowupdate'
],axis=1,inplace=True)



In [14]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Aplicar no label
df["label"] = encoder.fit_transform(df["tfopwg_disp"])
encoder = LabelEncoder()
df.drop(['tfopwg_disp'],axis=1,inplace=True)
df.head()

Unnamed: 0,ra,dec,st_pmra,st_pmraerr1,st_pmraerr2,st_pmralim,st_pmdec,st_pmdecerr1,st_pmdecerr2,st_pmdeclim,pl_tranmid,pl_tranmiderr1,pl_tranmiderr2,pl_tranmidlim,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_trandurh,pl_trandurherr1,pl_trandurherr2,pl_trandurhlim,pl_trandep,pl_trandeperr1,pl_trandeperr2,pl_trandeplim,pl_rade,pl_radeerr1,pl_radeerr2,pl_radelim,pl_insol,pl_eqt,st_tmag,st_tmagerr1,st_tmagerr2,st_tmaglim,st_dist,st_disterr1,st_disterr2,st_distlim,st_teff,st_tefferr1,st_tefferr2,st_tefflim,st_logg,st_loggerr1,st_loggerr2,st_logglim,st_rad,st_raderr1,st_raderr2,st_radlim,label
0,112.357708,-12.69596,-5.964,0.085,-0.085,0.0,-0.076,0.072,-0.072,0.0,2459230.0,0.001657,-0.001657,0,2.171348,0.000264,-0.000264,0,2.01722,0.319588,-0.319588,0,656.886099,37.77821,-37.77821,0,5.818163,1.910546,-1.910546,0,22601.948581,3127.204052,9.604,0.013,-0.013,0,485.735,11.9515,-11.9515,0,10249.0,264.7,-264.7,0,4.19,0.07,-0.07,0,2.16986,0.072573,-0.072573,0,2
1,122.580465,-5.513852,-4.956,0.102,-0.102,0.0,-15.555,0.072,-0.072,0.0,2459988.0,0.001916,-0.001916,0,1.931646,5e-06,-5e-06,0,3.166,0.647,-0.647,0,1286.0,1186.49,-1186.49,0,11.2154,2.6242,-2.6242,0,44464.5,4045.0,9.42344,0.006,-0.006,0,295.862,5.91,-5.91,0,7070.0,126.4,-126.4,0,4.03,0.09,-0.09,0,2.01,0.09,-0.09,0,0
2,104.726966,-10.580455,-1.462,0.206,-0.206,0.0,-2.249,0.206,-0.206,0.0,2459225.0,0.000625,-0.000625,0,1.867557,3e-06,-3e-06,0,1.408,0.184,-0.184,0,1500.0,1.7584,-1.7584,0,23.7529,,,0,2860.61,2037.0,9.299501,0.058,-0.058,0,943.109,106.333,-106.333,0,8924.0,124.0,-124.0,0,,,,0,5.73,,,0,2
3,110.559945,-25.207017,-0.939,0.041,-0.041,0.0,1.64,0.055,-0.055,0.0,2458493.0,0.00535,-0.00535,0,2.74323,0.00108,-0.00108,0,3.167,0.642,-0.642,0,383.41,0.781988,-0.781988,0,,,,0,1177.36,1631.0,9.3003,0.037,-0.037,0,7728.17,1899.57,-1899.57,0,5388.5,567.0,-567.0,0,4.15,1.64,-1.64,0,,,,0,2
4,122.178195,-48.802811,-4.496,0.069,-0.069,0.0,9.347,0.062,-0.062,0.0,2459987.0,0.003748,-0.003748,0,3.573014,1.3e-05,-1.3e-05,0,3.37,1.029,-1.029,0,755.0,1306.55,-1306.55,0,11.3113,3.24714,-3.24714,0,54679.3,4260.0,9.1355,0.006,-0.006,0,356.437,4.6175,-4.6175,0,9219.0,171.1,-171.1,0,4.14,0.07,-0.07,0,2.15,0.06,-0.06,0,2


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6560 entries, 0 to 7702
Data columns (total 53 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ra               6560 non-null   float64
 1   dec              6560 non-null   float64
 2   st_pmra          6446 non-null   float64
 3   st_pmraerr1      6446 non-null   float64
 4   st_pmraerr2      6446 non-null   float64
 5   st_pmralim       6446 non-null   float64
 6   st_pmdec         6446 non-null   float64
 7   st_pmdecerr1     6446 non-null   float64
 8   st_pmdecerr2     6446 non-null   float64
 9   st_pmdeclim      6446 non-null   float64
 10  pl_tranmid       6560 non-null   float64
 11  pl_tranmiderr1   6550 non-null   float64
 12  pl_tranmiderr2   6550 non-null   float64
 13  pl_tranmidlim    6560 non-null   int64  
 14  pl_orbper        6471 non-null   float64
 15  pl_orbpererr1    6451 non-null   float64
 16  pl_orbpererr2    6451 non-null   float64
 17  pl_orbperlim     65

In [16]:
df = df.fillna(df.mean())
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6560 entries, 0 to 7702
Data columns (total 53 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ra               6560 non-null   float64
 1   dec              6560 non-null   float64
 2   st_pmra          6560 non-null   float64
 3   st_pmraerr1      6560 non-null   float64
 4   st_pmraerr2      6560 non-null   float64
 5   st_pmralim       6560 non-null   float64
 6   st_pmdec         6560 non-null   float64
 7   st_pmdecerr1     6560 non-null   float64
 8   st_pmdecerr2     6560 non-null   float64
 9   st_pmdeclim      6560 non-null   float64
 10  pl_tranmid       6560 non-null   float64
 11  pl_tranmiderr1   6560 non-null   float64
 12  pl_tranmiderr2   6560 non-null   float64
 13  pl_tranmidlim    6560 non-null   int64  
 14  pl_orbper        6560 non-null   float64
 15  pl_orbpererr1    6560 non-null   float64
 16  pl_orbpererr2    6560 non-null   float64
 17  pl_orbperlim     65

In [17]:
df['st_tefflim'].unique()

array([0])

In [18]:
df['label'].value_counts()

label
0    4679
2    1197
1     684
Name: count, dtype: int64

## Training and Test

In [19]:

df_final = df[[
"st_disterr2",
"st_tmag",
"st_dist",
"pl_eqt",
"st_disterr1",
"pl_insol",
"pl_rade",
"pl_tranmid",
"pl_radeerr2",
"pl_tranmiderr2",
"label"
]]

df_final = df
df_final = df[[
    "st_dist",
    "st_tmag",
    "pl_eqt",
    "pl_insol",
    "st_disterr2",
    "st_disterr1",
    "pl_rade",
    "pl_tranmid",
    "pl_tranmiderr2",
    "st_raderr2", "label"
]
]
df_final = df

In [20]:
df['label'].value_counts()

label
0    4679
2    1197
1     684
Name: count, dtype: int64

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
import joblib
from time import time

start = time()

# ==============================
# SPLITS E MODELO BASE
# ==============================

# Features e label
X = df_final.drop(['label'], axis=1)
y = df_final['label']

# Primeiro split: 70% treino, 30% restante
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Segundo split: 20% teste, 10% blind validation
X_test, X_blind, y_test, y_blind = train_test_split(
    X_temp, y_temp, test_size=0.3333, stratify=y_temp, random_state=42
)

print(f"Train shape: {X_train_full.shape}")
print(f"Test shape: {X_test.shape}")
print(f"Blind validation shape: {X_blind.shape}")

# ==============================
# CROSS-VALIDATION
# ==============================

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_train_full), 1):
    print(f"\nFold {fold}")
    X_train, X_val = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
    y_train, y_val = y_train_full.iloc[train_idx], y_train_full.iloc[val_idx]

    model = xgb.XGBClassifier(eval_metric='mlogloss', objective='multi:softprob')
    model.fit(X_train.values, y_train.values)

    y_pred_val = model.predict(X_val.values)
    acc = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average='weighted')
    print(f"Validation Accuracy: {acc:.4f} | F1-score: {f1:.4f}")
    fold_results.append({'fold': fold, 'accuracy': acc, 'f1': f1})

acc_mean = np.mean([r['accuracy'] for r in fold_results])
f1_mean = np.mean([r['f1'] for r in fold_results])
print(f"\nMédia Validation Accuracy: {acc_mean:.4f}")
print(f"Média Validation F1-score: {f1_mean:.4f}")

# ==============================
# MODELO FINAL COMPLETO
# ==============================

final_model = xgb.XGBClassifier(eval_metric='mlogloss', objective='multi:softprob')
final_model.fit(X_train_full.values, y_train_full.values)

feat_imp = pd.Series(final_model.feature_importances_, index=X_train_full.columns).sort_values(ascending=False)
feat_imp.to_csv('summary/Tess_complete_all_feature_importance.csv')
print("\nTop 10 Feature importances:")
print(feat_imp.head(10))

# Avaliação no teste
y_pred_test = final_model.predict(X_test.values)
print("\n--- Base Model Test Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_test, average='weighted'):.4f}")

# Avaliação na blind validation
y_pred_blind = final_model.predict(X_blind.values)
print("\n--- Base Model Blind Results ---")
print(f"Accuracy: {accuracy_score(y_blind, y_pred_blind):.4f}")
print(f"F1-score: {f1_score(y_blind, y_pred_blind, average='weighted'):.4f}")

#joblib.dump(final_model, 'xgb_final_model.pkl')

# ==============================
# TESTES COM 90%, 95%, 99% DAS FEATURES
# ==============================

thresholds = [0.90, 0.95, 0.99]
results_summary = []

feat_imp_cumsum = feat_imp.cumsum()

for th in thresholds:
    selected_features = feat_imp_cumsum[feat_imp_cumsum <= th].index.tolist()
    print(th,selected_features)
    print(f"\n===== Testing with top {th*100:.0f}% feature importance =====")
    print(f"Selected features: {len(selected_features)}")

    X_train_sel = X_train_full[selected_features]
    X_test_sel = X_test[selected_features]
    X_blind_sel = X_blind[selected_features]

    model_sel = xgb.XGBClassifier(eval_metric='mlogloss', objective='multi:softprob')
    model_sel.fit(X_train_sel.values, y_train_full.values)

    # Test set
    y_pred_test_sel = model_sel.predict(X_test_sel.values)
    acc_test = accuracy_score(y_test, y_pred_test_sel)
    f1_test = f1_score(y_test, y_pred_test_sel, average='weighted')

    # Blind set
    y_pred_blind_sel = model_sel.predict(X_blind_sel.values)
    acc_blind = accuracy_score(y_blind, y_pred_blind_sel)
    f1_blind = f1_score(y_blind, y_pred_blind_sel, average='weighted')

    print(f"Test Accuracy: {acc_test:.4f} | F1: {f1_test:.4f}")
    print(f"Blind Accuracy: {acc_blind:.4f} | F1: {f1_blind:.4f}")
    report_test = classification_report(y_test, y_pred_test_sel)
    report_blind = classification_report(y_blind, y_pred_blind_sel)
    print(f"\n Number of Features for this model: {X_test_sel.shape[1]}")
    print(f"\n📊 Classification Test Report: TESS (TOI) Complete Model with {th*100:.0f}% feature importance\n", report_test)
    print(f"\n📊 Classification Blind Report: TESS (TOI) Complete Model with {th*100:.0f}% feature importance\n", report_blind)

    results_summary.append({
        'Threshold': f'{int(th*100)}%',
        'N_Features': len(selected_features),
        'Test_Accuracy': acc_test,
        'Test_F1': f1_test,
        'Blind_Accuracy': acc_blind,
        'Blind_F1': f1_blind
    })

    # Salvar modelo em formatos PKL e JSON
    #joblib.dump(model_sel, f'xgb_model_{int(th*100)}perc_features.pkl')
final_model.save_model(f'models/tess_lite.model')

# ==============================
# SUMÁRIO FINAL
# ==============================

results_df = pd.DataFrame(results_summary)
print("\n=== Summary of Reduced Feature Tests ===")
display(results_df)
results_df.to_csv("summary/Tess_complete_feature_importance_summary.csv", index=False)

end = time()
print(f"\nTotal runtime: {end - start:.2f} seconds")


Train shape: (4592, 52)
Test shape: (1312, 52)
Blind validation shape: (656, 52)

Fold 1
Validation Accuracy: 0.8009 | F1-score: 0.7878

Fold 2
Validation Accuracy: 0.8009 | F1-score: 0.7881

Fold 3
Validation Accuracy: 0.7996 | F1-score: 0.7874

Fold 4
Validation Accuracy: 0.7996 | F1-score: 0.7885

Fold 5
Validation Accuracy: 0.7996 | F1-score: 0.7828

Média Validation Accuracy: 0.8001
Média Validation F1-score: 0.7869

Top 10 Feature importances:
st_dist           0.057568
st_tmag           0.055608
pl_eqt            0.045392
pl_insol          0.043111
st_disterr2       0.041034
st_disterr1       0.038840
pl_rade           0.034326
pl_tranmid        0.033627
pl_tranmiderr2    0.031379
st_raderr2        0.030553
dtype: float32

--- Base Model Test Results ---
Accuracy: 0.8026
F1-score: 0.7903

--- Base Model Blind Results ---
Accuracy: 0.7988
F1-score: 0.7787
0.9 ['st_dist', 'st_tmag', 'pl_eqt', 'pl_insol', 'st_disterr2', 'st_disterr1', 'pl_rade', 'pl_tranmid', 'pl_tranmiderr2', 'st_

  self.get_booster().save_model(fname)


Unnamed: 0,Threshold,N_Features,Test_Accuracy,Test_F1,Blind_Accuracy,Blind_F1
0,90%,31,0.789634,0.776933,0.79878,0.780259
1,95%,35,0.801829,0.789473,0.8125,0.795269
2,99%,37,0.79497,0.780643,0.797256,0.775749



Total runtime: 13.94 seconds


In [76]:
X_train_full.to_csv('./NEW/data/tess/X_train_full.csv',index=False)
X_test.to_csv('./NEW/data/tess/X_test.csv',index=False)
X_blind.to_csv('./NEW/data/tess/X_blind.csv',index=False)

y_train_full.to_csv('./NEW/data/tess/y_train_full.csv',index=False)
y_test.to_csv('./NEW/data/tess/y_test.csv',index=False)
y_blind.to_csv('./NEW/data/tess/y_blind.csv',index=False)

In [174]:
X_blind.to_csv('./models/tess/X_blind.csv')
y_blind.to_csv('./models/tess/y_blind.csv')
final_model_tuned.save_model("./models/tess/tess_model.json")
final_model_tuned.save_model("./models/tess/tess_model_tuned.json")

# Testar no Dataset todo

In [111]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Ordena importâncias
feat_imp = pd.Series(final_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

results = {}
for k in [5, 10, 15, 20, 30, len(feat_imp)]:
    selected = feat_imp.index[:k]
    scores = cross_val_score(
        xgb.XGBClassifier(eval_metric='mlogloss'),
        X_train[selected],
        y_train,
        cv=5,
        scoring="accuracy"
    )
    results[k] = np.mean(scores)

print(results)

{5: 0.7245509648000891, 10: 0.7661928858736956, 15: 0.775993252886986, 20: 0.7860609093773749, 30: 0.7996715416411797, 52: 0.7955858310626703}
