### Problema de classificação de Cosmos.

> O objetivo é classificar tipos de corpos celestes como galáxias, quasares e estrelas.

In [2]:
import pandas as pd
import numpy as np

In [5]:
# Load the star classification dataset from CSV and show its (rows, columns) shape.
df_stelar = pd.read_csv('star_classification.csv')
df_stelar.shape

(100000, 18)

- 1. obj_ID (Identificador do Objeto): Um valor único que identifica cada objeto astronômico no catálogo de imagens.
- 2. alpha (Ascensão Reta): Medida angular que indica a posição leste-oeste de um objeto no céu, semelhante à longitude na Terra. Neste conjunto de dados os valores estão expressos em graus (de 0 a 360).
- 3. delta (Declinação): Ângulo que mostra a posição norte-sul de um objeto no céu, similar à latitude terrestre, medido em graus.
- 4. u (Filtro Ultravioleta): Medida da intensidade da luz ultravioleta emitida ou refletida por um objeto astronômico.
- 5. g (Filtro Verde): Medida da luz na faixa do verde, indicando propriedades como temperatura e composição.
- 6. r (Filtro Vermelho): Semelhante ao filtro verde, mas para a faixa de luz vermelha.
- 7. i (Filtro de Infravermelho Próximo): Captura informações na faixa do infravermelho próximo, útil para observar objetos frios ou distantes.
- 8. z (Filtro de Infravermelho): Usado para observações em comprimentos de onda mais longos no espectro do infravermelho, importante para estudar objetos muito distantes.
- 9. run_ID (Número da Corrida): Identificador da sessão específica ou "corrida" de observação em que os dados foram coletados.
- 10. rerun_ID (Número da Reexecução): Indica um novo processamento ou análise dos dados originais.
- 11. cam_col (Coluna da Câmera): Identifica a linha específica de dados dentro de uma varredura da câmera.
- 12. field_ID (Número do Campo): Identifica cada campo de observação dentro de uma corrida.
- 13. spec_obj_ID (Identificador do Objeto Espectroscópico): Um identificador único para objetos observados com espectroscopia, indicando que observações diferentes com o mesmo ID devem ser classificadas da mesma forma.
- 14. class (Classe do Objeto): Categorização do objeto astronômico (por exemplo, galáxia, estrela ou quasar).
- 15. redshift (Desvio para o Vermelho): Medida do deslocamento do espectro de luz para comprimentos de onda mais longos, usado para determinar a velocidade e a distância de objetos distantes no universo.
- 16. plate (ID da Placa): Identifica a placa específica usada no telescópio para coletar dados.
- 17. MJD (Data Juliana Modificada): A data em que os dados foram coletados, uma continuação do calendário juliano usado em astronomia para datas.
- 18. fiber_ID (Identificador da Fibra): Identifica a fibra óptica específica usada para direcionar a luz de um objeto astronômico para o plano focal durante uma observação.

In [10]:
df_stelar

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [12]:
## Number of objects in each class
df_stelar['class'].value_counts()

GALAXY    59445
STAR      21594
QSO       18961
Name: class, dtype: int64

In [14]:
## Share of each class (relative frequencies that sum to 1)
df_stelar['class'].value_counts(normalize=True)

GALAXY    0.59445
STAR      0.21594
QSO       0.18961
Name: class, dtype: float64

**As classes não estão balanceadas (aproximadamente 59% galáxias, 22% estrelas e 19% quasares). Mas vamos tentar trabalhar com essas classes desbalanceadas inicialmente.**

In [19]:
## Check every column for missing (NaN) values.
## NOTE(review): isna() only flags NaN. The describe() output in this notebook shows a
## minimum of -9999.0 for u, g and z, which looks like a sentinel for missing
## measurements and is not counted here - confirm and handle before modelling.
df_stelar.isna().sum()

obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64

In [20]:
df_stelar.describe()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1.237665e+18,177.629117,24.135305,21.980468,20.531387,19.645762,19.084854,18.66881,4481.36606,301.0,3.51161,186.13052,5.783882e+18,0.576661,5137.00966,55588.6475,449.31274
std,8438560000000.0,96.502241,19.644665,31.769291,31.750292,1.85476,1.757895,31.728152,1964.764593,0.0,1.586912,149.011073,3.324016e+18,0.730707,2952.303351,1808.484233,272.498404
min,1.237646e+18,0.005528,-18.785328,-9999.0,-9999.0,9.82207,9.469903,-9999.0,109.0,301.0,1.0,11.0,2.995191e+17,-0.009971,266.0,51608.0,1.0
25%,1.237659e+18,127.518222,5.146771,20.352353,18.96523,18.135828,17.732285,17.460677,3187.0,301.0,2.0,82.0,2.844138e+18,0.054517,2526.0,54234.0,221.0
50%,1.237663e+18,180.9007,23.645922,22.179135,21.099835,20.12529,19.405145,19.004595,4188.0,301.0,4.0,146.0,5.614883e+18,0.424173,4987.0,55868.5,433.0
75%,1.237668e+18,233.895005,39.90155,23.68744,22.123767,21.044785,20.396495,19.92112,5326.0,301.0,5.0,241.0,8.332144e+18,0.704154,7400.25,56777.0,645.0
max,1.237681e+18,359.99981,83.000519,32.78139,31.60224,29.57186,32.14147,29.38374,8162.0,301.0,6.0,989.0,1.412694e+19,7.011245,12547.0,58932.0,1000.0


In [22]:
## Build the modelling dataset with the columns the model will use.
## Identifier columns are dropped because they are unlikely to carry useful information.
## NOTE(review): only 'obj_ID' is listed here; the other *_ID columns
## (e.g. spec_obj_ID, rerun_ID) are still kept as features - confirm this is intended.
cols_ids = ['obj_ID']
df_stelar_modeling = df_stelar.drop(columns=cols_ids)

df_stelar_modeling.shape

(100000, 17)

In [23]:
df_stelar_modeling

Unnamed: 0,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [28]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import pandas as pd

# NOTE(review): leftover template cell - apart from the import above, everything below is commented out.

# Assuming 'data' is a pandas DataFrame with the feature columns and 'target' is the target column

# Load your data here
# data = pd.read_csv('seu_arquivo.csv')
# X = data.drop('target', axis=1)
# y = data['target']







# Evaluate the model
# print(classification_report(y_test, y_pred))

# ROC curve
# fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
# roc_auc = auc(fpr, tpr)

# Plot the ROC curve
# plt.figure()
# plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic')
# plt.legend(loc="lower right")
# plt.show()

# Plot the confusion matrix
# sns.heatmap(pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predito']), annot=True, fmt='d')
# plt.show()

# Remember to replace 'seu_arquivo.csv' with the path to your data file and 'target' with the name of your target column.


In [24]:
df_stelar_modeling

Unnamed: 0,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [29]:
## Separate the target column ('class') from the feature columns.
## Both are copied so later changes do not touch df_stelar_modeling.
y = df_stelar_modeling['class'].copy()
X = df_stelar_modeling.drop(columns=['class']).copy()

In [30]:
# Split the data into training and test sets (20% held out for testing, fixed seed for reproducibility).
# NOTE(review): no `stratify=` argument is passed, so the class proportions are not
# guaranteed to match between the two sets - consider stratifying on y for imbalanced classes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
y_train

75220    GALAXY
48955    GALAXY
44966    GALAXY
13568    GALAXY
92727      STAR
          ...  
6265        QSO
54886    GALAXY
76820      STAR
860      GALAXY
15795    GALAXY
Name: class, Length: 80000, dtype: object

In [37]:
from sklearn.preprocessing import LabelEncoder


In [53]:
# Convert the categorical class labels into integer codes.
# The encoder is fitted on the training labels only; the test labels are mapped
# with that same fitted encoder (transform, not fit_transform) so both splits
# are guaranteed to share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [54]:
# Train an XGBoost classifier (default constructor arguments) on the integer-encoded training labels.
model = XGBClassifier()
model.fit(X_train, y_train_encoded)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [55]:
# Predict the class of every object in the test set.
y_pred = model.predict(X_test)
# Keep the full probability matrix (one column per class). Slicing `[:, 1]` is the
# binary-classification idiom; in this three-class problem it would keep only the
# probability of a single class and could not be used for multi-class ROC AUC.
y_pred_proba = model.predict_proba(X_test)

In [61]:
# Evaluate the model: per-class precision, recall and F1 on the test set.
# The rows are the integer codes produced by the label encoder
# (presumably 0=GALAXY, 1=QSO, 2=STAR - verify with label_encoder.classes_).
print(classification_report(y_test_encoded, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     11860
           1       0.96      0.93      0.95      3797
           2       0.99      1.00      0.99      4343

    accuracy                           0.98     20000
   macro avg       0.98      0.97      0.97     20000
weighted avg       0.98      0.98      0.98     20000



## Realizar KFold para avaliar com melhores distribuições de dados.

In [78]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

def perform_kfold_cv_with_encoding(X, y, model, n_splits=5):
    """
    Run shuffled K-Fold cross-validation for a classifier with a categorical target.

    The target is label-encoded exactly once, before splitting, so every fold
    shares the same label -> integer mapping. For each fold the model is fitted
    on the training part and scored on the held-out part. Precision, recall and
    F1 use ``average='weighted'``.

    Parameters:
    - X (pd.DataFrame): Feature set; rows are selected positionally with ``.iloc``.
    - y (pd.Series): Target variable (class labels).
    - model (model object): The machine learning model to train. ``model.fit`` is
      called on this very object once per fold, so it is modified by the call.
    - n_splits (int): Number of folds. Default is 5.

    Returns:
    - dict: Average 'Accuracy', 'Precision', 'Recall' and 'F1 Score' over the
      folds, plus 'Dataset_KFold', a DataFrame holding the same four metrics
      with one row per fold. ROC AUC is not computed.
    """
    # Shuffled folds with a fixed seed so the partition is reproducible.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Encode the target once, on the full label set. Re-fitting an encoder inside
    # each fold (on the train and test parts separately) is redundant and could
    # remap the labels inconsistently if a fold happened to miss a class.
    y_encoded = LabelEncoder().fit_transform(y)

    # Per-fold metric values, in fold order.
    accuracies, precisions, recalls, f1_scores = [], [], [], []

    for train_index, test_index in kf.split(X):
        # Positional split of features and encoded labels for this fold.
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y_encoded[train_index], y_encoded[test_index]

        # Fit on the training part and predict the held-out part.
        model.fit(X_fold_train, y_fold_train)
        y_pred = model.predict(X_fold_test)

        # Score the fold.
        accuracies.append(accuracy_score(y_fold_test, y_pred))
        precisions.append(precision_score(y_fold_test, y_pred, average='weighted'))
        recalls.append(recall_score(y_fold_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_fold_test, y_pred, average='weighted'))

    # One row per fold, one column per metric.
    dataset_kfold = pd.DataFrame({
        'Accuracy': accuracies,
        'Precision': precisions,
        'Recall': recalls,
        'F1 Score': f1_scores,
    })

    # Averages over the folds, plus the per-fold table for inspection.
    scores = {
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'F1 Score': np.mean(f1_scores),
        'Dataset_KFold': dataset_kfold
    }

    return scores




In [79]:
# Example usage: cross-validate a default XGBoost classifier (n_splits defaults to 5).
model = XGBClassifier()
scores = perform_kfold_cv_with_encoding(X, y, model)
print(scores)

# Remember to replace X, y and model with your own data and model.

{'Accuracy': 0.9768799999999999, 'Precision': 0.9767846218600666, 'Recall': 0.9768799999999999, 'F1 Score': 0.9767767321096681, 'Dataset_KFold':    Accuracy  Precision   Recall  F1 Score
0   0.97675   0.976646  0.97675  0.976637
1   0.97630   0.976185  0.97630  0.976191
2   0.97680   0.976717  0.97680  0.976688
3   0.97800   0.977927  0.97800  0.977911
4   0.97655   0.976449  0.97655  0.976456}


In [80]:
scores

{'Accuracy': 0.9768799999999999,
 'Precision': 0.9767846218600666,
 'Recall': 0.9768799999999999,
 'F1 Score': 0.9767767321096681,
 'Dataset_KFold':    Accuracy  Precision   Recall  F1 Score
 0   0.97675   0.976646  0.97675  0.976637
 1   0.97630   0.976185  0.97630  0.976191
 2   0.97680   0.976717  0.97680  0.976688
 3   0.97800   0.977927  0.97800  0.977911
 4   0.97655   0.976449  0.97655  0.976456}