In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
data = pd.read_csv('./data/binary_classification_data.csv')

data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0    Mean of the integrated profile                17898 non-null  float64
 1    Standard deviation of the integrated profile  17898 non-null  float64
 2    Excess kurtosis of the integrated profile     17898 non-null  float64
 3    Skewness of the integrated profile            17898 non-null  float64
 4    Mean of the DM-SNR curve                      17898 non-null  float64
 5    Standard deviation of the DM-SNR curve        17898 non-null  float64
 6    Excess kurtosis of the DM-SNR curve           17898 non-null  float64
 7    Skewness of the DM-SNR curve                  17898 non-null  float64
 8   target_class                                   17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


(    Mean of the integrated profile  \
 0                       140.562500   
 1                       102.507812   
 2                       103.015625   
 3                       136.750000   
 4                        88.726562   
 
     Standard deviation of the integrated profile  \
 0                                      55.683782   
 1                                      58.882430   
 2                                      39.341649   
 3                                      57.178449   
 4                                      40.672225   
 
     Excess kurtosis of the integrated profile  \
 0                                   -0.234571   
 1                                    0.465318   
 2                                    0.323328   
 3                                   -0.068415   
 4                                    0.600866   
 
     Skewness of the integrated profile   Mean of the DM-SNR curve  \
 0                            -0.699648                   3.199833   
 1

In [10]:
data.nunique()

 Mean of the integrated profile                   8626
 Standard deviation of the integrated profile    17862
 Excess kurtosis of the integrated profile       17897
 Skewness of the integrated profile              17898
 Mean of the DM-SNR curve                         9000
 Standard deviation of the DM-SNR curve          17894
 Excess kurtosis of the DM-SNR curve             17895
 Skewness of the DM-SNR curve                    17895
target_class                                         2
dtype: int64

전부 수치형은 맞는듯

In [16]:
print("training accuracy/target class ratio: ", data['target_class'].value_counts(1)[0].round(2))

training accuracy/target class ratio:  0.91


In [3]:
from sklearn.model_selection import train_test_split

# Define the feature matrix X and the target y
X = data.drop('target_class', axis=1)
y = data['target_class']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((14318, 8), (3580, 8), (14318,), (3580,))

In [20]:
# Define a function to train a model and calculate performance metrics
def evaluate_model(model, X_train, y_train, X_test):
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train)
    roc_auc = roc_auc_score(y_train, y_pred_train)
    
    # Return the performance metrics
    return accuracy, precision, recall, f1, roc_auc

# Define the models
models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(random_state=42,max_depth = 11, criterion="entropy"),
    SVC(),
    GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5),
    MLPClassifier(hidden_layer_sizes = (30, 30, 30), activation='relu', alpha=0.0001)
]

# Define a list to store the results
results = []

# Evaluate each model
for model in models:
    result = evaluate_model(model, X_train, y_train, X_train)
    results.append(result)

# Create a dataframe to store the results
results_df = pd.DataFrame(results, columns=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score'],
                          index=['Logistic Regression', 'Decision Tree', 'SVM', 'Gradient Boosting', 'Neural Network'])
results_df


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score
Logistic Regression,0.978838,0.941688,0.820941,0.877179,0.907893
Decision Tree,0.993784,0.98654,0.945372,0.965517,0.972032
SVM,0.972692,0.938505,0.752656,0.835368,0.873828
Gradient Boosting,0.991898,0.984677,0.926404,0.954652,0.962471
Neural Network,0.976743,0.957289,0.782246,0.86096,0.889354


하이퍼 파라미터 튜닝,,

In [27]:
# Define the Gradient Boosting model
model = DecisionTreeClassifier(random_state=42, max_depth = 15, criterion="entropy")

# Fit the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, y_pred_test)

accuracy, precision, recall, f1, roc_auc


(0.9723463687150838,
 0.8512658227848101,
 0.838006230529595,
 0.8445839874411303,
 0.9117923144056382)

In [None]:
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=5)

# Fit the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, y_pred_test)

accuracy, precision, recall, f1, roc_auc

양성 클래스(펄서)를 정확하게 감지하는 것이 중요할 수 있으므로, 재현율이 높은 모델?