In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from pycaret.classification import *

def normalize_data(df, ground_truth_col, normalization_type='z-score'):
    """Scale every feature column of ``df`` and return the result as a new frame.

    The ground-truth column is set aside before scaling and re-attached,
    unscaled, as the last column of the returned frame. The scaler is fitted
    on the rows of ``df`` itself.

    Args:
        df: DataFrame holding the feature columns plus the ground-truth
            column. It is not modified.
        ground_truth_col: Name of the label column to leave unscaled.
        normalization_type: ``'z-score'`` to use ``StandardScaler`` or
            ``'min-max'`` to use ``MinMaxScaler``.

    Returns:
        A DataFrame with the same row index as ``df``, the scaled feature
        columns in their original order, and ``ground_truth_col`` appended.

    Raises:
        ValueError: If ``normalization_type`` is neither ``'z-score'`` nor
            ``'min-max'``.
    """
    features = df.drop(ground_truth_col, axis=1)
    ground_truth = df[ground_truth_col]

    # Pick the scaler; both expose the same fit_transform interface.
    if normalization_type == 'z-score':
        scaler = StandardScaler()
    elif normalization_type == 'min-max':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid normalization type. Use 'z-score' or 'min-max'.")

    normalized_features = scaler.fit_transform(features)

    # fit_transform returns a bare array, so the row labels must be restored
    # explicitly. Without index=..., the new frame gets a 0..n-1 index and the
    # label assignment below (which aligns on index) would produce NaN or
    # mismatched labels whenever df carries any other index.
    normalized_df = pd.DataFrame(
        normalized_features,
        columns=features.columns,
        index=features.index,
    )
    normalized_df[ground_truth_col] = ground_truth

    return normalized_df


def balance_classes_with_smote(df, ground_truth_col, random_state=None, plot_distribution=True):
    """Oversample ``df`` with SMOTE and return the resampled frame.

    Args:
        df: DataFrame holding the feature columns plus the label column.
        ground_truth_col: Name of the label column.
        random_state: Forwarded unchanged to ``SMOTE``.
        plot_distribution: When true, print the per-class counts and show a
            bar chart of them both before and after resampling.

    Returns:
        A new DataFrame built from the resampled features, with the resampled
        labels stored under ``ground_truth_col``.
    """
    def _show_distribution(labels, heading, chart_title):
        # Print the class counts, then draw the same counts as a bar chart.
        print(heading)
        counts = labels.value_counts()
        print(counts)
        counts.plot(kind='bar', title=chart_title)
        plt.show()

    feature_frame = df.drop(ground_truth_col, axis=1)
    label_series = df[ground_truth_col]

    if plot_distribution:
        _show_distribution(
            label_series,
            "Class distribution before SMOTE:",
            'Class Distribution Before SMOTE',
        )

    oversampler = SMOTE(random_state=random_state)
    features_balanced, labels_balanced = oversampler.fit_resample(feature_frame, label_series)

    # Rebuild a single frame: features first, label column appended last.
    balanced = pd.DataFrame(features_balanced, columns=feature_frame.columns)
    balanced[ground_truth_col] = labels_balanced

    if plot_distribution:
        _show_distribution(
            labels_balanced,
            "\nClass distribution after SMOTE:",
            'Class Distribution After SMOTE',
        )

    return balanced

In [2]:

if __name__ == "__main__":
    # Parameters ==========================================================
    # Dataset selector: 'fill', '50_50' or 'abalone' (see csv_paths below).
    target_csv  = 'abalone'#'50_50' # fill
    # Name of the label column in the selected dataset.
    ground_truth = 'Sex' #'Diabetes_binary'
    normalization_bool = False
    normalization_type =  'z-score' #'min-max'
    # Ratio features for the abalone data; off by default.
    feature_engineering_bool = False

    display_bool = False
    rand_seed = 42

    #1.1 Load ============================================================
    csv_paths = {
        'fill': 'diabetes_binary_health_indicators_BRFSS2015.csv',
        '50_50': 'diabetes_binary_5050split_health_indicators_BRFSS2015.csv',
        'abalone': '../abalone/data.csv',
    }
    # Fail here with a clear message instead of a NameError on `df` later.
    if target_csv not in csv_paths:
        raise ValueError(
            f"Unknown target_csv {target_csv!r}. Use one of {sorted(csv_paths)}."
        )
    df = pd.read_csv(csv_paths[target_csv])

    #2.1 Feature Engineering ============================================================
    if target_csv == 'abalone' and feature_engineering_bool:
        # The 1e-8 added to each denominator avoids division by zero.
        # Aspect Ratios
        df['Length_Diameter_Ratio'] = df['Length'] / (df['Diameter'] + 1e-8)
        df['Length_Height_Ratio'] = df['Length'] / (df['Height'] + 1e-8)
        df['Diameter_Height_Ratio'] = df['Diameter'] / (df['Height'] + 1e-8)

        # Weight Ratios
        df['Shell_Whole_Weight_Ratio'] = df['Shell_weight'] / (df['Whole_weight']+ 1e-8)
        df['Shucked_Whole_Weight_Ratio'] = df['Shucked_weight'] / (df['Whole_weight']+ 1e-8)
        df['Viscera_Whole_Weight_Ratio'] = df['Viscera_weight'] / (df['Whole_weight']+ 1e-8)

    #3.1 Preprocessing ============================================================
    # NOTE(review): scaling and SMOTE both run on the full frame before the
    # split below, so validation/test rows influence the scaler and the
    # synthetic samples. Consider applying them to the training split only.
    if normalization_bool:
        df = normalize_data(df, ground_truth_col= ground_truth, normalization_type= normalization_type)

    if target_csv == 'fill':
        # Seeded with rand_seed so the oversampled data is reproducible,
        # like the splits below.
        df = balance_classes_with_smote(df,  ground_truth_col= ground_truth, random_state=rand_seed, plot_distribution=display_bool)

    #3.2 Splitting 60/20/20 ============================================================
    # 20% is held out as test; 25% of the remaining 80% (= 20% of the total)
    # becomes validation, leaving 60% for training.
    train_val_data, test_data = train_test_split(df, test_size=0.2, random_state=rand_seed)
    train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=rand_seed)
 


In [None]:
    #4.1 Model creation
    # Sets up a PyCaret classification experiment on the training split and
    # ranks the candidate models.
    #
    # polynomial_features is deliberately not passed: combined with
    # preprocess=False it produced no extra columns (the setup summary showed
    # identical original and transformed shapes), so the argument only
    # misrepresented what was being trained. To actually use polynomial
    # features, set preprocess=True and polynomial_features=True together.
    #
    # NOTE(review): setup() holds out part of train_data on its own, and
    # val_data / test_data are not used in this cell — confirm whether
    # val_data should be supplied as the hold-out set instead.
    exp = ClassificationExperiment()
    clf1 = exp.setup(train_data, target=ground_truth, use_gpu=False, preprocess=False, session_id=rand_seed)
    print(clf1)
    best = exp.compare_models()
    print(best)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Sex
2,Target type,Multiclass
3,Original data shape,"(2505, 9)"
4,Transformed data shape,"(2505, 9)"
5,Transformed train set shape,"(1753, 9)"
6,Transformed test set shape,"(752, 9)"
7,Numeric features,8


<pycaret.classification.oop.ClassificationExperiment object at 0x7f73d1f5f640>


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.5585,0.7466,0.5585,0.5538,0.5505,0.333,0.3361,0.004
rf,Random Forest Classifier,0.5437,0.7406,0.5437,0.539,0.5397,0.3134,0.3144,0.047
et,Extra Trees Classifier,0.5431,0.7409,0.5431,0.5372,0.5376,0.3132,0.3147,0.037
lr,Logistic Regression,0.542,0.7495,0.542,0.5306,0.5224,0.305,0.3121,0.163
ridge,Ridge Classifier,0.5408,0.0,0.5408,0.5281,0.5202,0.3069,0.3145,0.005
gbc,Gradient Boosting Classifier,0.5345,0.7363,0.5345,0.5275,0.528,0.2977,0.2995,0.099
qda,Quadratic Discriminant Analysis,0.5323,0.7333,0.5323,0.5363,0.5013,0.3058,0.3226,0.004
ada,Ada Boost Classifier,0.5288,0.7097,0.5288,0.518,0.5146,0.2859,0.2906,0.023
knn,K Neighbors Classifier,0.5106,0.6939,0.5106,0.5077,0.5061,0.2672,0.2687,0.083
dt,Decision Tree Classifier,0.5065,0.6283,0.5065,0.5056,0.5044,0.2586,0.2596,0.005


Processing:   0%|          | 0/61 [00:00<?, ?it/s]