In [1]:
from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
# Helper functions for the preprocessing pipeline
def print_missing_values(data):
    """Print the number of missing values for every column that has any.

    A "Total Missing:" header is always printed; columns without missing
    values are skipped. Nothing is returned.
    """
    print("Total Missing:")
    for column in data.columns:
        # Build the null mask once and reuse it for both the check and the count.
        null_mask = data[column].isnull()
        if not null_mask.any():
            continue
        print(f"   {column}: {null_mask.sum()}")
            
def balance_class(data, clusters_to_undersample, clusters_to_add, random_state=None):
    """Build a dataset in which the selected clusters have equal class counts.

    For every cluster in ``clusters_to_undersample`` the rows of the larger
    class (``Class`` is expected to be ``1`` or ``-1``) are randomly
    down-sampled, without replacement, to the size of the smaller class.
    Clusters listed in ``clusters_to_add`` are appended unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Info_cluster'`` and ``'Class'``.
    clusters_to_undersample : iterable
        ``Info_cluster`` values whose two classes should be balanced.
    clusters_to_add : iterable
        ``Info_cluster`` values that are copied over as they are.
    random_state : int, optional
        Seed for the down-sampling. ``None`` (the default) draws from NumPy's
        global random state, so ``np.random.seed`` keeps working as before.

    Returns
    -------
    pandas.DataFrame
        The selected rows (undersampled clusters first, then the added ones)
        with a fresh ``0..n-1`` index. An empty DataFrame when both cluster
        lists are empty.

    Raises
    ------
    KeyError
        If a cluster to undersample has no rows of class ``1`` or no rows of
        class ``-1``; such a cluster cannot be balanced.
    """
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    # Collect the pieces and concatenate once instead of growing a frame in a loop.
    parts = []

    for cluster in clusters_to_undersample:
        cluster_data = data[data['Info_cluster'] == cluster]
        positive_idx = cluster_data[cluster_data['Class'] == 1].index.to_numpy()
        negative_idx = cluster_data[cluster_data['Class'] == -1].index.to_numpy()

        if len(positive_idx) == 0 or len(negative_idx) == 0:
            raise KeyError(
                f"cluster {cluster!r} needs rows of both classes (1 and -1) to be balanced"
            )

        # Keep the smaller class whole and sample the larger one down to its size.
        # On a tie class 1 is the sampled one, which matches the previous behaviour.
        if len(positive_idx) >= len(negative_idx):
            kept_idx, pool_idx = negative_idx, positive_idx
        else:
            kept_idx, pool_idx = positive_idx, negative_idx

        sampled_idx = sampler.choice(pool_idx, len(kept_idx), replace=False)
        parts.append(cluster_data.loc[np.concatenate([kept_idx, sampled_idx])])

    for cluster in clusters_to_add:
        parts.append(data[data['Info_cluster'] == cluster])

    if not parts:
        return pd.DataFrame()

    return pd.concat(parts).reset_index(drop=True)

def data_splitting(X, Y, groups):
    """Split features and labels into train / validation / test sets by group.

    A 3-fold ``GroupKFold`` is run over the full data and its last fold is
    used as the test set. The remaining rows are split again with a 2-fold
    ``GroupKFold`` whose last fold becomes the validation set. The inner split
    reuses the matching entries of ``groups``, so ``X`` does not need to
    contain the grouping column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    Y : pandas.Series or pandas.DataFrame
        Labels, aligned positionally with ``X``.
    groups : array-like
        Group label of every row of ``X`` (same length and order).

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``. Every piece
        keeps the index labels it had in ``X`` / ``Y``.
    """
    # Plain list of group labels so it can be sliced by position below.
    group_values = list(groups)

    # Outer split: the last of three folds is held out as the test set.
    # GroupKFold yields positional indices, hence .iloc rather than .loc.
    outer_folds = list(GroupKFold(n_splits=3).split(X, Y, group_values))
    train_idx, test_idx = outer_folds[-1]
    X_train, y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], Y.iloc[test_idx]

    # Inner split: positions now refer to the outer-train subset, so the
    # validation rows must be taken from X_train / y_train, not from X / Y.
    train_groups = [group_values[position] for position in train_idx]
    inner_folds = list(GroupKFold(n_splits=2).split(X_train, y_train, train_groups))
    inner_train_idx, val_idx = inner_folds[-1]
    X2_train, y2_train = X_train.iloc[inner_train_idx], y_train.iloc[inner_train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    return X2_train, X_val, X_test, y2_train, y_val, y_test
    
def plot_class_imbalance(split, i=None):
    """Draw a bar chart of the positive (1) and negative (-1) class counts.

    Parameters
    ----------
    split : pandas.DataFrame
        Data containing a ``'Class'`` column.
    i : optional
        Label of the split shown in the title (``"Split {i}: ..."``). When
        omitted, a module-level variable named ``i`` is used if one exists,
        which is what the function relied on before; otherwise the title
        carries only the counts.
    """
    class_counts = split['Class'].value_counts()
    # .get(..., 0) so a split that lacks one class plots a zero bar instead of raising.
    n_pos = int(class_counts.get(1, 0))
    n_neg = int(class_counts.get(-1, 0))

    if i is None:
        # Backward compatibility with callers that set a global loop variable ``i``.
        i = globals().get('i')

    summary = f'Positive = {n_pos}, Negative = {n_neg}'
    title = summary if i is None else f'Split {i}: {summary}'

    plt.bar(['Positive', 'Negative'], [n_pos, n_neg])
    plt.title(title)
    plt.show()
    
def data_skewness(features):
    """Print the columns whose skewness falls outside the +/-1.5 band.

    Skewness is computed per column with ``features.skew()``. The columns
    above 1.5 are printed first, then the columns below -1.5, each list
    preceded by a heading. Nothing is returned.
    """
    skew_by_column = features.skew()

    reports = (
        ("Skewed columns with skewness greater than 1.5:", skew_by_column > 1.5),
        ("Skewed columns with skewness less than -1.5:", skew_by_column < -1.5),
    )
    for heading, mask in reports:
        print(heading)
        print(skew_by_column[mask].index)
    
def scale_features(data):
    """Scale every column of ``data`` with a freshly fitted ``MinMaxScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns; all of them are scaled.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the same column names and the same index as
        ``data``, so the result stays aligned with labels that share that
        index. ``data`` itself is not modified.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in, so calling this
    separately on train and test data scales each with its own parameters.
    """
    scaler = MinMaxScaler()

    # Fit and transform in one step; columns and index are carried over so the
    # output can be joined back to other frames derived from the same rows.
    scaled_values = scaler.fit_transform(data)

    return pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

def reduce_dimensionality(data, n):
    """Transform ``data`` with a ``PCA`` fitted for ``n`` whitened components.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows; its index is reused for the result.
    n
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    pandas.DataFrame
        The output of ``fit_transform``, indexed like ``data`` and with
        default integer column labels.
    """
    projector = PCA(whiten=True, n_components=n)
    components = projector.fit_transform(data)
    return pd.DataFrame(components, index=data.index)

In [2]:
# Ordered (name, function) pairs for the remaining preprocessing steps.
# NOTE(review): the entries are plain functions with differing signatures;
# confirm how this list is consumed before handing it to sklearn's Pipeline.
preprocessing_steps = [
    ('missing_values', print_missing_values),
    ('balance_class', balance_class),
    ('data_splitting', data_splitting),
    ('plot_class_imbalance', plot_class_imbalance),
    ('data_skewness', data_skewness),
    ('scaling', scale_features),
    ('reduce_dimensionality', reduce_dimensionality),
]
