In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score, classification_report

In [67]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Step 1: Combine Features
def combine_features(file_paths, label_column="label"):

    combined_features = None
    for file_path in file_paths:
        data = pd.read_csv(file_path)
        if combined_features is None:
            combined_features = data
        else:
            combined_features = pd.concat([combined_features, data.drop(columns=[label_column])], axis=1)
    return combined_features

# File paths for the 5 encoding methods (train and validation)
train_files = [
    "train_kmer_features_scaled1.csv",
    "train_dpcp_features_scaled.csv",
    "train_scpseudnc_features_scaled.csv",
    "train_pseknc_features_scaled.csv",
    "train_pcpsednc_features_scaled.csv"
]

validation_files = [
    "validation_kmer_features_scaled1.csv",
    "validation_dpcp_features_scaled.csv",
    "validation_scpseudnc_features_scaled.csv",
    "validation_pseknc_features_scaled.csv",
    "validation_pcpsednc_features_scaled.csv"
]

# Combine train and validation features
print("Combining train features...")
train_combined = combine_features(train_files)
train_labels = pd.read_csv(train_files[0])["label"]  
train_combined["label"] = train_labels

print("Combining validation features...")
validation_combined = combine_features(validation_files)
validation_labels = pd.read_csv(validation_files[0])["label"]  
validation_combined["label"] = validation_labels


Combining train features...
Combining validation features...


In [None]:
# Step 2: Separate features and labels

train_X = train_combined.drop(columns=["label"])
train_y = train_combined["label"]

validation_X = validation_combined.drop(columns=["label"])
validation_y = validation_combined["label"]

'''
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)

# Scale validation data using the same scaler
validation_X_scaled = scaler.transform(validation_X)
'''


'\n# Initialize and fit StandardScaler on training data\nscaler = StandardScaler()\ntrain_X_scaled = scaler.fit_transform(train_X)\n\n# Scale validation data using the same scaler\nvalidation_X_scaled = scaler.transform(validation_X)\n'

In [5]:
print(train_X.shape)

(54464, 276)


PREPROCESSING

In [6]:
# Check for duplicate columns in the dataset
duplicate_columns = train_X.columns[train_X.columns.duplicated()]

if len(duplicate_columns) > 0:
    print("Duplicate columns found:", duplicate_columns)
else:
    print("No duplicate columns in the dataset.")


Duplicate columns found: Index(['comp_AA', 'comp_AC', 'comp_AG', 'comp_AU', 'comp_CA', 'comp_CC',
       'comp_CG', 'comp_CU', 'comp_GA', 'comp_GC', 'comp_GG', 'comp_GU',
       'comp_UA', 'comp_UC', 'comp_UG', 'comp_UU', 'comp_AA', 'comp_AC',
       'comp_AG', 'comp_AU', 'comp_CA', 'comp_CC', 'comp_CG', 'comp_CU',
       'comp_GA', 'comp_GC', 'comp_GG', 'comp_GU', 'comp_UA', 'comp_UC',
       'comp_UG', 'comp_UU', 'pseudo_1', 'pseudo_2', 'pseudo_3', 'pseudo_4',
       'pseudo_5'],
      dtype='object')


In [7]:
# Drop duplicate columns from train_X and validation_X
train_X = train_X.loc[:, ~train_X.columns.duplicated()]
validation_X = validation_X.loc[:, ~validation_X.columns.duplicated()]

print("Shape of train_X after removing duplicates:", train_X.shape)
print("Shape of validation_X after removing duplicates:", validation_X.shape)


Shape of train_X after removing duplicates: (54464, 239)
Shape of validation_X after removing duplicates: (4329, 239)


In [8]:
print(train_X)

            AAA      AAC       AAG       AAU       ACA       ACC       ACG  \
0     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
1     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
2     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
3     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
4     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
...         ...      ...       ...       ...       ...       ...       ...   
54459 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54460 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54461 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54462 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54463  0.941947 -0.46837 -0.604317  1.668064 -0.782923  1.253247 -0.209141   

            ACU       AGA       AGC  ...   pc_GU.1   pc_UA.1   

In [None]:
def find_correlated_features(dataset, threshold=0.95):
    
    col_corr = set()  # Set to hold correlated columns
    corr_matrix = dataset.corr()  
    print(corr_matrix)
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  
                colname = corr_matrix.columns[i]  # Get the name of the feature
                col_corr.add(colname)  # Add the feature to the set of correlated features
    
    return col_corr


# Find correlated features in train data
corr_features = find_correlated_features(train_X, threshold=0.95)
print("Number of correlated features:", len(corr_features))
print("Correlated features:", corr_features)


                 AAA       AAC       AAG       AAU       ACA       ACC  \
AAA         1.000000  0.238580  0.398908  0.240431 -0.025965 -0.223956   
AAC         0.238580  1.000000 -0.102728 -0.015548  0.103603  0.184444   
AAG         0.398908 -0.102728  1.000000 -0.074818 -0.068809 -0.279184   
AAU         0.240431 -0.015548 -0.074818  1.000000 -0.321651 -0.164689   
ACA        -0.025965  0.103603 -0.068809 -0.321651  1.000000  0.180794   
...              ...       ...       ...       ...       ...       ...   
pseudo_1.1  0.039142  0.045537  0.020344  0.082173 -0.015177 -0.022568   
pseudo_2.1  0.067222  0.062079  0.044762  0.026339  0.026638 -0.049721   
pseudo_3.1  0.049396  0.068214  0.055777  0.023091  0.007932 -0.029338   
pseudo_4.1  0.045564  0.064644  0.018096  0.042312  0.041725 -0.004911   
pseudo_5.1  0.057829  0.063498  0.038768  0.032557  0.046314 -0.020959   

                 ACG       ACU       AGA       AGC  ...   pc_GU.1   pc_UA.1  \
AAA         0.068581  0.174813  

In [None]:
import pandas as pd

original_feature_names_train = list(train_X.columns)
original_feature_names_val = list(validation_X.columns)

# Convert train and validation data to DataFrames (using the original feature names)
X_train_df = pd.DataFrame(train_X, columns=original_feature_names_train)
X_val_df = pd.DataFrame(validation_X, columns=original_feature_names_val)

print("Initial training set shape:", X_train_df.shape)
print("Initial validation set shape:", X_val_df.shape)

corr_features = [col for col in corr_features if col in X_train_df.columns]

# Drop correlated features from both train and validation sets
X_train_df.drop(columns=corr_features, inplace=True)
X_val_df.drop(columns=corr_features, inplace=True)

# Debug: Check final state
print("Training set shape after removal:", X_train_df.shape)
print("Validation set shape after removal:", X_val_df.shape)
print("Remaining features:", X_train_df.columns)


Initial training set shape: (54464, 239)
Initial validation set shape: (4329, 239)
Training set shape after removal: (54464, 171)
Validation set shape after removal: (4329, 171)
Remaining features: Index(['AAA', 'AAC', 'AAG', 'AAU', 'ACA', 'ACC', 'ACG', 'ACU', 'AGA', 'AGC',
       ...
       'pseudo_1', 'pseudo_2', 'pseudo_3', 'pseudo_4', 'pseudo_5',
       'pseudo_1.1', 'pseudo_2.1', 'pseudo_3.1', 'pseudo_4.1', 'pseudo_5.1'],
      dtype='object', length=171)


In [11]:
print(X_train_df)

            AAA      AAC       AAG       AAU       ACA       ACC       ACG  \
0     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
1     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
2     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
3     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
4     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
...         ...      ...       ...       ...       ...       ...       ...   
54459 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54460 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54461 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54462 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54463  0.941947 -0.46837 -0.604317  1.668064 -0.782923  1.253247 -0.209141   

            ACU       AGA       AGC  ...  pseudo_1  pseudo_2  p

In [12]:
import numpy as np 

X_train = np.array(X_train_df)
X_val = np.array(X_val_df)

X_train.shape, X_val.shape

((54464, 171), (4329, 171))

In [13]:
print(X_train_df)

            AAA      AAC       AAG       AAU       ACA       ACC       ACG  \
0     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
1     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
2     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
3     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
4     -0.519133 -0.46837 -0.604317 -0.544937 -0.782923 -0.692765 -0.209141   
...         ...      ...       ...       ...       ...       ...       ...   
54459 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54460 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54461 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54462 -0.519133 -0.46837 -0.604317 -0.544937  0.962671 -0.692765  4.586468   
54463  0.941947 -0.46837 -0.604317  1.668064 -0.782923  1.253247 -0.209141   

            ACU       AGA       AGC  ...  pseudo_1  pseudo_2  p

In [14]:
print(X_train.shape)  
print(X_val.shape)  

(54464, 171)
(4329, 171)


MODEL TRAINING

In [86]:
import optuna

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # Replace NaN with mean
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
SEED = 42

# Define optimization functions for models
def optimize_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 5, 50)
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=SEED)
    rf_model.fit(X_train, train_y)
    return accuracy_score(validation_y, rf_model.predict(X_val))

def optimize_xgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    xgb_model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, random_state=SEED)
    xgb_model.fit(X_train, train_y)
    return accuracy_score(validation_y, xgb_model.predict(X_val))

def optimize_svc(trial):
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.001, 1.0)
    svc_model = SVC(C=C, gamma=gamma, kernel='rbf', random_state=SEED)
    svc_model.fit(X_train, train_y)
    return accuracy_score(validation_y, svc_model.predict(X_val))

# Run Optuna optimization for each model
print("Optimizing RandomForest...")
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(optimize_rf, n_trials=20)
rf_best_params = study_rf.best_params
print("Best RF Params:", rf_best_params)

print("Optimizing XGBoost...")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(optimize_xgb, n_trials=20)
xgb_best_params = study_xgb.best_params
print("Best XGB Params:", xgb_best_params)

print("Optimizing SVC...")
study_svc = optuna.create_study(direction="maximize")
study_svc.optimize(optimize_svc, n_trials=20)
svc_best_params = study_svc.best_params
print("Best SVC Params:", svc_best_params)

# Use the optimized parameters in the estimators
estimators = [
    ('rf', RandomForestClassifier(**rf_best_params, random_state=SEED)),
    ('svr', make_pipeline(StandardScaler(),
                           LinearSVC(random_state=SEED))),  # Keeping this unchanged
    ('xgb', XGBClassifier(**xgb_best_params, random_state=SEED)),
    ('svcl', make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3, p=1))),  # Keeping this unchanged
    ('svcr', make_pipeline(StandardScaler(), SVC(**svc_best_params, random_state=SEED)))
]

# Final stacking classifier
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=SEED))
clf.fit(X_train, train_y)

# Evaluate the model
y_pred = clf.predict(X_val)
clf_score = round(accuracy_score(y_pred, validation_y) * 100, 4)
print("Accuracy score by Stacking Model:", clf_score, "%")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-12-24 14:06:30,409] A new study created in memory with name: no-name-cd81ea6e-6036-4067-bca1-38364deb0273


Optimizing RandomForest...


[I 2024-12-24 14:06:57,183] Trial 0 finished with value: 0.5105105105105106 and parameters: {'n_estimators': 279, 'max_depth': 42}. Best is trial 0 with value: 0.5105105105105106.
[I 2024-12-24 14:07:03,762] Trial 1 finished with value: 0.5123585123585124 and parameters: {'n_estimators': 73, 'max_depth': 19}. Best is trial 1 with value: 0.5123585123585124.
[I 2024-12-24 14:07:12,441] Trial 2 finished with value: 0.507969507969508 and parameters: {'n_estimators': 91, 'max_depth': 26}. Best is trial 1 with value: 0.5123585123585124.
[I 2024-12-24 14:07:35,151] Trial 3 finished with value: 0.5077385077385077 and parameters: {'n_estimators': 241, 'max_depth': 36}. Best is trial 1 with value: 0.5123585123585124.
[I 2024-12-24 14:07:41,627] Trial 4 finished with value: 0.5063525063525064 and parameters: {'n_estimators': 176, 'max_depth': 5}. Best is trial 1 with value: 0.5123585123585124.
[I 2024-12-24 14:08:18,675] Trial 5 finished with value: 0.5105105105105106 and parameters: {'n_estimato

Best RF Params: {'n_estimators': 73, 'max_depth': 19}
Optimizing XGBoost...


[I 2024-12-24 14:13:13,068] Trial 0 finished with value: 0.5202125202125202 and parameters: {'n_estimators': 247, 'max_depth': 3, 'learning_rate': 0.16026868201104433}. Best is trial 0 with value: 0.5202125202125202.
[I 2024-12-24 14:13:14,369] Trial 1 finished with value: 0.525063525063525 and parameters: {'n_estimators': 204, 'max_depth': 9, 'learning_rate': 0.10221464534536827}. Best is trial 1 with value: 0.525063525063525.
[I 2024-12-24 14:13:16,606] Trial 2 finished with value: 0.5354585354585355 and parameters: {'n_estimators': 433, 'max_depth': 8, 'learning_rate': 0.10502408455728825}. Best is trial 2 with value: 0.5354585354585355.
[I 2024-12-24 14:13:19,574] Trial 3 finished with value: 0.5352275352275352 and parameters: {'n_estimators': 481, 'max_depth': 9, 'learning_rate': 0.22155299373523285}. Best is trial 2 with value: 0.5354585354585355.
[I 2024-12-24 14:13:20,698] Trial 4 finished with value: 0.5282975282975283 and parameters: {'n_estimators': 357, 'max_depth': 4, 'lea

Best XGB Params: {'n_estimators': 433, 'max_depth': 8, 'learning_rate': 0.10502408455728825}
Optimizing SVC...


[I 2024-12-24 14:37:43,997] Trial 0 finished with value: 0.5095865095865096 and parameters: {'C': 1.027119577586157, 'gamma': 0.07452782414405018}. Best is trial 0 with value: 0.5095865095865096.
[I 2024-12-24 14:53:38,601] Trial 1 finished with value: 0.5015015015015015 and parameters: {'C': 9.120900497313164, 'gamma': 0.16250005379520402}. Best is trial 0 with value: 0.5095865095865096.
[I 2024-12-24 15:10:30,435] Trial 2 finished with value: 0.49387849387849386 and parameters: {'C': 3.424183325974879, 'gamma': 0.26744103468589864}. Best is trial 0 with value: 0.5095865095865096.
[I 2024-12-24 15:31:21,963] Trial 3 finished with value: 0.4934164934164934 and parameters: {'C': 6.87836198261601, 'gamma': 0.34630517267027455}. Best is trial 0 with value: 0.5095865095865096.
[I 2024-12-24 15:51:06,382] Trial 4 finished with value: 0.4952644952644953 and parameters: {'C': 5.670352702040881, 'gamma': 0.1956763785201616}. Best is trial 0 with value: 0.5095865095865096.
[I 2024-12-24 16:11:2

Best SVC Params: {'C': 1.5204522737212365, 'gamma': 0.015560763543014326}
Accuracy score by Stacking Model: 60.0601 %


TESTING

In [None]:
def combine_features(file_paths, label_column="label"):

    combined_features = None
    for file_path in file_paths:
        data = pd.read_csv(file_path)
        if combined_features is None:
            combined_features = data
        else:
            combined_features = pd.concat([combined_features, data.drop(columns=[label_column])], axis=1)
    return combined_features

# File paths for the 5 encoding methods (test set)
test_files = [
    "test_kmer.csv",
    "test_dpcp.csv",
    "test_pcpsednc.csv",
    "test_scp.csv",
    "test_pseknc.csv"
]



# Combine features
print("Combining test features...")
test_combined = combine_features(test_files)
test_labels = pd.read_csv(train_files[0])["label"]  
test_combined["label"] = test_labels

Combining test features...


In [16]:
test_X = test_combined.drop(columns=["label"])
test_y = test_combined["label"]

In [91]:
print(test_y.value_counts())

label
1    1096
Name: count, dtype: int64


In [17]:
print(test_X.shape)

(1096, 329)


In [None]:
import pandas as pd

original_features = list(X_train_df.columns)
train_data = pd.DataFrame(X_train, columns=original_features)


In [35]:
print(test_X.columns)  

Index(['AAA', 'AAC', 'AAG', 'AAU', 'ACA', 'ACC', 'ACG', 'ACU', 'AGA', 'AGC',
       ...
       'comp_GU', 'comp_UA', 'comp_UC', 'comp_UG', 'comp_UU', 'pseudo_1',
       'pseudo_2', 'pseudo_3', 'pseudo_4', 'pseudo_5'],
      dtype='object', length=329)


In [66]:
# Drop duplicate columns from test set
test_X = test_X.loc[:, ~test_X.columns.duplicated()]

print("Shape of test_X after removing duplicates:", test_X.shape)

Shape of test_X after removing duplicates: (1096, 234)


In [72]:
extra_cols = set(test_X.columns) - set(train_data.columns)
existing_extra_cols = extra_cols.intersection(test_X.columns)
test_data = test_X.drop(columns=existing_extra_cols)

In [73]:
print(test_data.shape)

(1096, 170)


In [74]:
print("Columns in train but not in test:", set(train_data.columns) - set(test_data.columns))
print("Columns in test but not in train:", set(test_data.columns) - set(train_data.columns))

Columns in train but not in test: {'sc_lambda_1'}
Columns in test but not in train: set()


In [75]:
missing_cols = set(train_data.columns) - set(test_data.columns)
print("Missing columns in test data:", missing_cols)

for col in missing_cols:
    # Add missing column with a default value (e.g., 0 for numerical or 'unknown' for categorical)
    test_data[col] = train_data[col].mean()


Missing columns in test data: {'sc_lambda_1'}


In [76]:
print(test_data.shape)

(1096, 171)


In [78]:
train_cols = list(train_data.columns)
test_cols = list(test_data.columns)

print("Train columns sorted:", sorted(train_cols))
print("Test columns sorted:", sorted(test_cols))
print(len(train_cols))
print(len(test_cols))


Train columns sorted: ['AAA', 'AAA.1', 'AAC', 'AAC.1', 'AAG', 'AAG.1', 'AAU', 'AAU.1', 'ACA', 'ACA.1', 'ACC', 'ACC.1', 'ACG', 'ACG.1', 'ACU', 'ACU.1', 'AGA', 'AGA.1', 'AGC', 'AGC.1', 'AGG', 'AGG.1', 'AGU', 'AGU.1', 'AUA', 'AUA.1', 'AUC', 'AUC.1', 'AUG', 'AUG.1', 'AUU', 'AUU.1', 'CAA', 'CAA.1', 'CAC', 'CAC.1', 'CAG', 'CAG.1', 'CAU', 'CAU.1', 'CCA', 'CCA.1', 'CCC', 'CCC.1', 'CCG', 'CCG.1', 'CCU', 'CCU.1', 'CGA', 'CGA.1', 'CGC', 'CGC.1', 'CGG', 'CGG.1', 'CGU', 'CGU.1', 'CUA', 'CUA.1', 'CUC', 'CUC.1', 'CUG', 'CUG.1', 'CUU', 'CUU.1', 'GAA', 'GAA.1', 'GAC', 'GAC.1', 'GAG', 'GAG.1', 'GAU', 'GAU.1', 'GCA', 'GCA.1', 'GCC', 'GCC.1', 'GCG', 'GCG.1', 'GCU', 'GCU.1', 'GGA', 'GGA.1', 'GGC', 'GGC.1', 'GGG', 'GGG.1', 'GGU', 'GGU.1', 'GUA', 'GUA.1', 'GUC', 'GUC.1', 'GUG', 'GUG.1', 'GUU', 'GUU.1', 'UAA', 'UAA.1', 'UAC', 'UAC.1', 'UAG', 'UAG.1', 'UAU', 'UAU.1', 'UCA', 'UCA.1', 'UCC', 'UCC.1', 'UCG', 'UCG.1', 'UCU', 'UCU.1', 'UGA', 'UGA.1', 'UGC', 'UGC.1', 'UGG', 'UGG.1', 'UGU', 'UGU.1', 'UUA', 'UUA.1', '

In [79]:
test_data = test_data[train_data.columns]


In [81]:
print(test_data.shape)
print(X_train.shape)

(1096, 171)
(54464, 171)


In [None]:
def fit_scaler_and_evaluate(X_train,test_X, test_y, clf):

    try:
        # Fit the scaler on the training data
        scaler = StandardScaler()
        scaler.fit(X_train)

        # Scale the test data
        test_X_scaled = scaler.transform(test_data)
        print("Sample test labels:", test_y[:10])
        print("Predicted labels:", clf.predict(test_X_scaled)[:10])

        # Prediction
        test_predictions = clf.predict(test_X_scaled)
        
        print("Training class distribution:")
        print(train_y.value_counts())
        print("Test class distribution:")
        print(test_y.value_counts())

        # Print evaluation results
        print("Accuracy:", accuracy_score(test_y, test_predictions))
        print("Classification Report:\n", classification_report(test_y, test_predictions))
    except Exception as e:
        print(f"Error during scaling or evaluation: {e}")



fit_scaler_and_evaluate(X_train,test_data, test_y, clf)




Sample test labels: 0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: label, dtype: int64
Predicted labels: [1 1 1 1 1 1 1 1 1 1]
Training class distribution:
label
0    27469
1    26995
Name: count, dtype: int64
Test class distribution:
label
1    1096
Name: count, dtype: int64
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      1096

    accuracy                           1.00      1096
   macro avg       1.00      1.00      1.00      1096
weighted avg       1.00      1.00      1.00      1096

