In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_1samp, norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import ks_2samp, kstest, norm, skew, boxcox
import numpy as np
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from scipy.stats import mstats
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the semicolon-separated feature table
df = pd.read_csv("dataset.csv", sep=";")

In [3]:
# Preview the first rows to sanity-check that the columns were parsed as expected
df.head()

Unnamed: 0,meanR,meanG,meanB,stdR,stdG,stdB,skewR,skewG,skewB,kurR,kurG,kurB,entR,entG,entB,Class
0,17.395776,18.057278,4.548844,40.818315,42.7474,15.235375,2.080558,2.117612,4.194824,2.786645,2.922868,18.932746,11.312396,11.302187,10.86453,1
1,17.04893,17.143965,4.285857,41.389466,42.116279,14.243516,2.185737,2.233318,4.19578,3.243245,3.436646,19.700106,11.256998,11.24231,10.867538,1
2,19.957301,16.498584,5.082156,48.724795,40.816921,15.203335,2.169338,2.245723,3.863509,3.069086,3.474671,17.313374,11.242156,11.228013,11.041794,1
3,19.713066,18.029258,4.707696,47.484912,43.629222,14.230307,2.135099,2.147982,3.987222,2.92059,2.926331,18.956953,11.266421,11.260806,11.034323,1
4,17.383914,14.581238,4.632718,46.006665,38.900382,14.446719,2.417031,2.458251,3.903999,4.259136,4.460186,17.438845,11.103429,11.094264,10.944285,1


In [4]:
# (number of rows, number of columns) of the loaded frame
df.shape

(5656, 16)

In [5]:
# Per-column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5656 entries, 0 to 5655
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   meanR   5656 non-null   float64
 1   meanG   5656 non-null   float64
 2   meanB   5656 non-null   float64
 3   stdR    5656 non-null   float64
 4   stdG    5656 non-null   float64
 5   stdB    5656 non-null   float64
 6   skewR   5656 non-null   float64
 7   skewG   5656 non-null   float64
 8   skewB   5656 non-null   float64
 9   kurR    5656 non-null   float64
 10  kurG    5656 non-null   float64
 11  kurB    5656 non-null   float64
 12  entR    5656 non-null   float64
 13  entG    5656 non-null   float64
 14  entB    5656 non-null   float64
 15  Class   5656 non-null   int64  
dtypes: float64(15), int64(1)
memory usage: 707.1 KB


In [6]:
# Distinct labels present in the target column
df['Class'].unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [7]:
# Separate the target column from the predictors
y = df["Class"]
X = df.drop(columns=["Class"])

# Fit the encoder on the raw labels, then map them to consecutive integers
# starting at 0
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

In [8]:
def apply_boxcox_transformation(data):
    """Box-Cox transform every strictly positive, non-constant column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each eligible column has been replaced by
        its Box-Cox transform. Columns that contain a value <= 0, that are
        empty, or that hold a single distinct value are returned unchanged.

    Notes
    -----
    ``boxcox`` is called without ``lmbda``, so the power parameter is
    re-estimated from whatever rows are passed in. Two calls on different
    frames (e.g. a training and a test split) therefore apply different
    transformations to the same column.
    """
    transformed_data = data.copy()
    for col in transformed_data.columns:
        values = transformed_data[col]
        if not (values > 0).all():
            continue
        # scipy's boxcox raises on constant input and returns a bare array
        # (not a ``(values, lambda)`` pair) on empty input, so neither case
        # can go through the tuple-unpacking below.
        if values.nunique() < 2:
            continue
        # The tiny shift is kept so results match the previous implementation.
        transformed_data[col], _ = boxcox(values + 1e-8)
    return transformed_data

In [9]:
def apply_log_transformation(data):
    """Return a copy of ``data`` with ``log1p`` applied to all-positive columns.

    A column is transformed only when every one of its values is strictly
    greater than zero; all other columns are carried over unchanged. The
    input frame itself is left untouched.
    """
    result = data.copy()
    strictly_positive = [
        name for name in result.columns if result[name].gt(0).all()
    ]
    for name in strictly_positive:
        result[name] = np.log1p(result[name])
    return result

In [10]:
def apply_sqrt_transformation(data):
    """Return a copy of ``data`` with square roots taken of non-negative columns.

    Columns containing at least one value that is not >= 0 are passed through
    unchanged. The input frame is not modified.
    """
    result = data.copy()
    for name in result.columns:
        column = result[name]
        if not column.ge(0).all():
            continue
        result[name] = np.sqrt(column)
    return result

In [11]:
def apply_yeo_johnson_transformation(data):
    """Winsorize, log-compress and Yeo-Johnson transform the numeric columns.

    Steps, applied to the ``float64``/``int64`` columns of a copy of ``data``:

    1. Winsorize each column with limits of 0.1% on both tails.
    2. Apply ``log1p`` to every column on which it is defined, i.e. columns
       without any value <= -1. Columns that do contain such values skip this
       step instead of being turned into NaN / -inf.
    3. Fit a Yeo-Johnson ``PowerTransformer`` on the result and transform it.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``data``. If ``data`` has no ``float64``/``int64``
        columns the copy is returned unchanged.

    Notes
    -----
    The winsorization limits and the power transform are estimated from the
    rows passed in, so separate calls on different frames are not guaranteed
    to apply the same mapping.
    """
    transformed_data = data.copy()
    numeric_cols = transformed_data.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) == 0:
        # Nothing to transform; PowerTransformer cannot be fitted on zero features.
        return transformed_data

    # Winsorize outliers
    for col in numeric_cols:
        transformed_data[col] = mstats.winsorize(transformed_data[col], limits=[0.001, 0.001])

    # Apply log transformation only where log1p is defined (every value > -1).
    # ``<= -1`` is False for NaN, so columns with missing values are still
    # handled exactly as before.
    log_cols = [
        col for col in numeric_cols
        if not (transformed_data[col] <= -1).any()
    ]
    if log_cols:
        transformed_data[log_cols] = transformed_data[log_cols].apply(np.log1p)

    # Apply Yeo-Johnson Power Transform
    pt = PowerTransformer(method='yeo-johnson')
    transformed_data[numeric_cols] = pt.fit_transform(transformed_data[numeric_cols])

    return transformed_data

In [12]:
def apply_combined_transformations(data):
    """Chain the four transformations: Box-Cox -> Log -> Sqrt -> Yeo-Johnson.

    Each stage receives the output of the previous one. The input frame is
    copied first, so the caller's data is not modified.
    """
    stages = (
        apply_boxcox_transformation,
        apply_log_transformation,
        apply_sqrt_transformation,
        apply_yeo_johnson_transformation,
    )
    current = data.copy()
    for stage in stages:
        current = stage(current)
    return current

In [13]:

# Scale the data
# NOTE(review): the scaler is only instantiated here; no fit/transform call on
# it appears in the visible cells -- confirm whether it is still needed.
scaler = StandardScaler()

In [14]:
# Step 1: run the combined transformation chain on each split.
# NOTE(review): the two splits are transformed by independent calls, so the
# data-dependent steps inside the chain (Box-Cox lambda, winsorization limits,
# Yeo-Johnson fit) are estimated separately on the training rows and on the
# test rows. Consider fitting on X_train only and reusing the fitted
# parameters for X_test.
X_train_combination = X_train.pipe(apply_combined_transformations)
X_test_combination = X_test.pipe(apply_combined_transformations)

In [15]:
# Search spaces for the two base learners
logistic_params = dict(
    penalty=["l1", "l2"],
    C=[0.1, 1, 10],
    solver=["liblinear", "saga"],
    max_iter=[1000, 2000],
)

xgboost_params = dict(
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 200],
    max_depth=[5, 7],
)

# Stacking grid: the base-learner grids re-keyed with the
# "<estimator name>__" prefix, plus fixed settings for the final estimator
stacking_params = {
    **{f"logistic__{key}": values for key, values in logistic_params.items()},
    **{f"xgboost__{key}": values for key, values in xgboost_params.items()},
    "final_estimator__C": [0.1],
    "final_estimator__max_iter": [1000],
}

# Stand-alone base learners, each tuned on its own grid
logistic_model = LogisticRegression(random_state=42)
xgboost_model = xgb.XGBClassifier(eval_metric="logloss", random_state=42)

# Stacked ensemble: fresh copies of the same two learners feed a logistic
# regression meta-learner. The estimator names ("logistic", "xgboost") are the
# prefixes used by the stacking parameter grid.
stacking_model = StackingClassifier(
    final_estimator=LogisticRegression(random_state=42),
    estimators=[
        ("logistic", LogisticRegression(random_state=42)),
        ("xgboost", xgb.XGBClassifier(eval_metric="logloss", random_state=42)),
    ],
)

# Collected metrics: one dict per (model, run), appended by
# evaluate_model_multiple_runs
results = []


def _reseeded_clone(estimator, offset):
    """Return an unfitted copy of ``estimator`` with integer seeds shifted by ``offset``."""
    import numbers
    from sklearn.base import clone

    duplicate = clone(estimator)
    shifted = {
        key: value + offset
        for key, value in duplicate.get_params(deep=True).items()
        if (key == "random_state" or key.endswith("__random_state"))
        and isinstance(value, numbers.Integral)
        and not isinstance(value, bool)
    }
    if shifted:
        duplicate.set_params(**shifted)
    return duplicate


def evaluate_model_multiple_runs(
    name,
    model,
    X_train,
    y_train,
    X_test,
    y_test,
    param_grid,
    n_runs=1,
    cv=2,
    n_jobs=None,
):
    """Tune ``model`` by grid search and record test-set metrics for each run.

    Parameters
    ----------
    name : str
        Label stored in the ``"method"`` field of every metrics record.
    model : estimator
        Classifier to tune.
    X_train, y_train
        Data used for the cross-validated search and for fitting.
    X_test, y_test
        Hold-out data used only for the reported metrics.
    param_grid : dict
        Grid passed to ``GridSearchCV``.
    n_runs : int, default 1
        Number of evaluation runs. Run 1 uses the estimator that the grid
        search already refit on the full training data. Each further run fits
        a fresh clone of it whose integer ``random_state`` parameters are
        shifted by ``run - 1``, so repeated runs measure seed variability
        instead of repeating the same fit.
    cv : int, default 2
        Number of cross-validation folds for the search.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``; ``-1`` uses all cores.

    Returns
    -------
    list of dict
        The metrics records produced by this call. Each record is also
        appended to the module-level ``results`` list.
    """
    grid_search = GridSearchCV(
        model,
        param_grid,
        # Select and refit on accuracy explicitly. Without ``scoring`` a
        # string ``refit`` is only treated as a truthy flag.
        scoring="accuracy",
        refit=True,
        cv=cv,
        n_jobs=n_jobs,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

    run_records = []
    for run in range(1, n_runs + 1):
        if run == 1:
            # Already fitted on the full training set by ``refit=True``.
            fitted_model = grid_search.best_estimator_
        else:
            fitted_model = _reseeded_clone(grid_search.best_estimator_, run - 1)
            fitted_model.fit(X_train, y_train)
        y_pred = fitted_model.predict(X_test)

        # Get class-wise metrics; zero_division=0 yields the same values as
        # the default but without emitting a warning for unpredicted classes.
        class_report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

        metrics = {
            "run_number": run,
            "method": name,
            "parameters": best_params,
            "accuracy": class_report["accuracy"],
        }

        # Add per-class (and averaged) metrics as separate columns
        for label, label_metrics in class_report.items():
            if isinstance(label_metrics, dict):  # Ignore global rows like "accuracy"
                metrics[f"precision_class_{label}"] = label_metrics["precision"]
                metrics[f"recall_class_{label}"] = label_metrics["recall"]
                metrics[f"f1_class_{label}"] = label_metrics["f1-score"]
                metrics[f"support_class_{label}"] = label_metrics["support"]

        results.append(metrics)
        run_records.append(metrics)

    return run_records

# Run the three searches in sequence. The summary table and the CSV are built
# in a ``finally`` block so that the metrics of searches that already finished
# are kept even if a later (slow) search fails or is interrupted.
try:
    # Evaluate Logistic Regression
    evaluate_model_multiple_runs("Logistic Regression", logistic_model, X_train_combination, y_train, X_test_combination, y_test, logistic_params)

    # Evaluate XGBoost
    evaluate_model_multiple_runs("XGBoost", xgboost_model, X_train_combination, y_train, X_test_combination, y_test, xgboost_params)

    # Evaluate Stacking Classifier
    evaluate_model_multiple_runs("Stacking", stacking_model, X_train_combination, y_train, X_test_combination, y_test, stacking_params)
finally:
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save whatever has been collected; skip the write when nothing finished
    # so an existing results file is not replaced by an empty one.
    if results:
        results_df.to_csv("grid_search_results_with_runs.csv", index=False)

results_df

Fitting 2 folds for each of 24 candidates, totalling 48 fits




Fitting 2 folds for each of 8 candidates, totalling 16 fits
Fitting 2 folds for each of 192 candidates, totalling 384 fits




KeyboardInterrupt: 