In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. Load Dataset

In [None]:
import pandas as pd

# Label column referenced by the later cells.
target_col = "Cover_Type"

# Read the CSV via a relative path, then print dtypes together with the deep
# (object-aware) memory footprint of the loaded frame.
df = pd.read_csv(filepath_or_buffer='covtype.csv', low_memory=True)
df.info(memory_usage='deep')


In [None]:
# Show, in order: a headline, the first rows, a second headline, and the
# per-column null counts.
for _item in (
    "Initial Data Overview:",
    df.head(),
    "\nMissing Values per Column:",
    df.isnull().sum(),
):
    display(_item)


In [None]:
# Candidate columns to discard; only the ones actually present get dropped.
# NOTE(review): neither name looks like a column of covtype.csv — presumably
# left over from a template; confirm whether this cell is still needed.
columns_to_remove = ["StudentID", "Name"]
cols_exist = [name for name in columns_to_remove if name in df.columns]

if not cols_exist:
    print("[INFO] No columns to drop found in dataframe.")
else:
    print(f"[INFO] Dropping columns: {cols_exist}")
    df = df.drop(columns=cols_exist)


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def encode_dataframe(df: pd.DataFrame, target_col: str):
    """Return an encoded copy of ``df``; the input frame is not modified.

    The target column is label-encoded when its dtype is ``object`` or
    ``category``. Every other ``object``/``category`` column is one-hot
    encoded through ``pd.get_dummies(..., drop_first=True)``.

    Args:
        df: Frame to encode.
        target_col: Name of the label column.

    Returns:
        A new DataFrame with the encodings applied.
    """
    encoded = df.copy()

    target_dtype = encoded[target_col].dtype
    if target_dtype == 'object' or target_dtype.name == 'category':
        label_encoder = LabelEncoder()
        encoded[target_col] = label_encoder.fit_transform(encoded[target_col])
        classes = list(label_encoder.classes_)
        print(f"[INFO] Target column '{target_col}' encoded: {classes} -> {list(range(len(classes)))}")

    # Feature columns still holding labels; the target is never one-hot encoded.
    categorical_cols = [
        name
        for name in encoded.select_dtypes(include=['object', 'category']).columns
        if name != target_col
    ]

    if not categorical_cols:
        print("[INFO] No categorical columns to encode.")
        return encoded

    print(f"[INFO] Encoding categorical columns: {categorical_cols}")
    return pd.get_dummies(encoded, columns=categorical_cols, drop_first=True)
# Re-bind the working frame to the encoded copy returned by encode_dataframe.
df = encode_dataframe(df, target_col)


# 2. Visualization Before Filling Nulls

In [None]:
import matplotlib.pyplot as plt

# Null count per column, restricted to the columns that actually have gaps.
missing_counts = df.isnull().sum().loc[lambda counts: counts > 0]

if missing_counts.empty:
    print("[INFO] No missing values found.")
else:
    plt.figure(figsize=(10, 6))
    missing_counts.plot(kind='bar', color='orange')
    plt.title("Missing Values per Column")
    plt.ylabel("Count")
    plt.show()


# 3. Handle Missing Values


In [None]:
import matplotlib.pyplot as plt

cat_cols = df.select_dtypes(include=['object', 'category']).columns
num_cols = df.select_dtypes(include=['number']).columns

# Categorical gaps -> most frequent value.
# The filled column is assigned back rather than calling
# df[col].fillna(..., inplace=True): that chained in-place form operates on a
# temporary under pandas Copy-on-Write and would leave df unchanged.
for col in cat_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        if modes.empty:
            # Entirely null column: there is no most-frequent value to use.
            print(f"Column {col} has no non-null values; left unfilled")
            continue
        mode_val = modes.iloc[0]
        df[col] = df[col].fillna(mode_val)
        print(f"Column {col} filled with Mode: {mode_val}")

# Numeric gaps -> column mean.
for col in num_cols:
    if df[col].isnull().any():
        mean_val = df[col].mean()
        df[col] = df[col].fillna(mean_val)
        print(f"Column {col} filled with Mean: {mean_val:.2f}")

# Re-check: anything still missing is charted, otherwise report success.
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0]

if not missing_counts.empty:
    plt.figure(figsize=(10, 6))
    missing_counts.plot(kind='bar', color='green')
    plt.title("Missing Values After Filling")
    plt.ylabel("Count")
    plt.show()
else:
    print("[INFO] No missing values remaining.")


# 4. Handle Duplicates


In [None]:
# Count rows that repeat an earlier row (the first occurrence is not counted).
num_duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {num_duplicates}")

if num_duplicates == 0:
    print("[INFO] No duplicate rows found.")
else:
    print("Examples of duplicate rows:")
    display(df.loc[df.duplicated()].head())

    # Keeps the first occurrence of each repeated row, modifying df in place.
    df.drop_duplicates(inplace=True)
    print("Duplicates removed")


# 1. Basic Distributions 


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

sns.set(style="whitegrid")

def plot_distributions_auto(df, sample_size=5000, max_unique_for_countplot=15, top_n_bar=10):
    """Draw one distribution plot per column on a 3-column grid.

    The plot type is picked from each column's dtype and number of distinct
    values, both measured on the (possibly sampled) frame:

    * ``int64``/``float64``: histogram + KDE for more than 15 distinct values,
      boxplot for 4..``max_unique_for_countplot``, violin otherwise.
    * any other dtype: pie for exactly 2 distinct values, countplot for up to
      ``max_unique_for_countplot``, otherwise a bar chart of the
      ``top_n_bar`` most frequent values.

    Args:
        df: Frame whose columns are plotted.
        sample_size: Rows to sample (seed 42) when ``df`` is larger than this.
        max_unique_for_countplot: Upper distinct-value bound for boxplots and
            countplots.
        top_n_bar: Number of categories shown in the high-cardinality bar chart.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    df_sample = df.sample(sample_size, random_state=42) if len(df) > sample_size else df
    cols = df_sample.columns.tolist()

    n_cols = 3
    n_rows = math.ceil(len(cols) / n_cols)

    # Both the sizing pass and the drawing pass need these, so compute each
    # column's distinct count and numeric flag only once.
    n_unique_by_col = {col: df_sample[col].nunique() for col in cols}
    numeric_by_col = {col: df_sample[col].dtype in ["int64", "float64"] for col in cols}

    def _panel_height(col):
        """Preferred height (inches) for the plot type this column will get."""
        n_unique = n_unique_by_col[col]
        if numeric_by_col[col]:
            if n_unique > 15:
                return 4
            return 3 if 3 < n_unique <= max_unique_for_countplot else 3.5
        if n_unique == 2:
            return 4
        return 4.5 if n_unique <= max_unique_for_countplot else 5

    # Each grid row is as tall as its tallest panel, with a floor of 4.
    row_heights = [
        max([4] + [_panel_height(col) for col in cols[start:start + n_cols]])
        for start in range(0, n_rows * n_cols, n_cols)
    ]

    total_height = sum(row_heights) + 1
    plt.figure(figsize=(n_cols*6, total_height))

    # Plot each column
    for idx, col in enumerate(cols):
        plt.subplot(n_rows, n_cols, idx + 1)
        n_unique = n_unique_by_col[col]

        # Automatic plot selection
        if numeric_by_col[col]:
            if n_unique > 15:
                sns.histplot(df_sample[col], kde=True, bins=30)
                plt.title(f"{col} Histogram & KDE")
            elif 3 < n_unique <= max_unique_for_countplot:
                sns.boxplot(x=df_sample[col])
                plt.title(f"{col} Boxplot")
            else:
                sns.violinplot(y=df_sample[col])
                plt.title(f"{col} Violin")
        else:
            if n_unique == 2:
                df_sample[col].value_counts().plot.pie(autopct="%1.1f%%", startangle=90)
                plt.title(f"{col} Pie")
                plt.ylabel("")
            elif n_unique <= max_unique_for_countplot:
                sns.countplot(x=df_sample[col], order=df_sample[col].value_counts().index)
                plt.title(f"{col} Countplot")
                plt.xticks(rotation=45)
            else:
                top_vals = df_sample[col].value_counts().nlargest(top_n_bar)
                sns.barplot(x=top_vals.index, y=top_vals.values)
                plt.title(f"{col} Top {top_n_bar}")
                plt.xticks(rotation=45)

    # Lay out once, after every panel exists; re-running the layout inside the
    # loop repeated the same work for each column.
    plt.tight_layout()
    plt.show()

# Usage
plot_distributions_auto(df)


# 2. Compare features with Target


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

def compare_with_target(df, target_col):
    """Plot every non-target column against ``target_col`` on a 3-column grid.

    The chart depends on the dtype pair (numeric = ``int64``/``float64``,
    categorical = ``object``):

    * numeric feature, categorical target: boxplot (target has <= 10 distinct
      values) or violin plot.
    * categorical feature, categorical target: countplot with the target as
      hue, limited to the feature's 10 most frequent values when either side
      has more than 10 distinct values.
    * numeric feature, numeric target: scatter plot.
    * categorical feature, numeric target: boxplot.

    Columns matching none of these combinations leave their panel empty.
    """
    numeric_kinds = ["int64", "float64"]
    features = [name for name in df.columns if name != target_col]
    grid_cols = 3
    grid_rows = math.ceil(len(features) / grid_cols)

    target = df[target_col]
    target_is_text = target.dtype == "object"
    target_is_numeric = target.dtype in numeric_kinds

    plt.figure(figsize=(18, grid_rows * 4))

    for position, col in enumerate(features, start=1):
        plt.subplot(grid_rows, grid_cols, position)
        feature = df[col]
        feature_is_numeric = feature.dtype in numeric_kinds
        feature_is_text = feature.dtype == "object"

        if target_is_text:
            if feature_is_numeric:
                # Numeric feature vs Categorical target
                draw = sns.boxplot if target.nunique() <= 10 else sns.violinplot
                draw(x=target, y=feature)
                plt.title(f"{col} vs {target_col}")
            elif feature_is_text:
                # Categorical feature vs Categorical target
                if feature.nunique() <= 10 and target.nunique() <= 10:
                    sns.countplot(x=feature, hue=target)
                    plt.title(f"{col} vs {target_col}")
                else:
                    top_vals = feature.value_counts().nlargest(10).index
                    sns.countplot(x=feature[feature.isin(top_vals)], hue=target)
                    plt.title(f"Top 10 {col} vs {target_col}")
                    plt.xticks(rotation=45)
        elif target_is_numeric:
            if feature_is_numeric:
                # Numeric feature vs Numeric target
                sns.scatterplot(x=feature, y=target)
                plt.title(f"{col} vs {target_col}")
            elif feature_is_text:
                # Categorical feature vs Numeric target
                sns.boxplot(x=feature, y=target)
                plt.title(f"{col} vs {target_col}")
                plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Usage
compare_with_target(df, target_col)


# 3. Compare features with each other (no target)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from itertools import combinations
import numpy as np

sns.set(style="whitegrid")

def compare_features_auto(df, sample_size=5000, max_pairplots=500, max_categories=10, n_cols=3):
    """Plot feature-vs-feature relationships on a seeded sample of ``df``.

    With two or more ``int64``/``float64`` columns: an annotated correlation
    heatmap, then scatter plots for the first ``max_pairplots`` column pairs.
    With two or more ``object``/``category`` columns: a crosstab heatmap for
    every pair, each side limited to its ``max_categories`` most frequent
    values. Pair plots are drawn in figures of at most ``n_cols * 3`` panels.
    """
    numeric_cols = df.select_dtypes(include=["int64","float64"]).columns
    categorical_cols = df.select_dtypes(include=["object","category"]).columns

    df_sample = df.sample(min(sample_size, len(df)), random_state=42)
    per_figure = n_cols * 3

    def _open_grid(n_panels):
        """Create a figure with enough rows for ``n_panels`` and return its axes flat."""
        rows = int(np.ceil(n_panels/n_cols))
        _, grid = plt.subplots(rows, n_cols, figsize=(n_cols*5, rows*4))
        return np.array(grid).flatten()

    def _finish_grid(panels, n_used):
        """Hide the panels beyond ``n_used`` and show the current figure."""
        for spare in panels[n_used:]:
            spare.set_visible(False)
        plt.tight_layout()
        plt.show()

    if len(numeric_cols) >= 2:
        # 1. Correlation Heatmap
        plt.figure(figsize=(12,10))
        sns.heatmap(df_sample[numeric_cols].corr(), annot=True, cmap="coolwarm", fmt=".2f",
                    annot_kws={"size":8})
        plt.title("Correlation Heatmap (Numeric Features)", fontsize=16)
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

        # Pairwise scatter plots, one figure per batch of pairs.
        pairs = list(combinations(numeric_cols, 2))[:max_pairplots]
        for start in range(0, len(pairs), per_figure):
            batch = pairs[start:start+per_figure]
            panels = _open_grid(len(batch))
            for panel, (col1, col2) in zip(panels, batch):
                sns.scatterplot(x=df_sample[col1], y=df_sample[col2], ax=panel)
                panel.set_title(f"{col1} vs {col2}", fontsize=10)
            _finish_grid(panels, len(batch))

    if len(categorical_cols) >= 2:
        cat_pairs = list(combinations(categorical_cols, 2))
        for start in range(0, len(cat_pairs), per_figure):
            batch = cat_pairs[start:start+per_figure]
            panels = _open_grid(len(batch))
            for panel, (col1, col2) in zip(panels, batch):
                top1 = df_sample[col1].value_counts().nlargest(max_categories).index
                top2 = df_sample[col2].value_counts().nlargest(max_categories).index
                # Values outside the top lists become NaN and drop out of the crosstab.
                crosstab = pd.crosstab(df_sample[col1].where(df_sample[col1].isin(top1)),
                                       df_sample[col2].where(df_sample[col2].isin(top2)))
                if crosstab.empty:
                    panel.set_visible(False)
                    continue
                sns.heatmap(crosstab, annot=False, cmap="Blues", ax=panel)
                panel.set_title(f"{col1} vs {col2}", fontsize=10)
                panel.tick_params(axis='x', rotation=45)
                panel.tick_params(axis='y', rotation=0)
            _finish_grid(panels, len(batch))

# Usage
compare_features_auto(df)


# 4. Features vs Target + Others


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np

sns.set(style="whitegrid")

def scatter_features_with_target_dynamic(df, target_col, sample_size=5000, n_cols=3):
    """Plot each ``int64``/``float64`` feature against ``target_col``.

    A numeric (``int64``/``float64``) target gets a scatter plot with a
    regression line; an ``object`` target gets a boxplot overlaid with a strip
    plot. Any other target dtype leaves the panels empty. Frames longer than
    ``sample_size`` are sampled with seed 42 first.
    """
    numeric_cols = [
        name
        for name in df.select_dtypes(include=["int64","float64"]).columns
        if name != target_col
    ]
    if len(numeric_cols) == 0:
        print("No numeric features found.")
        return

    df_sample = df if len(df) <= sample_size else df.sample(sample_size, random_state=42)
    target = df_sample[target_col]
    target_is_numeric = target.dtype in ["int64","float64"]
    target_is_text = target.dtype == "object"

    n_rows = math.ceil(len(numeric_cols) / n_cols)
    _, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
    axes = np.array(axes).flatten()

    for ax, col in zip(axes, numeric_cols):
        feature = df_sample[col]
        if target_is_numeric:
            sns.regplot(x=feature, y=target, scatter_kws={"alpha":0.5}, ax=ax)
            ax.set_title(f"{col} vs {target_col} Scatter + Reg")
        elif target_is_text:
            sns.boxplot(x=target, y=feature, ax=ax)
            sns.stripplot(x=target, y=feature, color="black", alpha=0.3, ax=ax)
            ax.set_title(f"{col} vs {target_col} Box + Strip")

    # Hide empty subplots
    for spare in axes[len(numeric_cols):]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()

# Usage
scatter_features_with_target_dynamic(df, target_col)


# 5. Compare feature with itself


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats
import numpy as np

sns.set(style="whitegrid")

def self_comparison_dynamic(df, sample_size=5000, n_cols=3):
    """Show five univariate views of every ``int64``/``float64`` column.

    For each column one figure is drawn with a histogram + KDE, a boxplot, a
    violin plot, an ECDF and a normal Q-Q plot.

    Args:
        df: Frame to inspect.
        sample_size: Rows to sample (seed 42) when ``df`` is larger than this.
        n_cols: Maximum number of panels per grid row.

    Returns:
        None. One figure per numeric column is shown and then closed.
    """
    numeric_cols = df.select_dtypes(include=["int64","float64"]).columns
    if numeric_cols.empty:
        print("No numeric features found.")
        return

    # Sampling
    df_sample = df.sample(sample_size, random_state=42) if len(df) > sample_size else df

    plot_types = ["hist_kde", "box", "violin", "ecdf", "qq"]

    # The grid shape is the same for every column, so compute it once.
    n_plots = len(plot_types)
    grid_cols = min(n_cols, n_plots)
    n_rows = math.ceil(n_plots / n_cols)

    for col in numeric_cols:
        values = df_sample[col]
        fig, axes = plt.subplots(n_rows, grid_cols, figsize=(grid_cols*5, n_rows*4))
        axes = np.array(axes).flatten()

        for ax, plot_type in zip(axes, plot_types):
            if plot_type == "hist_kde":
                sns.histplot(values, kde=True, bins=30, color="steelblue", ax=ax)
                ax.set_title(f"Histogram & KDE of {col}")
            elif plot_type == "box":
                sns.boxplot(x=values, color="orange", ax=ax)
                ax.set_title(f"Boxplot of {col}")
            elif plot_type == "violin":
                sns.violinplot(y=values, color="purple", ax=ax)
                ax.set_title(f"Violin Plot of {col}")
            elif plot_type == "ecdf":
                sns.ecdfplot(values, color="green", ax=ax)
                ax.set_title(f"ECDF of {col}")
            elif plot_type == "qq":
                # probplot does not skip NaNs, so drop them before fitting.
                stats.probplot(values.dropna(), dist="norm", plot=ax)
                ax.set_title(f"Q-Q Plot of {col}")

        # Hide grid cells that received no plot (e.g. the 6th cell of a 2x3 grid).
        for spare in axes[n_plots:]:
            spare.set_visible(False)

        plt.tight_layout()
        plt.show()
        # One figure is created per column; release it so they do not pile up.
        plt.close(fig)

# Usage
self_comparison_dynamic(df)


# 6. Extra Ideas: Outlier Removal, Preprocessing, and Class Balancing


In [None]:
import numpy as np
import pandas as pd
from numba import njit, prange
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
import warnings
warnings.filterwarnings("ignore")

# imbalanced-learn is optional; the flag records whether it could be loaded.
# Catch Exception instead of using a bare except so KeyboardInterrupt and
# SystemExit still propagate, while any import-time failure is tolerated.
try:
    from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
    _IMBLEARN_OK = True
except Exception:
    _IMBLEARN_OK = False

# ====================== Fast LOF with Numba ======================
@njit(parallel=True)
def pairwise_distance(X):
    """Return the full matrix of Euclidean distances between the rows of ``X``.

    Args:
        X: Array indexed per sample as ``X[i]`` (assumed to be a 2-D NumPy
           array of shape (n_samples, n_features) — TODO confirm at caller).

    Returns:
        ``(n_samples, n_samples)`` float32 array whose ``[i, j]`` entry is the
        distance between rows ``i`` and ``j``.

    Note:
        The result holds ``n_samples ** 2`` values, so memory grows
        quadratically with the number of rows.
    """
    n_samples = X.shape[0]
    distances = np.empty((n_samples, n_samples), dtype=np.float32)
    # prange under parallel=True lets numba run the outer loop in parallel.
    for i in prange(n_samples):
        for j in range(n_samples):
            diff = X[i] - X[j]
            # sqrt(diff . diff) is the Euclidean norm of the difference vector.
            distances[i, j] = np.sqrt(np.dot(diff, diff))
    return distances

@njit
def local_reachability_density(distances, k):
    """Compute a local reachability density for every sample.

    Args:
        distances: Square matrix of pairwise distances.
        k: Number of neighbours considered per sample.

    Returns:
        float32 array of length ``n_samples``; entry ``i`` is ``k`` divided by
        the summed reachability distance to its neighbours, or 0 when that sum
        is not positive.

    NOTE(review): position ``k`` of each sorted row is read, so this
    presumably requires ``n_samples > k`` — confirm callers guarantee it.
    """
    n_samples = distances.shape[0]
    lrd = np.zeros(n_samples, dtype=np.float32)
    for i in range(n_samples):
        sorted_idx = np.argsort(distances[i])
        # Skip the first sorted entry (presumably the sample itself at
        # distance 0) and take the next k as neighbours.
        neighbors_idx = sorted_idx[1:k+1]
        reach_dist_sum = 0.0
        for j in neighbors_idx:
            # Distance from j to the entry at sorted position k of j's own row.
            k_dist_j = distances[j][np.argsort(distances[j])[k]]
            # Reachability distance: never smaller than j's k-distance.
            reach_dist = max(k_dist_j, distances[i, j])
            reach_dist_sum += reach_dist
        lrd[i] = k / reach_dist_sum if reach_dist_sum > 0 else 0
    return lrd

@njit
def lof_score(lrd, distances, k):
    """Compute a Local Outlier Factor style score for every sample.

    Args:
        lrd: Per-sample local reachability densities.
        distances: Square matrix of pairwise distances.
        k: Number of neighbours considered per sample.

    Returns:
        float32 array of length ``n_samples``; entry ``i`` is the sum over its
        neighbours of ``lrd[j] / lrd[i]``, divided by ``k``. A sample whose own
        density is not positive contributes nothing to the sum and scores 0.
    """
    n_samples = distances.shape[0]
    lof = np.zeros(n_samples, dtype=np.float32)
    for i in range(n_samples):
        sorted_idx = np.argsort(distances[i])
        # Same neighbour selection as in local_reachability_density.
        neighbors_idx = sorted_idx[1:k+1]
        sum_ratio = 0.0
        for j in neighbors_idx:
            # Guard against dividing by a zero density.
            if lrd[i] > 0:
                sum_ratio += lrd[j] / lrd[i]
        lof[i] = sum_ratio / k
    return lof

def smart_lof_mask(df, numeric_cols, k=20, threshold=1.5):
    """Return a boolean keep-mask based on the numba LOF kernels.

    Numeric columns are median-imputed, cast to float32 and standardised
    before distances are computed.

    Args:
        df: Frame holding the rows to score.
        numeric_cols: Columns used for the distance computation.
        k: Requested number of neighbours; reduced to ``len(df) - 1`` when the
            frame has fewer rows than ``k + 1``.
        threshold: Rows with a score above this value are marked as outliers.

    Returns:
        Boolean NumPy array of length ``len(df)``; ``True`` means keep the row.
    """
    n_samples = len(df)
    if n_samples < 2:
        # No neighbours to compare against: keep whatever rows exist.
        print("[INFO] LOF: removed 0 rows (0.00%)")
        return np.ones(n_samples, dtype=np.bool_)

    # The kernels read position k of each sorted distance row, so k must stay
    # below the number of rows or they would index past the end.
    k = max(1, min(k, n_samples - 1))

    X = df[numeric_cols].fillna(df[numeric_cols].median()).values.astype(np.float32)
    X = StandardScaler().fit_transform(X)
    distances = pairwise_distance(X)
    lrd = local_reachability_density(distances, k)
    lof = lof_score(lrd, distances, k)
    mask = lof <= threshold
    n_removed = (~mask).sum()
    print(f"[INFO] LOF: removed {n_removed} rows ({100*n_removed/n_samples:.2f}%)")
    return mask

# ====================== Smart Preprocessing + Balancing ======================
def smart_preprocess_balance_fast(
    df: pd.DataFrame,
    target_col: str,
    drop_cols: list = None,
    outlier_contamination: float = 0.01,
    balance_method: str = "smote",
    test_size: float = 0.2,
    random_state: int = 42
):
    """Remove outliers, split, preprocess and optionally balance a dataset.

    Steps, in order:
      1. Drop ``drop_cols`` (those present) from a copy of ``df``.
      2. Remove outlier rows using the numeric feature columns:
         IsolationForest for more than 10000 rows, EllipticEnvelope when the
         frame has at most 5 columns, otherwise ``smart_lof_mask``.
      3. Detect the task: an integer or object target with fewer than 20
         distinct values is classification, anything else regression.
      4. Train/test split (stratified for classification).
      5. Fit median-impute + scale (numeric) and most-frequent-impute +
         one-hot (categorical) on the training part, then transform both parts.
      6. For classification, oversample the training part when
         imbalanced-learn is available and ``balance_method`` is not "none".

    Args:
        df: Input frame; it is not modified.
        target_col: Name of the label column.
        drop_cols: Optional column names to discard first.
        outlier_contamination: Expected outlier share passed to
            IsolationForest / EllipticEnvelope.
        balance_method: "smote", "ros"/"randomoversampler", or "none".
        test_size: Fraction of rows held out for testing.
        random_state: Seed used by the outlier models, split and samplers.

    Returns:
        Tuple ``(X_train_proc, X_test_proc, y_train, y_test, preprocessor,
        feat_names)``. ``feat_names`` is ``None`` when the names could not be
        assembled.

    Raises:
        ValueError: If ``target_col`` is not a column after dropping.
    """
    df2 = df.copy()
    if drop_cols:
        df2.drop(columns=[c for c in drop_cols if c in df2.columns], inplace=True)

    if target_col not in df2.columns:
        raise ValueError(f"Target column '{target_col}' not found.")

    cat_cols = df2.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    if target_col in cat_cols:
        cat_cols.remove(target_col)
    num_cols = df2.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in num_cols:
        num_cols.remove(target_col)

    print(f"[INFO] Categorical cols: {cat_cols}")
    print(f"[INFO] Numerical cols: {num_cols}")

    # ---- Outlier Removal ----
    if len(num_cols) > 0:
        n_rows, n_cols_df = df2.shape
        method_used = None
        if n_rows > 10000:
            print("[INFO] Using IsolationForest for outlier removal")
            iso = IsolationForest(contamination=outlier_contamination, random_state=random_state)
            # fit_predict labels outliers as -1.
            mask = iso.fit_predict(df2[num_cols].fillna(df2[num_cols].median())) != -1
            method_used = "IsolationForest"
        elif n_cols_df <= 5:
            print("[INFO] Using EllipticEnvelope for outlier removal")
            env = EllipticEnvelope(contamination=outlier_contamination, random_state=random_state)
            mask = env.fit_predict(df2[num_cols].fillna(df2[num_cols].median())) != -1
            method_used = "EllipticEnvelope"
        else:
            print("[INFO] Using fast LOF for outlier removal")
            mask = smart_lof_mask(df2, num_cols)
            method_used = "LOF"

        df2 = df2.loc[mask].reset_index(drop=True)
        print(f"[INFO] Outlier removal done using {method_used}, new shape: {df2.shape}")

    # ---- Preprocessing Pipelines ----
    num_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_transformer = None
    if len(cat_cols) > 0:
        cat_transformer = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # Sparse output is the encoder's default. The explicit `sparse=`
            # keyword was removed in scikit-learn 1.4, so it is not passed.
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

    transformers = [("num", num_transformer, num_cols)]
    if cat_transformer:
        transformers.append(("cat", cat_transformer, cat_cols))

    preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

    X = df2.drop(columns=[target_col])
    y = df2[target_col]

    # ---- Detect Task Type ----
    if (pd.api.types.is_integer_dtype(y) or pd.api.types.is_object_dtype(y)) and y.nunique() < 20:
        task_type = "classification"
        if pd.api.types.is_object_dtype(y):
            y = y.astype('category').cat.codes
    else:
        task_type = "regression"
    print(f"[INFO] Detected task type: {task_type}")

    # ---- Split ----
    if task_type == "classification":
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y)
        print("[INFO] Stratified split applied.")
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        print("[INFO] Normal split applied.")

    # ---- Preprocess ----
    # Fit on the training part only so test rows do not influence the statistics.
    X_train_proc = preprocessor.fit_transform(X_train)
    X_test_proc = preprocessor.transform(X_test)

    # ---- Feature Names ----
    feat_names = None
    try:
        cat_names = []
        if cat_transformer:
            cat_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols)
        feat_names = np.concatenate([num_cols, cat_names])
    except Exception as exc:
        # Best effort: callers treat None as "names unavailable", but say why.
        print(f"[WARN] Could not build feature names: {exc}")

    # ---- Balancing ----
    if task_type == "classification" and _IMBLEARN_OK and balance_method.lower() != "none":
        sampler = None
        method = balance_method.lower()
        if method == "smote":
            # `n_jobs` is no longer passed: imbalanced-learn deprecated it on
            # the SMOTE samplers in 0.10.
            if len(cat_cols) > 0:
                try:
                    # NOTE(review): these indices address the one-hot columns of
                    # the transformed matrix — confirm this is the intended
                    # SMOTENC usage.
                    cat_idx = list(range(len(num_cols), X_train_proc.shape[1]))
                    sampler = SMOTENC(categorical_features=cat_idx, random_state=random_state)
                except Exception:
                    sampler = SMOTE(random_state=random_state)
            else:
                sampler = SMOTE(random_state=random_state)
        elif method in ["ros", "randomoversampler"]:
            sampler = RandomOverSampler(random_state=random_state)

        if sampler:
            X_train_proc, y_train = sampler.fit_resample(X_train_proc, y_train)
            print(f"[INFO] Balanced training set: {len(y_train)} samples")

    return X_train_proc, X_test_proc, y_train, y_test, preprocessor, feat_names

# Run outlier removal, splitting, preprocessing and balancing on the working frame.
(X_train_proc, X_test_proc, y_train, y_test,
 preprocessor, feat_names) = smart_preprocess_balance_fast(
    df=df,
    target_col=target_col,
    outlier_contamination=0.01,
    balance_method="smote",
    test_size=0.2,
    random_state=42,
)

# Preview the transformed training matrix when feature names are available;
# a sparse result is densified first.
if feat_names is not None:
    X_train_df = pd.DataFrame(
        data=X_train_proc.toarray() if hasattr(X_train_proc, 'toarray') else X_train_proc,
        columns=feat_names,
    )
    print(X_train_df.head())

In [None]:
import warnings
warnings.filterwarnings("ignore")
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score, mean_squared_error

# Classic models
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor, BayesianRidge, PassiveAggressiveClassifier, PassiveAggressiveRegressor, Perceptron, LinearRegression, HuberRegressor, Lars, LassoLars, OrthogonalMatchingPursuit, ARDRegression, TweedieRegressor, PoissonRegressor, GammaRegressor, QuantileRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor, BaggingClassifier, BaggingRegressor, VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, RadiusNeighborsClassifier, RadiusNeighborsRegressor, NearestCentroid
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Optional models
# Each flag records whether the corresponding optional library could be loaded.
# `except Exception` replaces the bare `except:` so KeyboardInterrupt and
# SystemExit are not swallowed, while any import-time failure (including a
# missing native library) still just disables the feature.
try:
    from lightgbm import LGBMClassifier, LGBMRegressor
    _LGBM_OK = True
except Exception:
    _LGBM_OK = False
try:
    from xgboost import XGBClassifier, XGBRegressor
    _XGB_OK = True
except Exception:
    _XGB_OK = False
try:
    from catboost import CatBoostClassifier, CatBoostRegressor
    _CAT_OK = True
except Exception:
    _CAT_OK = False
try:
    from prophet import Prophet
    _PROPHET_OK = True
except Exception:
    _PROPHET_OK = False
try:
    from pmdarima import auto_arima
    _PMDARIMA_OK = True
except Exception:
    _PMDARIMA_OK = False

#  Numba for fast sorting (used in feature importance, etc) 
from numba import njit

@njit
def numba_argsort(arr):
    """Return ``np.argsort(arr)``, compiled through numba's ``njit``."""
    return np.argsort(arr)

def print_step(msg):
    """Print ``msg`` as a banner: a blank line, then the text between two runs of ten ``=``."""
    rule = "=" * 10
    print(f"\n{rule} {msg} {rule}")

def detect_problem_type(df, target_col, time_col=None):
    """Classify the modelling task as time series, classification or regression.

    Args:
        df: Frame containing the target (and optionally the time column).
        target_col: Name of the target column.
        time_col: Optional column name; when present in ``df`` and parseable
            by ``pd.to_datetime`` the task is reported as a time series.

    Returns:
        ``"time_series"``, ``"classification"`` or ``"regression"``. A numeric
        target counts as classification only when all non-null values are
        whole numbers and there are at most 20 distinct values; a non-numeric
        target is always classification.
    """
    if time_col and time_col in df.columns:
        try:
            pd.to_datetime(df[time_col])
        except (ValueError, TypeError, OverflowError):
            # Not parseable as dates: fall through to the target-based rules.
            # Only conversion errors are caught, so interrupts still propagate.
            pass
        else:
            return "time_series"

    y = df[target_col]
    if pd.api.types.is_numeric_dtype(y):
        all_whole_numbers = (y.dropna() % 1 == 0).all()
        if all_whole_numbers and y.nunique(dropna=True) <= 20:
            return "classification"
        return "regression"
    return "classification"

def preprocess_data(df, target_col, time_col=None):
    """Impute, scale and one-hot encode the features of ``df``.

    Numeric columns are median-imputed and standardised; object, category and
    bool columns are most-frequent-imputed and one-hot encoded to a dense
    array. Columns of any other dtype are not passed to a transformer.

    Args:
        df: Input frame; a copy is processed.
        target_col: Name of the target column, excluded from the features.
        time_col: Optional column to sort by before processing.

    Returns:
        Tuple ``(X_proc, y, feat_names)``: the transformed feature matrix, the
        target Series, and the list of output feature names.

    NOTE(review): the transformers are fitted on all rows, i.e. before any
    train/test split done by the caller — confirm this is intended.
    """
    df = df.copy()
    if time_col and time_col in df.columns:
        df = df.sort_values(time_col)
    cat_cols = df.select_dtypes(include=['object','category','bool']).columns.tolist()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in cat_cols: cat_cols.remove(target_col)
    if target_col in num_cols: num_cols.remove(target_col)
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    if cat_cols:
        # `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and the
        # old keyword was removed in 1.4; try the new name, fall back to the old.
        try:
            onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", onehot)
        ])
    else:
        cat_pipe = "drop"
    preprocessor = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ])
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_proc = preprocessor.fit_transform(X)
    feat_names = []
    feat_names.extend(num_cols)
    if cat_cols:
        feat_names.extend(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols))
    return X_proc, y, feat_names

def get_models(task, n_samples=None):
    """Build the list of candidate estimators for a task.

    Args:
        task: ``"classification"`` selects classifiers; any other value
            selects regressors.
        n_samples: Optional training-set size. Above 50000 rows, iteration
            and estimator counts are reduced to keep training time down.

    Returns:
        List of ``(name, estimator)`` tuples. LightGBM, XGBoost and CatBoost
        entries are appended only when the respective import flag is set.
    """
    # Local import: RidgeClassifier is not part of this file's module-level imports.
    from sklearn.linear_model import RidgeClassifier

    # Reduce max_iter for heavy models if data is large
    large = n_samples is not None and n_samples > 50000
    heavy_iter = 200 if large else 2000
    mlp_iter = 100 if large else 300
    n_trees = 50 if large else 200  # shared by all tree/boosting ensembles

    models = []
    if task == "classification":
        models += [
            ("RandomForest", RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, random_state=42)),
            ("ExtraTrees", ExtraTreesClassifier(n_estimators=n_trees, n_jobs=-1, random_state=42)),
            ("GradientBoosting", GradientBoostingClassifier(n_estimators=n_trees, random_state=42)),
            ("HistGB", HistGradientBoostingClassifier(max_iter=n_trees, random_state=42)),
            ("DecisionTree", DecisionTreeClassifier(random_state=42)),
            ("ExtraTree", ExtraTreeClassifier(random_state=42)),
            ("AdaBoost", AdaBoostClassifier(n_estimators=n_trees, random_state=42)),
            ("Bagging", BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=42)),
            ("LogisticRegression", LogisticRegression(max_iter=heavy_iter, n_jobs=-1, random_state=42)),
            ("SGDClassifier", SGDClassifier(max_iter=heavy_iter, random_state=42)),
            ("LinearSVC", LinearSVC(max_iter=heavy_iter, random_state=42)),
            # Was Ridge(), a regressor, which cannot be scored as a classifier.
            ("RidgeClassifier", RidgeClassifier()),
            ("PassiveAggressiveClassifier", PassiveAggressiveClassifier(max_iter=1000, random_state=42)),
            ("Perceptron", Perceptron(max_iter=1000, random_state=42)),
            ("GaussianNB", GaussianNB()),
            ("BernoulliNB", BernoulliNB()),
            # NOTE(review): MultinomialNB/ComplementNB presumably reject the
            # negative values produced by standard scaling — confirm.
            ("MultinomialNB", MultinomialNB()),
            ("ComplementNB", ComplementNB()),
            ("LDA", LinearDiscriminantAnalysis()),
            ("QDA", QuadraticDiscriminantAnalysis()),
            ("KNN", KNeighborsClassifier(n_jobs=-1)),
            ("RadiusNN", RadiusNeighborsClassifier(n_jobs=-1, outlier_label=0)),
            ("NearestCentroid", NearestCentroid()),
            ("MLP", MLPClassifier(hidden_layer_sizes=(64,32), max_iter=mlp_iter, random_state=42)),
        ]
        if _LGBM_OK: models.append(("LightGBM", LGBMClassifier(n_estimators=n_trees, n_jobs=-1, random_state=42)))
        if _XGB_OK:  models.append(("XGBoost", XGBClassifier(n_estimators=n_trees, n_jobs=-1, random_state=42, verbosity=0, use_label_encoder=False)))
        if _CAT_OK:  models.append(("CatBoost", CatBoostClassifier(n_estimators=n_trees, verbose=0, random_state=42)))
    else:
        models += [
            ("RandomForest", RandomForestRegressor(n_estimators=n_trees, n_jobs=-1, random_state=42)),
            ("ExtraTrees", ExtraTreesRegressor(n_estimators=n_trees, n_jobs=-1, random_state=42)),
            ("GradientBoosting", GradientBoostingRegressor(n_estimators=n_trees, random_state=42)),
            ("HistGB", HistGradientBoostingRegressor(max_iter=n_trees, random_state=42)),
            ("DecisionTree", DecisionTreeRegressor(random_state=42)),
            ("ExtraTree", ExtraTreeRegressor(random_state=42)),
            ("AdaBoost", AdaBoostRegressor(n_estimators=n_trees, random_state=42)),
            ("Bagging", BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=42)),
            ("LinearRegression", LinearRegression()),
            ("Ridge", Ridge()),
            ("Lasso", Lasso()),
            ("ElasticNet", ElasticNet()),
            ("SGDRegressor", SGDRegressor(max_iter=heavy_iter, random_state=42)),
            ("BayesianRidge", BayesianRidge()),
            ("HuberRegressor", HuberRegressor()),
            ("Lars", Lars()),
            ("LassoLars", LassoLars()),
            ("OrthogonalMatchingPursuit", OrthogonalMatchingPursuit()),
            ("ARDRegression", ARDRegression()),
            ("TweedieRegressor", TweedieRegressor()),
            ("PoissonRegressor", PoissonRegressor()),
            ("GammaRegressor", GammaRegressor()),
            ("QuantileRegressor", QuantileRegressor()),
            ("PassiveAggressiveRegressor", PassiveAggressiveRegressor(max_iter=1000, random_state=42)),
            ("KNN", KNeighborsRegressor(n_jobs=-1)),
            ("RadiusNN", RadiusNeighborsRegressor(n_jobs=-1)),
            ("LinearSVR", LinearSVR(max_iter=heavy_iter, random_state=42)),
            ("MLP", MLPRegressor(hidden_layer_sizes=(64,32), max_iter=mlp_iter, random_state=42)),
        ]
        if _LGBM_OK: models.append(("LightGBM", LGBMRegressor(n_estimators=n_trees, n_jobs=-1, random_state=42)))
        if _XGB_OK:  models.append(("XGBoost", XGBRegressor(n_estimators=n_trees, n_jobs=-1, random_state=42, verbosity=0)))
        if _CAT_OK:  models.append(("CatBoost", CatBoostRegressor(n_estimators=n_trees, verbose=0, random_state=42)))
    return models

def auto_ml_ultra(df, target_col, time_col=None, min_score=0.88):
    """Run the end-to-end AutoML pipeline on ``df`` and report the best model.

    The task is obtained from ``detect_problem_type``. For ``"time_series"``
    the optional Prophet / auto_arima forecasts are printed and the function
    returns. Otherwise the data is preprocessed, split 80/20, every candidate
    from ``get_models`` is scored with 3-fold cross-validation, Voting and
    Stacking ensembles are built from the ten best candidates, and the winner
    is refit on the training split, saved to ``best_model.pkl`` and evaluated
    on the hold-out split. A summary table of all models is displayed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target_col`` (and ``time_col`` when given).
    target_col : str
        Name of the column to predict.
    time_col : str or None, default None
        Optional time column forwarded to task detection and preprocessing.
    min_score : float, default 0.88
        A warning is printed when the hold-out accuracy (classification) or
        R2 (regression) is below this value.

    Returns
    -------
    None
        Results are printed/displayed; the fitted model is written to disk.
    """
    import joblib

    def _show_summary(rows):
        # Render the per-model table, best CV score first (NaN rows sort last).
        print_step("Summary Table (All Models)")
        df_results = pd.DataFrame(rows)
        if "CV_Mean" in df_results.columns:
            df_results = df_results.sort_values("CV_Mean", ascending=False)
        display(df_results.reset_index(drop=True))

    t0 = time.time()
    print_step("Detecting problem type")
    task = detect_problem_type(df, target_col, time_col)
    print(f"[INFO] Task: {task}")

    if task == "time_series":
        if _PROPHET_OK:
            print_step("Prophet Forecasting")
            df2 = df[[time_col, target_col]].rename(columns={time_col: "ds", target_col: "y"})
            m = Prophet()
            m.fit(df2)
            future = m.make_future_dataframe(periods=10)
            forecast = m.predict(future)
            print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
        if _PMDARIMA_OK:
            print_step("ARIMA Forecasting")
            model = auto_arima(df[target_col], seasonal=False, trace=True)
            print(model.summary())
        print("[INFO] Time Series task finished.")
        return

    print_step("Preprocessing")
    X_proc, y, feat_names = preprocess_data(df, target_col, time_col)
    print(f"[INFO] Features: {X_proc.shape[1]} | Samples: {X_proc.shape[0]}")

    print_step("Splitting data")
    if task == "classification":
        X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42, stratify=y)
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        score_metric = "accuracy"
    else:
        X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42)
        cv = KFold(n_splits=3, shuffle=True, random_state=42)
        score_metric = "r2"
    print(f"[INFO] Train: {X_train.shape}, Test: {X_test.shape}")

    def _cv_score(estimator):
        # cross_val_score can hand back NaN instead of raising (its default
        # error_score is NaN). A NaN mean would corrupt the ranking below, so
        # it is promoted to an error and handled like any other failure.
        scores = cross_val_score(estimator, X_train, y_train, scoring=score_metric, cv=cv, n_jobs=-1)
        mean, std = scores.mean(), scores.std()
        if np.isnan(mean):
            raise ValueError("cross-validation produced a NaN score")
        return mean, std

    print_step("Model selection & training")
    models = get_models(task, n_samples=X_train.shape[0])
    results = []
    results_table = []
    for idx, (name, model) in enumerate(models):
        t1 = time.time()
        try:
            score, std = _cv_score(model)
            elapsed = time.time() - t1
            print(f"[{idx+1:02d}/{len(models)}] {name}: CV {score_metric} = {score:.4f} ± {std:.4f} | Time: {elapsed:.1f}s")
            results.append((score, name, model))
            results_table.append({
                "Model": name,
                "CV_Mean": score,
                "CV_Std": std,
                "Time_sec": elapsed
            })
        except Exception as e:
            print(f"[{idx+1:02d}/{len(models)}] {name}: ERROR {e}")
            results_table.append({
                "Model": name,
                "CV_Mean": np.nan,
                "CV_Std": np.nan,
                "Time_sec": time.time()-t1,
                "Error": str(e)
            })

    if not results:
        # Every candidate failed: there is nothing to rank, fit or save.
        print("[ERROR] No model could be cross-validated; nothing to train or save.")
        print(f"\n[INFO] Total pipeline time: {time.time()-t0:.1f} sec.")
        _show_summary(results_table)
        return

    # Rank on the score only, so estimator objects are never compared.
    results.sort(key=lambda item: item[0], reverse=True)
    top_models = results[:10]
    best_score, best_name, best_model = top_models[0]

    # Voting/Stacking
    print_step("Ensemble (Voting/Stacking)")
    ensembles = []
    try:
        base = [(n, m) for _, n, m in top_models]
        if task == "classification":
            # Soft voting averages predict_proba outputs; fall back to hard
            # voting when any selected model cannot produce probabilities.
            soft_ok = all(hasattr(m, "predict_proba") for _, m in base)
            if not soft_ok:
                print("[INFO] Some top models lack predict_proba; using hard voting.")
            ensembles = [
                ("Voting", VotingClassifier(estimators=list(base), voting='soft' if soft_ok else 'hard', n_jobs=-1)),
                ("Stacking", StackingClassifier(estimators=list(base), final_estimator=LogisticRegression(max_iter=2000), n_jobs=-1)),
            ]
        else:
            ensembles = [
                ("Voting", VotingRegressor(estimators=list(base), n_jobs=-1)),
                ("Stacking", StackingRegressor(estimators=list(base), final_estimator=Ridge(), n_jobs=-1)),
            ]
    except Exception as e:
        print(f"Ensemble Error: {e}")

    for ens_name, ens_model in ensembles:
        t1 = time.time()
        # Each ensemble is guarded separately so one failure does not skip the other.
        try:
            score, std = _cv_score(ens_model)
        except Exception as e:
            print(f"Ensemble Error ({ens_name}): {e}")
            results_table.append({
                "Model": ens_name,
                "CV_Mean": np.nan,
                "CV_Std": np.nan,
                "Time_sec": time.time()-t1,
                "Error": str(e)
            })
            continue
        elapsed = time.time() - t1
        print(f"[Ensemble] {ens_name}: CV {score_metric} = {score:.4f} ± {std:.4f} | Time: {elapsed:.1f}s")
        results_table.append({
            "Model": ens_name,
            "CV_Mean": score,
            "CV_Std": std,
            "Time_sec": elapsed
        })
        if score > best_score:
            best_score, best_name, best_model = score, ens_name, ens_model

    print_step(f"Best Model: {best_name} (CV {score_metric}: {best_score:.4f})")
    best_model.fit(X_train, y_train)
    # Save best model
    joblib.dump(best_model, "best_model.pkl")
    print("[INFO] Best model saved as best_model.pkl")
    y_pred = best_model.predict(X_test)
    if task == "classification":
        acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
        cm = confusion_matrix(y_test, y_pred)
        print("Confusion Matrix:\n", cm)
        if acc < min_score:
            print(f"\n[WARNING] Accuracy is less than {min_score*100:.1f}%!")
    else:
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Test R2: {r2:.4f}, MSE: {mse:.4f}")
        if r2 < min_score:
            print(f"\n[WARNING] R2 is less than {min_score*100:.1f}%!")

    if hasattr(best_model, "feature_importances_"):
        importances = best_model.feature_importances_
        # numba_argsort is defined elsewhere in the notebook; presumably it
        # returns ascending sort indices like np.argsort -- TODO confirm.
        idx = numba_argsort(importances)[::-1][:10]
        print_step("Top 10 Features")
        for i in idx:
            print(f"{feat_names[i]}: {importances[i]:.4f}")

    print(f"\n[INFO] Total pipeline time: {time.time()-t0:.1f} sec.")

    _show_summary(results_table)


# Run the pipeline on the notebook's dataframe; no time column is supplied.
auto_ml_ultra(df, target_col, time_col=None)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import time
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold-out baseline: bagged decision trees trained on the notebook's current `df`.
X = df.drop(target_col, axis=1)
y = df[target_col]

# Stratified 80/20 split so class proportions match in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), while the first positional slot is the base learner under both names.
model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
    n_jobs=-1
)

# Time only the fit; prediction and reporting are excluded from the figure.
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Bagging: Test accuracy = {accuracy:.4f} | Time: {end_time - start_time:.1f}s")
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)

# Persist the fitted ensemble to the working directory.
joblib.dump(model, "bagging_model.pkl")
print("bagging_model.pkl saved")


In [None]:
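# NOTE: everything in this cell is a commented-out variant of the AutoML
# pipeline (fixed estimator counts, np.argsort for feature ranking, no model
# saving). It is inert unless uncommented.
# import warnings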
# warnings.filterwarnings("ignore")
# import time
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score, mean_squared_error

# # Classic models
# from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor, BayesianRidge, PassiveAggressiveClassifier, PassiveAggressiveRegressor, Perceptron, LinearRegression, HuberRegressor, Lars, LassoLars, OrthogonalMatchingPursuit, ARDRegression, TweedieRegressor, PoissonRegressor, GammaRegressor, QuantileRegressor
# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor, BaggingClassifier, BaggingRegressor, VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
# from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
# from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, RadiusNeighborsClassifier, RadiusNeighborsRegressor, NearestCentroid
# from sklearn.svm import LinearSVC, LinearSVR
# from sklearn.neural_network import MLPClassifier, MLPRegressor
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# # Optional models
# try:
#     from lightgbm import LGBMClassifier, LGBMRegressor
#     _LGBM_OK = True
# except: _LGBM_OK = False
# try:
#     from xgboost import XGBClassifier, XGBRegressor
#     _XGB_OK = True
# except: _XGB_OK = False
# try:
#     from catboost import CatBoostClassifier, CatBoostRegressor
#     _CAT_OK = True
# except: _CAT_OK = False
# try:
#     from prophet import Prophet
#     _PROPHET_OK = True
# except: _PROPHET_OK = False
# try:
#     from pmdarima import auto_arima
#     _PMDARIMA_OK = True
# except: _PMDARIMA_OK = False

# def print_step(msg):
#     print(f"\n{'='*10} {msg} {'='*10}")

# def detect_problem_type(df, target_col, time_col=None):
#     if time_col and time_col in df.columns:
#         try:
#             pd.to_datetime(df[time_col])
#             return "time_series"
#         except: pass
#     y = df[target_col]
#     if pd.api.types.is_numeric_dtype(y):
#         if (y.dropna() % 1 == 0).all() and y.nunique(dropna=True) <= 20:
#             return "classification"
#         else:
#             return "regression"
#     return "classification"

# def preprocess_data(df, target_col, time_col=None):
#     df = df.copy()
#     if time_col and time_col in df.columns:
#         df = df.sort_values(time_col)
#     cat_cols = df.select_dtypes(include=['object','category','bool']).columns.tolist()
#     num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
#     if target_col in cat_cols: cat_cols.remove(target_col)
#     if target_col in num_cols: num_cols.remove(target_col)
#     num_pipe = Pipeline([
#         ("imputer", SimpleImputer(strategy="median")),
#         ("scaler", StandardScaler())
#     ])
#     cat_pipe = Pipeline([
#         ("imputer", SimpleImputer(strategy="most_frequent")),
#         ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
#     ]) if cat_cols else "drop"
#     preprocessor = ColumnTransformer([
#         ("num", num_pipe, num_cols),
#         ("cat", cat_pipe, cat_cols)
#     ])
#     X = df.drop(columns=[target_col])
#     y = df[target_col]
#     X_proc = preprocessor.fit_transform(X)
#     feat_names = []
#     feat_names.extend(num_cols)
#     if cat_cols:
#         feat_names.extend(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols))
#     return X_proc, y, feat_names

# def get_models(task):
#     models = []
#     if task == "classification":
#         models += [
#             ("RandomForest", RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)),
#             ("ExtraTrees", ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=42)),
#             ("GradientBoosting", GradientBoostingClassifier(n_estimators=200, random_state=42)),
#             ("HistGB", HistGradientBoostingClassifier(max_iter=200, random_state=42)),
#             ("DecisionTree", DecisionTreeClassifier(random_state=42)),
#             ("ExtraTree", ExtraTreeClassifier(random_state=42)),
#             ("AdaBoost", AdaBoostClassifier(n_estimators=200, random_state=42)),
#             ("Bagging", BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=42)),
#             ("LogisticRegression", LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42)),
#             ("SGDClassifier", SGDClassifier(max_iter=2000, random_state=42)),
#             ("LinearSVC", LinearSVC(max_iter=2000, random_state=42)),
#             ("RidgeClassifier", Ridge()),
#             ("PassiveAggressiveClassifier", PassiveAggressiveClassifier(max_iter=1000, random_state=42)),
#             ("Perceptron", Perceptron(max_iter=1000, random_state=42)),
#             ("GaussianNB", GaussianNB()),
#             ("BernoulliNB", BernoulliNB()),
#             ("MultinomialNB", MultinomialNB()),
#             ("ComplementNB", ComplementNB()),
#             ("LDA", LinearDiscriminantAnalysis()),
#             ("QDA", QuadraticDiscriminantAnalysis()),
#             ("KNN", KNeighborsClassifier(n_jobs=-1)),
#             ("RadiusNN", RadiusNeighborsClassifier(n_jobs=-1, outlier_label=0)),
#             ("NearestCentroid", NearestCentroid()),
#             ("MLP", MLPClassifier(hidden_layer_sizes=(64,32), max_iter=300, random_state=42)),
#         ]
#         if _LGBM_OK: models.append(("LightGBM", LGBMClassifier(n_estimators=200, n_jobs=-1, random_state=42)))
#         if _XGB_OK:  models.append(("XGBoost", XGBClassifier(n_estimators=200, n_jobs=-1, random_state=42, verbosity=0, use_label_encoder=False)))
#         if _CAT_OK:  models.append(("CatBoost", CatBoostClassifier(n_estimators=200, verbose=0, random_state=42)))
#     else:
#         models += [
#             ("RandomForest", RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)),
#             ("ExtraTrees", ExtraTreesRegressor(n_estimators=200, n_jobs=-1, random_state=42)),
#             ("GradientBoosting", GradientBoostingRegressor(n_estimators=200, random_state=42)),
#             ("HistGB", HistGradientBoostingRegressor(max_iter=200, random_state=42)),
#             ("DecisionTree", DecisionTreeRegressor(random_state=42)),
#             ("ExtraTree", ExtraTreeRegressor(random_state=42)),
#             ("AdaBoost", AdaBoostRegressor(n_estimators=200, random_state=42)),
#             ("Bagging", BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=42)),
#             ("LinearRegression", LinearRegression()),
#             ("Ridge", Ridge()),
#             ("Lasso", Lasso()),
#             ("ElasticNet", ElasticNet()),
#             ("SGDRegressor", SGDRegressor(max_iter=2000, random_state=42)),
#             ("BayesianRidge", BayesianRidge()),
#             ("HuberRegressor", HuberRegressor()),
#             ("Lars", Lars()),
#             ("LassoLars", LassoLars()),
#             ("OrthogonalMatchingPursuit", OrthogonalMatchingPursuit()),
#             ("ARDRegression", ARDRegression()),
#             ("TweedieRegressor", TweedieRegressor()),
#             ("PoissonRegressor", PoissonRegressor()),
#             ("GammaRegressor", GammaRegressor()),
#             ("QuantileRegressor", QuantileRegressor()),
#             ("PassiveAggressiveRegressor", PassiveAggressiveRegressor(max_iter=1000, random_state=42)),
#             ("KNN", KNeighborsRegressor(n_jobs=-1)),
#             ("RadiusNN", RadiusNeighborsRegressor(n_jobs=-1)),

#             ("LinearSVR", LinearSVR(max_iter=2000, random_state=42)),
#             ("MLP", MLPRegressor(hidden_layer_sizes=(64,32), max_iter=300, random_state=42)),
#         ]
#         if _LGBM_OK: models.append(("LightGBM", LGBMRegressor(n_estimators=200, n_jobs=-1, random_state=42)))
#         if _XGB_OK:  models.append(("XGBoost", XGBRegressor(n_estimators=200, n_jobs=-1, random_state=42, verbosity=0)))
#         if _CAT_OK:  models.append(("CatBoost", CatBoostRegressor(n_estimators=200, verbose=0, random_state=42)))
#     return models

# def auto_ml_ultra(df, target_col, time_col=None, min_score=0.88):
#     t0 = time.time()
#     print_step("Detecting problem type")
#     task = detect_problem_type(df, target_col, time_col)
#     print(f"[INFO] Task: {task}")

#     if task == "time_series":
#         if _PROPHET_OK:
#             print_step("Prophet Forecasting")
#             df2 = df[[time_col, target_col]].rename(columns={time_col: "ds", target_col: "y"})
#             m = Prophet()
#             m.fit(df2)
#             future = m.make_future_dataframe(periods=10)
#             forecast = m.predict(future)
#             print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
#         if _PMDARIMA_OK:
#             print_step("ARIMA Forecasting")
#             model = auto_arima(df[target_col], seasonal=False, trace=True)
#             print(model.summary())
#         print("[INFO] Time Series task finished.")
#         return

#     print_step("Preprocessing")
#     X_proc, y, feat_names = preprocess_data(df, target_col, time_col)
#     print(f"[INFO] Features: {X_proc.shape[1]} | Samples: {X_proc.shape[0]}")

#     print_step("Splitting data")
#     if task == "classification":
#         X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42, stratify=y)
#         cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#         score_metric = "accuracy"
#     else:
#         X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42)
#         cv = KFold(n_splits=3, shuffle=True, random_state=42)
#         score_metric = "r2"
#     print(f"[INFO] Train: {X_train.shape}, Test: {X_test.shape}")

#     print_step("Model selection & training")
#     models = get_models(task)
#     results = []
#     results_table = []
#     for idx, (name, model) in enumerate(models):
#         t1 = time.time()
#         try:
#             scores = cross_val_score(model, X_train, y_train, scoring=score_metric, cv=cv, n_jobs=-1)
#             score = scores.mean()
#             std = scores.std()
#             print(f"[{idx+1:02d}/{len(models)}] {name}: CV {score_metric} = {score:.4f} ± {std:.4f} | Time: {time.time()-t1:.1f}s")
#             results.append((score, name, model))
#             results_table.append({
#                 "Model": name,
#                 "CV_Mean": score,
#                 "CV_Std": std,
#                 "Time_sec": time.time()-t1
#             })
#         except Exception as e:
#             print(f"[{idx+1:02d}/{len(models)}] {name}: ERROR {e}")
#             results_table.append({
#                 "Model": name,
#                 "CV_Mean": np.nan,
#                 "CV_Std": np.nan,
#                 "Time_sec": time.time()-t1,
#                 "Error": str(e)
#             })
#     results.sort(reverse=True)
#     top_models = results[:10]
#     best_score, best_name, best_model = top_models[0]

#     # Voting/Stacking
#     print_step("Ensemble (Voting/Stacking)")
#     try:
#         if task == "classification":
#             voting = VotingClassifier(estimators=[(n, m) for _, n, m in top_models], voting='soft', n_jobs=-1)
#             stacking = StackingClassifier(estimators=[(n, m) for _, n, m in top_models], final_estimator=LogisticRegression(max_iter=2000), n_jobs=-1)
#         else:
#             voting = VotingRegressor(estimators=[(n, m) for _, n, m in top_models], n_jobs=-1)
#             stacking = StackingRegressor(estimators=[(n, m) for _, n, m in top_models], final_estimator=Ridge(), n_jobs=-1)
#         for ens_name, ens_model in [("Voting", voting), ("Stacking", stacking)]:
#             t1 = time.time()
#             scores = cross_val_score(ens_model, X_train, y_train, scoring=score_metric, cv=cv, n_jobs=-1)
#             score = scores.mean()
#             std = scores.std()
#             print(f"[Ensemble] {ens_name}: CV {score_metric} = {score:.4f} ± {std:.4f} | Time: {time.time()-t1:.1f}s")
#             results.append((score, ens_name, ens_model))
#             results_table.append({
#                 "Model": ens_name,
#                 "CV_Mean": score,
#                 "CV_Std": std,
#                 "Time_sec": time.time()-t1
#             })
#             if score > best_score:
#                 best_score, best_name, best_model = score, ens_name, ens_model
#     except Exception as e:
#         print(f"Ensemble Error: {e}")

#     print_step(f"Best Model: {best_name} (CV {score_metric}: {best_score:.4f})")
#     best_model.fit(X_train, y_train)
#     y_pred = best_model.predict(X_test)
#     if task == "classification":
#         acc = accuracy_score(y_test, y_pred)
#         print(f"Test Accuracy: {acc:.4f}")
#         print(classification_report(y_test, y_pred))
#         cm = confusion_matrix(y_test, y_pred)
#         print("Confusion Matrix:\n", cm)
#         if acc < min_score:
#             print(f"\n[WARNING] Accuracy is less than {min_score*100:.1f}%!")
#     else:
#         mse = mean_squared_error(y_test, y_pred)
#         r2 = r2_score(y_test, y_pred)
#         print(f"Test R2: {r2:.4f}, MSE: {mse:.4f}")
#         if r2 < min_score:
#             print(f"\n[WARNING] R2 is less than {min_score*100:.1f}%!")

#     if hasattr(best_model, "feature_importances_"):
#         importances = best_model.feature_importances_
#         idx = np.argsort(importances)[::-1][:10]
#         print_step("Top 10 Features")
#         for i in idx:
#             print(f"{feat_names[i]}: {importances[i]:.4f}")

#     print(f"\n[INFO] Total pipeline time: {time.time()-t0:.1f} sec.")

#     print_step("Summary Table (All Models)")
#     df_results = pd.DataFrame(results_table)
#     df_results = df_results.sort_values("CV_Mean", ascending=False)
#     display(df_results.reset_index(drop=True))


# auto_ml_ultra(df, target_col, time_col=None)

 

In [None]:
# import pkg_resources
# output_file = "requirements.txt"
# packages = [
#     "numpy",
#     "pandas",
#     "numba",
#     "scikit-learn",
#     "matplotlib",
#     "seaborn"
# ]
# with open(output_file, "w") as f:
#     for package in packages:
#         try:
#             version = pkg_resources.get_distribution(package).version
#             f.write(f"{package}=={version}\n")
#         except pkg_resources.DistributionNotFound:
#             f.write(f"{package}\n")  
# print({output_file})
