<a href="https://colab.research.google.com/github/IngaSamoneneko/1D-to-2D-distributions/blob/main/1D-to-2D%20densities%2085%20accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from typing import List, Tuple, Union

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


In [1]:
# Generate non-overlapping circles in a unit square
def generate_circles(num_circles, radius):
    positions = []
    for _ in range(num_circles):
        while True:
            x = np.random.uniform(radius, 1 - radius)
            y = np.random.uniform(radius, 1 - radius)
            if all(np.sqrt((x - px) ** 2 + (y - py) ** 2) >= 2 * radius for px, py in positions):
                positions.append((x, y))
                break
    return positions

# Collect chord lengths from all circle for one secant (one cut-line)
def calculate_chord_lengths_for_cut(a: float, b: float,
                                  positions: List[Tuple[float, float]],
                                  radius: float) -> List[float]:
    return [calculate_chord_length(a, b, pos, radius) for pos in positions]

# Calculate chord length for a single circle and line
def calculate_chord_length(a: float, b: float,
                         center: Tuple[float, float],
                         radius: float) -> float:
    x0, y0 = center
    distance = abs(a*x0 - y0 + b)/np.sqrt(a**2 + 1)
    return 2*np.sqrt(max(0, radius**2 - distance**2))

# Discretize a vector of chord lengths into bins
def discretize_chords(chord_lengths: List[float],
                      num_bins: int = 15,
                      max_length: float = 0.16) -> np.ndarray:
    bin_edges = np.linspace(0, max_length, num_bins + 1)
    hist = np.zeros(num_bins, dtype=int)

    for val in chord_lengths:
        if val == 0.0:
            continue            # this helps me to avoid collection of zeros in the first bin
        if val == max_length:
            hist[-1] += 1
            continue
        for i in range(num_bins):
            if bin_edges[i] <= val < bin_edges[i + 1]:
                hist[i] += 1
                break

    return hist

# Generate multiple discretized vectors
def generate_distribution(positions: List[Tuple[float, float]],
                        radius: float,
                        num_cuts: int,
                        num_bins: int) -> List[np.ndarray]:
    distribution = []
    for _ in range(num_cuts):
        #x1, y1 = np.random.rand(2)
        #x2, y2 = np.random.rand(2)
        #while x1 == x2:
        #    x2 = np.random.rand()
        #a = (y2 - y1) / (x2 - x1)
        #b = y1 - a * x1
        a = 0
        b = np.random.uniform(0, 1)
        chords = calculate_chord_lengths_for_cut(a, b, positions, radius)
        discretized = discretize_chords(chords, num_bins)
        distribution.append(discretized)
    return distribution

# this helps to test models - the sample is in the right copy-and-paste format
def sample_from(distribution: List[np.ndarray]) -> Union[np.ndarray, None]:
    """Get first non-empty discretized vector"""
    for vec in distribution:
        if np.any(vec > 0):  # Check if any bin has counts
            return vec
    return None

In [2]:
# Parameters
NUM_CUTS = 1000  # Number of horizontal cuts
NUM_BINS = 10 #feature vectors
RADIUS = 0.08  # Radius of each circle

positions_21 = generate_circles(21,RADIUS)
distribution_21 = generate_distribution(positions_21, RADIUS, NUM_CUTS, NUM_BINS) #discretised vectors

positions_11 = generate_circles(11, RADIUS)
distribution_11 = generate_distribution(positions_11, RADIUS, NUM_CUTS, NUM_BINS) #discretised vectors

In [3]:
# Flatten vectors and create (X, y) pairs
def create_X_y(distribution, label):
    X = distribution
    y = np.full(len(X), label)
    return X, y

X_11, y_11 = create_X_y(distribution_11, label=0)  # low density
X_17, y_17 = create_X_y(distribution_21, label=1)  # high density

# Combine training data
X_all = np.vstack([X_11, X_17])
y_all = np.concatenate([y_11, y_17])

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2,random_state=42,stratify=y_all)

In [4]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(max_iter=1000)
}

In [5]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Logistic Regression Accuracy: 0.83
SVM Accuracy: 0.86
KNN Accuracy: 0.82
Random Forest Accuracy: 0.86
XGBoost Accuracy: 0.85
Naive Bayes Accuracy: 0.72
MLP Accuracy: 0.86


In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    accuracies = []

    for train_idx, val_idx in skf.split(X_all, y_all):
        X_train_k, X_test_k = X_all[train_idx], X_all[val_idx]
        y_train_k, y_test_k = y_all[train_idx], y_all[val_idx]

        model.fit(X_train_k, y_train_k)
        y_pred_k = model.predict(X_test_k)
        acc = accuracy_score(y_test_k, y_pred_k)
        accuracies.append(acc)

    print(f"{name} Stratified CV Accuracy: {np.mean(accuracies):.2f} ± {np.std(accuracies):.2f}")

Logistic Regression Stratified CV Accuracy: 0.85 ± 0.02
SVM Stratified CV Accuracy: 0.87 ± 0.01
KNN Stratified CV Accuracy: 0.77 ± 0.01
Random Forest Stratified CV Accuracy: 0.88 ± 0.02
XGBoost Stratified CV Accuracy: 0.87 ± 0.01
Naive Bayes Stratified CV Accuracy: 0.75 ± 0.02
MLP Stratified CV Accuracy: 0.87 ± 0.01


In [24]:
# ==========================
# ASSUME X_all, y_all ARE ALREADY DEFINED
# ==========================
# Stratified 5-fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ==========================
# DEFINE MODELS AND THEIR PARAM GRIDS
# ==========================
tuning_dict = {
    "XGBoost": {
        "estimator": XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42),
        "param_grid": {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0]
        }
    },
    "MLP": {
        "estimator": MLPClassifier(max_iter=1000, early_stopping=True, random_state=42),
        "param_grid": {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate_init': [0.001, 0.01]
        }
    },
    "Random Forest": {
        "estimator": RandomForestClassifier(random_state=42, n_jobs=-1),
        "param_grid": {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'max_features': ['sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        }
    }
}

# ==========================
# RUN GRID SEARCH FOR EACH MODEL
# ==========================
results = {}

for name, cfg in tuning_dict.items():
    print(f"\n=== Tuning {name} ===")

    model = cfg["estimator"]
    grid = GridSearchCV(
        estimator=model,
        param_grid=cfg["param_grid"],
        cv=skf,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_all, y_all)

    best_params = grid.best_params_
    best_score = grid.best_score_
    best_est = grid.best_estimator_

    print(f"Best {name} Params: {best_params}")
    print(f"Best {name} CV Accuracy: {best_score:.4f}")

    # Optional independent stratified CV to confirm
    cv_accs = []
    for train_idx, val_idx in skf.split(X_all, y_all):
        X_train, X_val = X_all[train_idx], X_all[val_idx]
        y_train, y_val = y_all[train_idx], y_all[val_idx]

        # Fit a fresh clone to avoid any side effects
        best_clone = best_est.__class__(**best_params, random_state=42)
        best_clone.fit(X_train, y_train)
        preds = best_clone.predict(X_val)
        cv_accs.append(accuracy_score(y_val, preds))

    mean_acc, std_acc = np.mean(cv_accs), np.std(cv_accs)
    print(f"Independent Stratified CV: {mean_acc:.4f} ± {std_acc:.4f}")

    # Store results
    results[name] = {
        "best_params": best_params,
        "grid_cv_score": best_score,
        "independent_cv": (mean_acc, std_acc)
    }

# ==========================
# ALL RESULTS STORED IN 'results' DICT
# ==========================



=== Tuning XGBoost ===


Parameters: { "use_label_encoder" } are not used.



Best XGBoost Params: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best XGBoost CV Accuracy: 0.8730
Independent Stratified CV: 0.8730 ± 0.0181

=== Tuning MLP ===
Best MLP Params: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.01}
Best MLP CV Accuracy: 0.8755
Independent Stratified CV: 0.8735 ± 0.0215

=== Tuning Random Forest ===
Best Random Forest Params: {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest CV Accuracy: 0.8770
Independent Stratified CV: 0.8770 ± 0.0170


In [26]:
import pandas as pd
import numpy as np

n = 200  # number of samples to generate
results = []

models = {
    "Random Forest": best_rf_model,
    "XGBoost": best_xgb_model,
    "MLP": best_mlp_model
}

ones_count = {name: 0 for name in models}

for i in range(n):
    # Step 1: Generate new positions and one cut
    positions = generate_circles(21, 0.08)
    vec = generate_distribution(positions, 0.08, 1, NUM_BINS)

    sample_vector = np.array(vec, dtype=np.float32).reshape(1, -1)

    # Step 3: Run model predictions
    row = {"Sample #": i + 1}
    for name, model in models.items():
        try:
            pred = model.predict(sample_vector)
            row[name] = pred[0]
            if pred[0] == 1:
                ones_count[name] += 1
        except Exception as e:
            row[name] = f"Error: {str(e)}"

    results.append(row)

# Step 4: Convert to DataFrame and show
df = pd.DataFrame(results)

# Step 5: Append ones count as summary row
summary_row = {"Sample #": "Total 1s"}
summary_row.update({name: (ones_count[name]/n) for name in models})
df = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)

print(df)


     Sample #  Random Forest  XGBoost    MLP
0           1           0.00     0.00  0.000
1           2           1.00     1.00  1.000
2           3           1.00     1.00  1.000
3           4           1.00     1.00  1.000
4           5           1.00     1.00  1.000
..        ...            ...      ...    ...
196       197           0.00     0.00  0.000
197       198           1.00     1.00  1.000
198       199           1.00     1.00  1.000
199       200           1.00     1.00  1.000
200  Total 1s           0.78     0.77  0.795

[201 rows x 4 columns]


In [25]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# 1) Define StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2) Combine both models and their parameter distributions into one dict
search_space = [
    {
        'model': [RandomForestClassifier(random_state=42)],
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__max_features': ['sqrt', 'log2'],
        'model__class_weight': [None, 'balanced']
    },
    {
        'model': [XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)],
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [3, 5, 7],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__subsample': [0.8, 1.0]
    }
]

# 3) Wrap models in a simple Pipeline (even if no preprocessing)
from sklearn.pipeline import Pipeline
pipe = Pipeline([('model', RandomForestClassifier())])

# 4) Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=search_space,
    n_iter=20,                      # limit to 20 random combos (instead of full grid)
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# 5) Run the search
random_search.fit(X_all, y_all)

# 6) Extract best results
best_model = random_search.best_estimator_.named_steps['model']
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Model:", best_model.__class__.__name__)
print("Best Params:", best_params)
print(f"Best Stratified CV Accuracy: {best_score:.4f}")

# 7) Final independent Stratified CV check
cv_accs = []
for train_idx, val_idx in skf.split(X_all, y_all):
    X_train, X_val = X_all[train_idx], X_all[val_idx]
    y_train, y_val = y_all[train_idx], y_all[val_idx]
    clone = best_model.__class__(**{k.replace('model__',''): v for k,v in best_params.items() if k.startswith('model__')})
    clone.fit(X_train, y_train)
    preds = clone.predict(X_val)
    cv_accs.append(accuracy_score(y_val, preds))

mean_acc, std_acc = np.mean(cv_accs), np.std(cv_accs)
print(f"Independent CV Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Model: RandomForestClassifier
Best Params: {'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__max_features': 'log2', 'model__max_depth': 20, 'model__class_weight': 'balanced', 'model': RandomForestClassifier(random_state=42)}
Best Stratified CV Accuracy: 0.8770
Independent CV Accuracy: 0.8760 ± 0.0176
