# Baseline Model

## Table of Contents
1. [Model Choice](#model-choice)
2. [Feature Selection](#feature-selection)
3. [Implementation](#implementation)
4. [Evaluation](#evaluation)


In [None]:
# Import necessary libraries
import pandas as pd
import os
import tensorflow as tf
import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil
from sklearn.model_selection import train_test_split
import sklearn.utils
import tensorflow as tf
from tensorflow.keras import backend as K

import matplotlib.pyplot as plt


## Model Choice
For this binary image classification task (distinguishing between the **HP (benign polyp)** and **SSA (malignant adenoma)** classes), we considered and experimented with a custom Convolutional Neural Network (CNN): a basic architecture with three convolutional blocks followed by dense layers.

**Why this model:**

* Establishes a baseline performance.
* Gives full control over architecture and training.
* Lightweight and suitable for quick iteration and debugging.

**Limitations:**

* Less expressive power compared to modern deep networks.
* Prone to overfitting on small datasets.




## Feature Selection

**Image Preprocessing:**
* Rescaling: All pixel values were normalized to the [0, 1] range using `rescale=1./255` to aid model convergence and numerical stability.
* Standardized Input Shape: All images were resized to 224×224 pixels to match the input requirements of models like ResNet50 and DenseNet121.

**Data Augmentation:** To improve generalization and simulate variability in real-world data, the following augmentations were applied using ImageDataGenerator:
* Horizontal Flip: Introduces invariance to image mirroring.
* Rotation (±15 degrees): Helps the model become robust to slight rotations.
* Zooming (±10%): Simulates scale variation and partial views.

These augmentations act as a form of implicit feature engineering, improving the diversity of the training set without increasing its size.

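**Custom Loss Function (Focal Loss):** Standard binary cross-entropy was replaced with focal loss. Its `alpha` parameter weights the two classes, and its `gamma` parameter down-weights examples the model already classifies confidently, which helps tackle potential class imbalance.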


In [None]:
# Locations of the label spreadsheet, the raw images and the re-organised dataset
excel_path = "C:/Users/user/Desktop/Tensor-FLow Project/Filik.xlsx"
image_src_dir = "C:/Users/user/Desktop/Tensor-FLow Project/images"  # folder holding all the .png files
target_base_dir = "C:/Users/user/Desktop/Tensor-FLow Project/images_by_class"  # new per-class layout

# Load the annotation sheet and give its three columns fixed names
df = pd.read_excel(excel_path)
df.columns = ['filename', 'label_str', 'partition']

# Copy every listed image into <target_base_dir>/<partition>/<label>/
# (label is HP or SSA, partition is train or test)
for fname, label, part in zip(df['filename'], df['label_str'], df['partition']):
    src_path = os.path.join(image_src_dir, fname)
    dst_dir = os.path.join(target_base_dir, part, label)

    # The destination folder is created even when the source image is missing
    os.makedirs(dst_dir, exist_ok=True)

    if not os.path.exists(src_path):
        print(f"⚠️ File not found: {src_path}")
        continue

    shutil.copy(src_path, os.path.join(dst_dir, fname))


In [None]:
# Folder holding the "test" partition, with one sub-folder per class
test_dir = "C:/Users/user/Desktop/Tensor-FLow Project/images_by_class/test"

# One record per image file: its full path plus the class folder it sits in
data = [
    {'filename': os.path.join(test_dir, label, fname), 'class': label}
    for label in ['HP', 'SSA']
    for fname in os.listdir(os.path.join(test_dir, label))
]

df_test = pd.DataFrame(data)

# Stratified split of the test partition: 80% validation, 20% final test.
# random_state is fixed so the split is reproducible.
df_val, df_final_test = train_test_split(
    df_test,
    test_size=0.2,
    stratify=df_test['class'],
    random_state=42,
)

In [None]:
# Training-time generator: rescale pixels to [0, 1] and apply random
# augmentation (horizontal flip, rotation up to 15 degrees, zoom up to 10%).
datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    rotation_range=15,
    zoom_range=0.1
)

# Evaluation-time generator: rescale only. Validation and test images must not
# be randomly augmented, otherwise val_loss / val_auc change from one pass to
# the next and early stopping, checkpointing and tuning decisions become noisy.
eval_datagen = ImageDataGenerator(rescale=1./255)

# Training batches come straight from the per-class folder structure.
train_gen = datagen.flow_from_directory(
    "C:/Users/user/Desktop/Tensor-FLow Project/images_by_class/train",
    target_size=(224, 224),
    class_mode='binary',
    batch_size=16,
    shuffle=True
)

# Validation batches: fixed order (shuffle=False), no augmentation.
val_gen = eval_datagen.flow_from_dataframe(
    df_val,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    class_mode='binary',
    batch_size=16,
    shuffle=False
)

# Final-test batches: fixed order (shuffle=False), no augmentation.
test_gen = eval_datagen.flow_from_dataframe(
    df_final_test,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    class_mode='binary',
    batch_size=16,
    shuffle=False
)

## Implementation






#### Random Search

In [None]:
!pip install keras-tuner

In [None]:
from keras_tuner import RandomSearch
import keras_tuner as kt

def focal_loss(gamma=2.0, alpha=0.75):
    """Build a binary focal-loss function for use with ``model.compile``.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``; larger values shrink the
            contribution of samples the model already predicts well.
        alpha: Weight given to samples whose label equals 1; samples with any
            other label receive ``1 - alpha``.

    Returns:
        A function ``loss(y_true, y_pred)`` that returns the element-wise
        (unreduced) focal loss.
    """
    def loss(y_true, y_pred):
        is_positive = tf.equal(y_true, 1)
        # Keep probabilities away from 0 and 1 so the log below stays finite
        probs = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        # p_t: probability the model assigns to the true class
        p_t = tf.where(is_positive, probs, 1 - probs)
        class_weight = tf.where(is_positive, alpha, 1 - alpha)
        focusing = K.pow(1. - p_t, gamma)
        return -class_weight * focusing * K.log(p_t)
    return loss


def build_model(hp):
    """Build and compile the tunable three-block CNN for KerasTuner.

    Args:
        hp: KerasTuner hyperparameter container used to sample the filter
            counts, dropout rates, dense width, learning rate and the focal
            loss ``gamma`` / ``alpha``.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit.
    """
    # (min, max, step) of the filter search range for each convolutional block
    filter_ranges = [(32, 128, 32), (64, 256, 64), (128, 512, 128)]

    stack = [layers.Input(shape=(224, 224, 3))]

    # Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout, repeated per block
    for block_no, (low, high, step) in enumerate(filter_ranges, start=1):
        n_filters = hp.Int(f'conv{block_no}_filters', low, high, step=step)
        stack.append(layers.Conv2D(n_filters, (3, 3), padding='same'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.ReLU())
        stack.append(layers.MaxPooling2D())
        drop_rate = hp.Float(f'dropout{block_no}', 0.2, 0.5, step=0.1)
        stack.append(layers.Dropout(drop_rate))

    # Classifier head: Flatten -> Dense -> BatchNorm -> ReLU -> Dropout -> sigmoid
    stack.append(layers.Flatten())
    stack.append(layers.Dense(hp.Int('dense_units', 128, 512, step=64)))
    stack.append(layers.BatchNormalization())
    stack.append(layers.ReLU())
    stack.append(layers.Dropout(hp.Float('dropout4', 0.2, 0.5, step=0.1)))
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = models.Sequential(stack)

    learning_rate = hp.Choice('lr', [1e-5])
    gamma = hp.Choice('gamma', [1.5, 2.0])
    alpha = hp.Choice('alpha', [0.5, 0.75])

    tracked_metrics = [
        'accuracy',
        metrics.AUC(name='auc'),
        metrics.Recall(name='tpr'),
        metrics.FalsePositives(name='fp'),
        metrics.TruePositives(name='tp'),
    ]

    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=focal_loss(gamma=gamma, alpha=alpha),
        metrics=tracked_metrics,
    )
    return model

# Random search over the hyperparameter space defined in build_model.
# The objective is given as an explicit kt.Objective with direction='max':
# 'val_auc' is a custom metric name (from metrics.AUC(name='auc')), and passing
# it as a bare string relies on KerasTuner inferring the optimisation
# direction, which can fail for such names depending on the installed version.
tuner = RandomSearch(
    build_model,
    objective=kt.Objective('val_auc', direction='max'),
    max_trials=8,
    executions_per_trial=1,
    directory='cnn_tuning',
    project_name='base_model_randomsearch'
)

# Stop a trial when val_loss has not improved by at least 0.01 for 3 epochs
# and roll back to the best weights seen in that trial.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.01,
    restore_best_weights=True,
    verbose=1
)

# Each trial trains for at most 15 epochs.
tuner.search(train_gen, validation_data=val_gen, epochs=15, callbacks=[early_stop])

In [None]:
# Retrieve the top-ranked model and its hyperparameters from the finished search
best_model, = tuner.get_best_models(num_models=1)
best_hp, = tuner.get_best_hyperparameters(1)
print("Best hyperparameters:", best_hp.values, sep="\n")

In [None]:
import pandas as pd

# Get all trials
trials = tuner.oracle.trials.values()

# Collect hyperparameters and metrics
trial_data = []

for trial in trials:
    data = trial.hyperparameters.values.copy()  # Hyperparameters
    data['trial_id'] = trial.trial_id

    # Get metrics
    for metric_name, metric_history in trial.metrics.metrics.items():
        # Get the list of all metric's observations
        observations = metric_history.get_history()
        if observations:
            # Take last metric value
            last_value = observations[-1].value
            data[metric_name] = last_value
        else:
            data[metric_name] = None  # if we don't have any observation

    trial_data.append(data)

# Convert to DataFrame
df_trials = pd.DataFrame(trial_data)

# Save
df_trials.to_excel('C:/Users/user/Downloads/cnn_randomsearch_results1.xlsx', index=False)


df_trials.head(8)

#### Best trial

In [None]:
# Fixed architecture for the final training run: three convolutional blocks
# followed by a dense classifier head.
# (filters, dropout rate) for each convolutional block, in order
conv_stages = [(128, 0.2), (64, 0.4), (512, 0.3)]

stack = [layers.Input(shape=(224, 224, 3))]

# Conv -> BatchNorm -> ReLU -> MaxPool -> Dropout, repeated per block
for n_filters, drop_rate in conv_stages:
    stack.extend([
        layers.Conv2D(n_filters, (3, 3), padding='same'),
        layers.BatchNormalization(),
        layers.ReLU(),
        layers.MaxPooling2D(),
        layers.Dropout(drop_rate),
    ])

# Classifier head
stack.extend([
    layers.Flatten(),
    layers.Dense(448),
    layers.BatchNormalization(),
    layers.ReLU(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),  # output
])

model = models.Sequential(stack)

In [None]:
def focal_loss(gamma=2.0, alpha=0.75):
    """Build a binary focal-loss function for use with ``model.compile``.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``; larger values shrink the
            contribution of samples the model already predicts well.
        alpha: Weight given to samples whose label equals 1; samples with any
            other label receive ``1 - alpha``.

    Returns:
        A function ``loss(y_true, y_pred)`` that returns the element-wise
        (unreduced) focal loss.
    """
    def loss(y_true, y_pred):
        is_positive = tf.equal(y_true, 1)
        # Keep probabilities away from 0 and 1 so the log below stays finite
        probs = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        # p_t: probability the model assigns to the true class
        p_t = tf.where(is_positive, probs, 1 - probs)
        class_weight = tf.where(is_positive, alpha, 1 - alpha)
        focusing = K.pow(1. - p_t, gamma)
        return -class_weight * focusing * K.log(p_t)
    return loss


# Adam optimiser with a fixed learning rate for the final training run
optimizer = optimizers.Adam(learning_rate=1e-4)

# Metrics reported during training and validation
tracked_metrics = [
    'accuracy',
    metrics.AUC(name='auc'),
    metrics.Recall(name='tpr'),  # TPR = Recall
    metrics.FalsePositives(name='fp'),
    metrics.TruePositives(name='tp'),
]

model.compile(
    optimizer=optimizer,
    loss=focal_loss(gamma=1.5, alpha=0.5),
    metrics=tracked_metrics,
)

In [None]:
# ModelCheckpoint is not among the notebook's top-level imports, so it is
# imported here; without it this cell fails with a NameError.
from tensorflow.keras.callbacks import ModelCheckpoint

# Early-stopping rule on val_loss (patience 3, min_delta 0.03, best weights restored).
# NOTE(review): early_stop is defined but not passed to model.fit below, so
# training always runs the full 15 epochs - confirm whether that is intended.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.03,
    restore_best_weights=True,
    verbose=1
)

# Save the full model (not just the weights) each time val_loss improves.
checkpoint = ModelCheckpoint(
    'AfterSearchCNNbest_model.keras',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

# Train for 15 epochs, validating on val_gen after each epoch.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=15,
    callbacks=[checkpoint]
)

In [None]:
from tensorflow.keras.models import load_model

# Reload the checkpoint with the best val_loss. The custom loss has to be
# supplied under the name of the inner function ('loss'). It is rebuilt with
# the same gamma/alpha the model was compiled with (gamma=1.5, alpha=0.5), so
# that losses computed by the reloaded model are comparable to the training run.
BestCNNmodel = load_model('AfterSearchCNNbest_model.keras', custom_objects={'loss': focal_loss(gamma=1.5, alpha=0.5)})

In [None]:
# Quick look at training vs. validation loss per epoch
for history_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    plt.plot(history.history[history_key], label=curve_label)

plt.legend()
plt.title("Loss over epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")

# Save before show() so the written file contains the drawn figure
plt.savefig('C:/Users/user/Desktop/Tensor-FLow Project/Plots/CNN/Best_loss_plot.png', dpi=300)
plt.show()

## Evaluation

We use loss, accuracy and AUC as our primary evaluation metrics. Accuracy provides a general performance measure, while AUC gives insight into the model’s discriminatory power. Additionally, confusion matrices are used for detailed error analysis.


In [None]:
# Presentation-quality loss plot (transparent background)
plt.figure(figsize=(8, 5))

# (history key, legend label, colour, marker) for each curve
loss_curves = [
    ('loss', 'Train loss', '#f64ad6', 'o'),
    ('val_loss', 'Validation loss', '#A0D400', 's'),
]
for history_key, curve_label, curve_color, curve_marker in loss_curves:
    plt.plot(history.history[history_key], label=curve_label,
             color=curve_color, linewidth=2.5, marker=curve_marker)

# Titles and axis labels
axis_label_style = dict(fontsize=14, weight='bold')
plt.title("Model loss", fontsize=18, weight='bold')
plt.xlabel("Epoch", **axis_label_style)
plt.ylabel("Loss", **axis_label_style)

# Tick and legend text size, no grid, tight margins
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=14)
plt.grid(False)
plt.tight_layout()

plt.savefig('C:/Users/user/Desktop/Tensor-FLow Project/Plots/CNN/Forpres_Best_loss_plot.png', dpi=300, transparent=True)
plt.show()

In [None]:
# Presentation-quality accuracy plot (transparent background).
# This cell was previously a byte-identical copy of the loss plot and wrote to
# the same file; it now plots the 'accuracy' metric tracked at compile time
# and saves to its own file.
plt.figure(figsize=(8, 5))

# Plots
plt.plot(history.history['accuracy'], label='Train accuracy',
         color='#f64ad6', linewidth=2.5, marker='o')
plt.plot(history.history['val_accuracy'], label='Validation accuracy',
         color='#A0D400', linewidth=2.5, marker='s')

# Design
plt.title("Model accuracy", fontsize=18, weight='bold')
plt.xlabel("Epoch", fontsize=14, weight='bold')
plt.ylabel("Accuracy", fontsize=14, weight='bold')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=14)
plt.grid(False)
plt.tight_layout()
plt.savefig('C:/Users/user/Desktop/Tensor-FLow Project/Plots/CNN/Forpres_Best_accuracy_plot.png', dpi=300, transparent=True)
plt.show()

In [None]:
# Presentation-quality AUC plot (transparent background)
plt.figure(figsize=(8, 5))

# Shared line styling; each curve adds its own label, colour and marker
shared_line_style = {'linewidth': 2.5}
train_auc = history.history['auc']
val_auc = history.history['val_auc']
plt.plot(train_auc, label='Train AUC', color='#f64ad6', marker='o', **shared_line_style)
plt.plot(val_auc, label='validation AUC', color='#A0D400', marker='s', **shared_line_style)

# Titles and axis labels
plt.title("Model AUC", fontsize=18, weight='bold')
for set_axis_label, axis_text in ((plt.xlabel, "Epoch"), (plt.ylabel, "AUC")):
    set_axis_label(axis_text, fontsize=14, weight='bold')

# Tick and legend text size, no grid, tight margins
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=12)
plt.grid(False)
plt.tight_layout()

plt.savefig('C:/Users/user/Desktop/Tensor-FLow Project/Plots/CNN/Forpres_Best_AUC_plot.png', dpi=300, transparent=True)
plt.show()