In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Load the dataset.
# ``sep=r'\s+'`` is the supported replacement for the deprecated
# ``delim_whitespace=True`` flag and splits on the same runs of whitespace.
data = pd.read_csv(r'C:\Users\Shakshi Singh\Documents\ml project\secom.data', sep=r'\s+', header=None)
labels = pd.read_csv(r'C:\Users\Shakshi Singh\Documents\ml project\secom_labels.data', sep=r'\s+', header=None)

# Check for null values
print("Null values in dataset:", data.isnull().sum().sum())

# Separate features and target variable
X = data.values
y = labels.iloc[:, 0].replace({-1: 0, 1: 1}).values  # Convert -1 to 0 for binary classification

# Split the dataset into training and testing sets BEFORE learning any
# statistics from it.  Fitting the imputer on the full matrix would let the
# test rows influence the column means used to fill the training rows
# (data leakage).  ``stratify=y`` keeps the class ratio the same in both
# partitions, which matters when one class is rare.
# NOTE(review): presumably the positive class is a small minority here -- confirm.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values (NaN) with column means learned from the training
# rows only, then apply those same means to the held-out rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Fully imputed matrix, kept under its original name in case later code
# refers to it; it is not used for training or evaluation below.
X_imputed = imputer.transform(X)

# Standardize the features (statistics fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection using ANOVA F-test
selector = SelectKBest(score_func=f_classif, k=40)  # Select top 40 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Define the model creation function with Batch Normalization and L2 regularization
def create_model(input_dim=None):
    """Build and compile the binary-classification network.

    The network is three ReLU dense layers (512, 256, 128 units), each with
    an L2 kernel penalty of 0.001 and followed by batch normalization.
    Dropout of 0.5 follows the first two blocks, and a single sigmoid unit
    produces the output.  The model is compiled with Adam
    (learning rate 0.0001), binary cross-entropy loss and the accuracy metric.

    Args:
        input_dim: Number of input features.  When ``None`` (the default) the
            width of the module-level ``X_train_selected`` array is used, so
            existing zero-argument callers behave exactly as before.

    Returns:
        A compiled ``keras.Sequential`` model.
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the selected
        # training matrix defined at module level.
        input_dim = X_train_selected.shape[1]

    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define the custom Keras classifier to use with GridSearchCV
class KerasClassifierCustom(ClassifierMixin, BaseEstimator):
    """Minimal scikit-learn wrapper around a Keras binary classifier.

    The mixin is listed before ``BaseEstimator`` so that the classifier
    behaviour contributed by ``ClassifierMixin`` takes precedence in the
    method resolution order; with the bases reversed, recent scikit-learn
    releases do not recognise the estimator as a classifier.

    Parameters:
        build_fn: Zero-argument callable returning a compiled Keras model
            whose second ``evaluate`` output is accuracy.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        verbose: Keras verbosity level used for training and prediction.
    """

    def __init__(self, build_fn=None, epochs=50, batch_size=32, verbose=0):
        self.build_fn = build_fn
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.model = None  # populated by fit()

    def fit(self, X, y):
        """Build a fresh model and train it with early stopping.

        20% of ``X`` is held out by Keras as validation data; training stops
        once ``val_loss`` has not improved for 10 epochs.

        Returns:
            self, as required by the scikit-learn estimator protocol.

        Raises:
            TypeError: if no ``build_fn`` was supplied.
        """
        if self.build_fn is None:
            raise TypeError("build_fn must be a callable returning a compiled Keras model")

        # Without restore_best_weights the model keeps the weights from the
        # last epoch, i.e. `patience` epochs past the best validation loss.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        # Conventional fitted attribute for scikit-learn classifiers.
        # NOTE(review): predict() emits 0/1, so labels are assumed to be 0/1.
        self.classes_ = np.unique(y)

        self.model = self.build_fn()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose, validation_split=0.2, callbacks=[early_stopping])
        return self

    def _require_fitted(self):
        """Raise a clear error when the estimator is used before ``fit``."""
        if self.model is None:
            # NotFittedError derives from both ValueError and AttributeError,
            # so callers that caught the old AttributeError still work.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This KerasClassifierCustom instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )

    def predict(self, X):
        """Return hard 0/1 predictions as a 1-D integer array.

        Probabilities strictly greater than 0.5 map to class 1.  The result
        is flattened because the sigmoid output has one column per sample,
        while scikit-learn expects labels of shape ``(n_samples,)``.
        """
        self._require_fitted()
        probabilities = self.model.predict(X, verbose=self.verbose)
        return (probabilities > 0.5).astype(int).ravel()

    def score(self, X, y):
        """Return the accuracy reported by ``model.evaluate`` on ``(X, y)``."""
        self._require_fitted()
        return self.model.evaluate(X, y, verbose=0)[1]  # Returns accuracy

# Cross-validation with GridSearchCV for hyperparameter tuning
param_grid = {
    'epochs': [50, 100],
    'batch_size': [32, 64],
    'verbose': [0],
}

# Use explicit stratified folds.  An integer ``cv`` only stratifies when
# scikit-learn recognises the estimator as a classifier; spelling the splitter
# out guarantees every fold keeps the class ratio regardless of that detection.
grid_search = GridSearchCV(
    estimator=KerasClassifierCustom(build_fn=create_model),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=3),
)
grid_search.fit(X_train_selected, y_train)

print(f"Best parameters: {grid_search.best_params_}")
model = grid_search.best_estimator_

# Make predictions and evaluate.
# Flatten to shape (n_samples,): the network emits one column per sample and
# the metric functions expect a 1-D label vector.
y_pred = np.ravel(model.predict(X_test_selected))

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Cross-validation score (stratified for the same reason as the grid search).
# cross_val_score clones `model`, so each fold trains a fresh network with the
# selected hyperparameters.
cv_scores = cross_val_score(model, X_train_selected, y_train, cv=StratifiedKFold(n_splits=10))
print(f'10-fold CV Average Accuracy: {np.mean(cv_scores):.4f}')
