In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

train_features = pd.read_csv("./train_NN.csv")
train_labels = pd.read_csv("./train_labels_NN.csv")
test_features = pd.read_csv("test_NN.csv")

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels)

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from kerastuner.tuners import RandomSearch
from sklearn.utils.validation import check_is_fitted, check_array
from sklearn.metrics import accuracy_score

# Custom RBF activation function
def rbf_activation(x):
    return tf.math.exp(-1.0 * tf.square(x))

class SimpleNeuralNetwork:
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model = None  # Initialize model attribute

    def build_model(self, hp):
        model = Sequential()

        # Add the specified number of hidden layers with units and activations
        for i in range(hp.get('num_hidden_layers')):
            model.add(Dense(
                units=hp.get(f'units_{i}'),
                activation=hp.get(f'activation_{i}'),
                kernel_initializer=hp.get(f'kernel_initializer_{i}'),
                kernel_regularizer=hp.get(f'kernel_regularizer_{i}'),
                input_dim=self.input_dim if i == 0 else None
            ))

            if hp.get(f'batch_normalization_{i}'):
                model.add(tf.keras.layers.BatchNormalization())

            model.add(tf.keras.layers.Dropout(hp.get(f'dropout_{i}')))
                    
        model.add(Dense(units=self.output_dim, activation='sigmoid'))

        # Compile the model with the specified learning rate
        model.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.get(hp.get('optimizer')),
            metrics=['accuracy']
        )

        self.model = model  # Store the model in the attribute
        return model

    def tune_hyperparameters(self, X_train, y_train, epochs, batch_size, validation_data=None):
        tuner = RandomSearch(
            self.build_model,
            objective='val_accuracy',
            max_trials=10,  # Number of hyperparameter combinations to try
            directory='keras_tuner_dir',  # Directory to store the results
            project_name='simple_neural_network'
        )

        tuner.search(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

        # Get the best model and its hyperparameters
        best_model = tuner.get_best_models(num_models=1)[0]
        best_hyperparameters = tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values

        self.model = best_model
        return best_hyperparameters

    
        
    # Other methods remain unchanged
    # def train(self, X_train, y_train, epochs, batch_size, validation_data=None):
    #     class_weights = {0:8, 1:3}  # Adjust class weights based on your data distribution
    #     self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data, class_weight=class_weights)
    # def train(self, X_train, y_train, epochs, batch_size, validation_data=None):
    #     self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

    # def evaluate(self, X_test, y_test):
    #     loss, accuracy = self.model.evaluate(X_test, y_test)
    #     print(f'Test Loss: {loss:.4f}')
    #     print(f'Test Accuracy: {accuracy:.4f}')

    # def predict(self, X_data, threshold=0.5):
    #     predictions_proba = self.model.predict(X_data)
    #     predictions = (predictions_proba > threshold).astype(int)
    #     return predictions
    
    def fit(self, X, y, epochs, batch_size, validation_data=None):
        # ... Your existing model training code ...
        self.model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

    def predict(self, X):
        check_is_fitted(self, 'model')
        X = check_array(X)
        predictions_proba = self.model.predict(X)
        predictions = (predictions_proba > 0.5).astype(int)  # Assuming binary classification
        return predictions

    def evaluate(self, X, y):
        check_is_fitted(self, 'model')
        X = check_array(X)
        y = check_array(y)
        loss, accuracy = self.model.evaluate(X, y)
        print(f'Test Loss: {loss:.4f}')
        print(f'Test Accuracy: {accuracy:.4f}')

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

# Usage
input_dim = X_train.shape[1]
output_dim = 1

# Instantiate SimpleNeuralNetwork with input_dim and output_dim
simple_nn = SimpleNeuralNetwork(input_dim, output_dim)

# Get the hyperparameters dictionary
hyperparameters = {'num_hidden_layers': 2,
                   'units_0': 128,
                   'activation_0': 'relu',
                   'kernel_initializer_0': 'he_normal',
                   'kernel_regularizer_0': None,
                   'batch_normalization_0': True,
                   'dropout_0': 0.3,
                   'units_1': 64,
                   'activation_1': 'relu',
                   'kernel_initializer_1': 'glorot_uniform',
                   'kernel_regularizer_1': 'l2',
                   'batch_normalization_1': False,
                   'dropout_1': 0.2,
                   'optimizer': 'adam',
                   'learning_rate': 0.0001}

# Build and train the model using the provided hyperparameters
model = simple_nn.build_model(hyperparameters)
simple_nn.fit(train_features, train_labels, epochs=10, batch_size=8192)


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from kerastuner.tuners import RandomSearch
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import accuracy_score

class SimpleNeuralNetwork(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model = self.build_model()  # Initialize model attribute

    def build_model(self):
        model = Sequential()
        model.add(Dense(units=128, activation='relu', input_dim=self.input_dim))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Dense(units=64, activation='relu'))
        model.add(Dense(units=self.output_dim, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def fit(self, X, y, epochs=10, batch_size=32, validation_data=None):
        X, y = check_X_y(X, y)
        self.model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_data=validation_data)

    def predict(self, X):
        check_is_fitted(self, 'model')
        X = check_array(X)
        predictions_proba = self.model.predict(X)
        predictions = (predictions_proba[:, 1] > 0.65).astype(int)
        print(predictions.shape())
        return predictions
    
    def predict_proba(self, X):
        check_is_fitted(self, 'model')
        X = check_array(X)
        return self.model.predict(X)

    def evaluate(self, X, y):
        check_is_fitted(self, 'model')
        X = check_array(X)
        y = check_array(y)
        loss, accuracy = self.model.evaluate(X, y)
        print(f'Test Loss: {loss:.4f}')
        print(f'Test Accuracy: {accuracy:.4f}')

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

# Usage example
input_dim = X_train.shape[1]
output_dim = 1
simple_nn = SimpleNeuralNetwork(input_dim, output_dim)
simple_nn.fit(train_features, train_labels, epochs=3, batch_size=1024, validation_data=(X_val, y_val))


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier

# # Create a synthetic dataset for demonstration purposes
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_classes=2, random_state=42)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define individual models
# random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest = RandomForestClassifier(
    n_estimators = 500,
    min_samples_split =2,
    max_features = 'log2',
    class_weight = 'balanced',
    random_state=42
)

# xgboost_model = XGBClassifier(random_state=42)

xgboost_model = XGBClassifier(
    n_estimators = 800,
    max_depth = 13,
    learning_rate = 0.08,
    gamma = 0.5,
    reg_lambda = 10,
    min_child_weight = 7,
    # objective = "reg:squaredlogerror"
    # colsample_bytree = 0.45
    # eval_metric = "logloss"
    scale_pos_weight = 1,
    random_state=42
)

lgbm = LGBMClassifier(
    boosting_type= 'gbdt',        # Boosting type: 'gbdt' (Gradient Boosting Decision Tree)
    num_leaves= 80,               # Maximum number of leaves in one tree
    max_depth= 50,                # Maximum depth of tree nodes (set to -1 for unlimited depth)
    learning_rate= 0.1,           # Step size shrinkage to prevent overfitting
    n_estimators= 700,            # Number of boosting rounds (trees to build)
    subsample_for_bin= 2000,      # Number of samples for constructing bins
    objective= 'binary',          # Objective function: 'binary' for binary classification
    metric= 'binary_logloss',     # Evaluation metric: 'binary_logloss' for binary classification
    colsample_bytree= 0.8,        # Fraction of features to be used for each boosting round
    reg_alpha= 0.3,               # L1 regularization term on weights
    reg_lambda= 0.3,              # L2 regularization term on weights
    min_split_gain= 0.0,          # Minimum loss reduction required to make a further partition
    min_child_samples= 20,        # Minimum number of data needed in a child (leaf)
    subsample= 1.0,               # Fraction of samples used for training (set to 1.0 for no subsampling)
    random_state= 42              # Seed for random number generation
)


# neural_network = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

neural_network = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    max_iter=200,
    random_state=42
)
# neural_network = simple_nn

# Define the ensemble model using a VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('lgbm', lgbm),
    ('random_forest', random_forest),
    ('xgboost', xgboost_model),
    ('neural_network', neural_network)
], voting='soft')  # 'soft' for weighted voting based on probabilities



In [23]:
# # Train the ensemble model
# ensemble_model.fit(X_train, y_train)

# # Make predictions
# y_pred = ensemble_model.predict(X_val)

# # Evaluate the ensemble model
# accuracy = accuracy_score(y_val, y_pred)
# print(f"Ensemble Model Accuracy: {accuracy:.4f}")

In [24]:
ensemble_model.fit(train_features, train_labels)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 88304, number of negative: 88304
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5893
[LightGBM] [Info] Number of data points in the train set: 176608, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [25]:
# Make predictions (class probabilities)
ensemble_predictions_proba = ensemble_model.predict_proba(test_features)
ensemble_predictions_proba = pd.DataFrame(ensemble_predictions_proba)
ensemble_predictions_proba[1].head()
# print(ensemble_predictions_proba.shape())

# Threshold the probabilities to obtain binary predictions
# ensemble_predictions = (ensemble_predictions_proba[:, 1] > 0.5).astype(int)

0    0.800564
1    0.058568
2    0.123681
3    0.533747
4    0.063443
Name: 1, dtype: float64

In [26]:
# predictions = pd.DataFrame(predictions)
# predictions.head()

# predictions[0] = predictions[0].astype('int32')
predictions = ensemble_predictions_proba[1]
predictions = (predictions > 0.4085).astype(int)
predictions.head()

0    1
1    0
2    0
3    1
4    0
Name: 1, dtype: int32

In [27]:
# writing predictions to submission.csv
fin_sub = pd.read_csv("./submission_NN.csv")
fin_sub["Task"] = predictions
fin_sub.head()

Unnamed: 0,ID,Task
0,100721,1
1,30234,0
2,28624,0
3,31173,1
4,573,0


In [28]:
fin_sub.to_csv('./submission_ensemble.csv', encoding='utf-8', index=False)