# Load Dataset

In [12]:
import pandas as pd
import numpy as np
import random
from IPython.display import display, HTML

In [13]:
train_df=pd.read_csv('./Kaggle Data/train_df.csv')
test_df =pd.read_csv('./Kaggle Data/test_df.csv')
# Cek Directory jika tidak terbaca
#import os
#print("Current Working Directory:", os.getcwd())
display(train_df)

Unnamed: 0,income,credit_risk_score,customer_age,payment_type,employment_status,current_address_months_count,fraud_bool
0,0.8,295,40,AC,CA,73,1
1,0.6,97,40,AB,CA,136,1
2,0.2,58,30,AA,CD,70,1
3,0.1,224,40,AB,CA,131,1
4,0.6,268,50,AB,CA,149,1
...,...,...,...,...,...,...,...
745,0.8,141,20,AD,CA,30,0
746,0.6,95,40,AB,CA,245,0
747,0.1,196,20,AB,CA,8,0
748,0.2,61,30,AB,CB,383,0


In [14]:
numerical_features_train = train_df.select_dtypes(include=[np.number])
num_summary_stats_train = numerical_features_train.describe().T
categorical_features_train = train_df.select_dtypes(include=[object])
cat_summary_stats_train = categorical_features_train.describe().T
print("Numerical Features Summary Statistics:")
display(num_summary_stats_train)
print("Categorical Features Summary Statistics:")
display(cat_summary_stats_train)

Numerical Features Summary Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
income,750.0,0.6136,0.288269,0.1,0.4,0.7,0.9,0.9
credit_risk_score,750.0,152.924,81.260141,-110.0,95.0,148.5,205.0,357.0
customer_age,750.0,37.706667,12.838499,10.0,30.0,40.0,50.0,80.0
current_address_months_count,750.0,101.602667,92.003857,-1.0,33.0,71.0,150.0,388.0
fraud_bool,750.0,0.508,0.50027,0.0,0.0,1.0,1.0,1.0


Categorical Features Summary Statistics:


Unnamed: 0,count,unique,top,freq
payment_type,750,4,AB,279
employment_status,750,6,CA,572


# Data Cleaning

In [15]:
# Berdasarkan metadata yang diberikan -1 berarti missing value untuk "current_address_months_count", kita gunakan imputation median
median_current_address = train_df['current_address_months_count'][train_df['current_address_months_count'] != -1].median()
median_current_address

71.5

# Preprocessing

In [16]:
# Label Encoding
payment_type_map = {
    'AA': 0.353846,
    'AB': 0.485255,
    'AC': 0.614198,
    'AD': 0.472222
}
employment_status_map = {
    'CA': 0.526868,
    'CB': 0.333333,
    'CC': 0.728814,
    'CD': 0.266667,
    'CE': 0.333333,
    'CF': 0.181818
}

class DataPreprocessor:
    def __init__(self):
        self.payment_type_map = {}
        self.employment_status_map = {}
        self.means = {}
        self.std_devs = {}

    def fit(self, df):
        # Label Encoding
        self.payment_type_map = df['payment_type'].value_counts(normalize=True).to_dict()
        self.employment_status_map = df['employment_status'].value_counts(normalize=True).to_dict()

        # Hanya fitur numerik tertentu yang belum terscale akan di standardisasi
        for feature in ['credit_risk_score', 'customer_age', 'current_address_months_count']:
            self.means[feature] = df[feature].mean()
            self.std_devs[feature] = df[feature].std()

    def transform(self, df):
        preprocessed_df = df.copy()

        # Label Encoding
        preprocessed_df['payment_type'] = preprocessed_df['payment_type'].map(self.payment_type_map).astype(float)
        preprocessed_df['employment_status'] = preprocessed_df['employment_status'].map(self.employment_status_map).astype(float)

        # Impute Missing Value
        preprocessed_df['current_address_months_count'] = preprocessed_df['current_address_months_count'].replace(-1, 71)  # 71 merupakan nilai median

        # Standard Scaling
        for feature in ['credit_risk_score', 'customer_age', 'current_address_months_count']:
            preprocessed_df[feature] = (preprocessed_df[feature] - self.means[feature]) / self.std_devs[feature]
        return preprocessed_df

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

In [17]:
preprocessor = DataPreprocessor()
preprocessed_train_df = preprocessor.fit_transform(train_df)
preprocessed_test_df = preprocessor.transform(test_df)

In [18]:
X_train = preprocessed_train_df.drop(columns=["fraud_bool"]).values
y_train = preprocessed_train_df["fraud_bool"].values

X_test = preprocessed_test_df.drop(columns=["fraud_bool"]).values
y_test = preprocessed_test_df["fraud_bool"].values

# Modeling

In [19]:
class RBF:
    def __init__(self, spread, centroids, debug=False):
        self.spread = spread
        self.centroids = np.array(centroids) 
        self.debug = debug
        self.weights = None  
        self.bias = 1 

    def _basis_function(self, r):
        """Radial basis function: exp(-r^2 / (2 * spread^2))"""
        return np.exp(-r ** 2 / (2 * self.spread ** 2))

    def _format_matrix_html(self, matrix, title):
        """Format a matrix as an HTML table (for debugging purposes)."""
        table_html = f"<b>{title}:</b><br><table style='border:1px solid black;'>"
        for row in matrix:
            table_html += "<tr>"
            for val in row:
                table_html += f"<td style='border:1px solid black; padding:5px;'>{val:.4f}</td>"
            table_html += "</tr>"
        table_html += "</table><br>"
        return table_html

    def _calculate_outputs(self, X):
        """Calculate the outputs of the hidden layer (RBF layer)."""
        # Ensure the centroids match the input dimensionality
        assert X.shape[1] == self.centroids.shape[1], f"Input dimensionality ({X.shape[1]}) must match centroid dimensionality ({self.centroids.shape[1]})"
        
        # Calculate distances from each input point to each centroid
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        if self.debug:
            display(HTML(self._format_matrix_html(distances, "Distances")))
        
        # Apply the RBF function to the distances
        outputs = self._basis_function(distances)
        if self.debug:
            display(HTML(self._format_matrix_html(outputs, "RBF Outputs")))
        return outputs

    def fit(self, X, y):
        """Train the RBF network using the pseudo-inverse."""
        rbf_outputs = self._calculate_outputs(X)
        y = y.reshape(-1, 1)
        rbf_outputs = np.hstack((rbf_outputs, np.ones((rbf_outputs.shape[0], 1))))

        # Calculate weights using pseudo-inverse
        self.weights = np.linalg.pinv(rbf_outputs) @ y

        if self.debug:
            display(HTML(self._format_matrix_html(self.weights, "Calculated Weights (including bias)")))

    def predict(self, X):
        """Predict using the trained RBF network."""
        rbf_outputs = self._calculate_outputs(X)
        rbf_outputs = np.hstack((rbf_outputs, np.ones((rbf_outputs.shape[0], 1))))  
        return rbf_outputs @ self.weights

In [20]:
class GeneticAlgorithm:
    def __init__(self, population_size, num_selection, mutation_rate, exposure_rate, max_iterations, seed, train_dataset, test_dataset, debug=True):
        self.population_size = population_size
        self.num_selection = num_selection
        self.mutation_rate = mutation_rate
        self.exposure_rate = exposure_rate
        self.max_iterations = max_iterations
        self.dataset_X_train, self.dataset_y_train = train_dataset
        self.dataset_X_test, self.dataset_y_test = test_dataset
        self.population = []
        np.random.seed(seed)
        self.debug = debug 
        self.initialize_population()

    def calculate_fitness(self):
        fitness_test_values = []
        fitness_train_values = []

        for centroids, spread in self.population:
            rbf = RBF(spread, centroids, debug=False)

            # Fit the model on the training data
            rbf.fit(self.dataset_X_train, self.dataset_y_train)

            # Predictions on the training data
            train_predictions = rbf.predict(self.dataset_X_train)
            train_predicted_classes = (train_predictions >= 0.5).astype(int)
            train_correct_predictions = sum(train_predicted_classes.flatten() == self.dataset_y_train)
            train_accuracy = train_correct_predictions / len(self.dataset_y_train)

            train_true_positive = sum((train_predicted_classes.flatten() == 1) & (self.dataset_y_train == 1))
            train_false_positive = sum((train_predicted_classes.flatten() == 1) & (self.dataset_y_train == 0))
            train_false_negative = sum((train_predicted_classes.flatten() == 0) & (self.dataset_y_train == 1))

            train_precision = train_true_positive / (train_true_positive + train_false_positive) if (train_true_positive + train_false_positive) > 0 else 0
            train_recall = train_true_positive / (train_true_positive + train_false_negative) if (train_true_positive + train_false_negative) > 0 else 0

            train_f1_score = 2 * (train_precision * train_recall) / (train_precision + train_recall) if train_precision + train_recall > 0 else 0
            fitness_train_values.append(train_f1_score)

            # Predictions on the test data
            test_predictions = rbf.predict(self.dataset_X_test)
            test_predicted_classes = (test_predictions >= 0.5).astype(int)
            test_correct_predictions = sum(test_predicted_classes.flatten() == self.dataset_y_test)
            test_accuracy = test_correct_predictions / len(self.dataset_y_test)

            test_true_positive = sum((test_predicted_classes.flatten() == 1) & (self.dataset_y_test == 1))
            test_false_positive = sum((test_predicted_classes.flatten() == 1) & (self.dataset_y_test == 0))
            test_false_negative = sum((test_predicted_classes.flatten() == 0) & (self.dataset_y_test == 1))

            test_precision = test_true_positive / (test_true_positive + test_false_positive) if (test_true_positive + test_false_positive) > 0 else 0
            test_recall = test_true_positive / (test_true_positive + test_false_negative) if (test_true_positive + test_false_negative) > 0 else 0

            test_f1_score = 2 * (test_precision * test_recall) / (test_precision + test_recall) if test_precision + test_recall > 0 else 0
            fitness_test_values.append(test_f1_score)

        return fitness_train_values, fitness_test_values

    def initialize_population(self):
        dataset_X = self.dataset_X_train
        for _ in range(self.population_size):
            num_centroids = np.random.randint(24, 90)
            centroids = self._generate_unique_centroids(dataset_X, num_centroids)
            spread = np.random.uniform(0.01, 2)
            self.population.append((centroids, spread))

        # Debug output for initial population
        if self.debug:
            print("Initial Population:")
            for i, (centroids, spread) in enumerate(self.population):
                print(f"Individual {i + 1}: Number of Centroids={len(centroids)}, Spread={spread:.4f}")

    def _generate_unique_centroids(self, dataset_X, num_centroids):
        """Generate unique centroids by ensuring no duplicates."""
        indices = np.random.choice(dataset_X.shape[0], num_centroids, replace=False)
        return dataset_X[indices]

    def select(self, fitness_values):
        total_fitness = sum(fitness_values)
        selection_probabilities = [f / total_fitness for f in fitness_values]
        cumulative_probabilities = np.cumsum(selection_probabilities)

        selected_population = []
        for _ in range(self.num_selection):
            random_value = np.random.rand()
            selected_index = np.searchsorted(cumulative_probabilities, random_value)
            selected_population.append(self.population[selected_index])

        if self.debug:
            print("Selected Population:")
            for i, (centroids, spread) in enumerate(selected_population):
                print(f"Selected Individual {i + 1}: Number of Centroids={len(centroids)}, Spread={spread:.4f}")
        return selected_population

    def crossover(self, parent1, parent2):
        x = np.random.uniform(0.2, 0.8)
        centroids1, spread1 = parent1
        centroids2, spread2 = parent2

        num_centroids1 = centroids1.shape[0]
        num_centroids2 = centroids2.shape[0]

        # Child 1
        num_child_centroids1_from_parent1 = round(x * num_centroids1)
        num_child_centroids1_from_parent2 = round((1 - x) * num_centroids2)

        selected_indices1 = np.random.choice(num_centroids1, num_child_centroids1_from_parent1, replace=False)
        selected_centroids1 = centroids1[selected_indices1]
        selected_indices2 = np.random.choice(num_centroids2, num_child_centroids1_from_parent2, replace=False)
        selected_centroids2 = centroids2[selected_indices2]
        child1_centroids = np.vstack((selected_centroids1, selected_centroids2))

        # Child 2
        num_child_centroids2_from_parent1 = round((1 - x) * num_centroids1)
        num_child_centroids2_from_parent2 = round(x * num_centroids2)

        selected_indices3 = np.random.choice(num_centroids1, num_child_centroids2_from_parent1, replace=False)
        selected_centroids3 = centroids1[selected_indices3]
        selected_indices4 = np.random.choice(num_centroids2, num_child_centroids2_from_parent2, replace=False)
        selected_centroids4 = centroids2[selected_indices4]
        child2_centroids = np.vstack((selected_centroids3, selected_centroids4))

        # Ensure uniqueness in child centroids
        child1_centroids = self._remove_duplicates(child1_centroids)
        child2_centroids = self._remove_duplicates(child2_centroids)

        if self.debug:
            print(f"Crossover: x={x:.4f}")
            print(f"Parent 1: Number of Centroids={num_centroids1}")
            print(f"Parent 2: Number of Centroids={num_centroids2}")
            print(f"Child 1: Number of Centroids={len(child1_centroids)}")
            print(f"Child 2: Number of Centroids={len(child2_centroids)}")

        return (child1_centroids, spread1), (child2_centroids, spread2)

    def _remove_duplicates(self, centroids):
        """Remove duplicate centroids."""
        unique_centroids = np.unique(centroids, axis=0)
        return unique_centroids[:max(1, len(centroids))]

    def mutate(self, chromosome):
        centroids, spread = chromosome
        if np.random.rand() < self.exposure_rate:
            # Original mutation on centroid values
            for i in range(len(centroids)):
                if np.random.rand() < self.mutation_rate:
                    mutation_factor = np.random.uniform(-self.mutation_rate, self.mutation_rate)
                    centroids[i] = centroids[i] * (1 + mutation_factor)

            # Update spread within bounds
            spread += np.random.uniform(-self.mutation_rate, self.mutation_rate)
            spread = np.clip(spread, 0.1, 3)

            # New mutation to modify the number of centroids
            if np.random.rand() < self.mutation_rate:
                # Calculate the percentage change in the number of centroids
                change_percent = np.random.choice([2, 5]) * self.mutation_rate
                target_centroids_count = max(1, round(len(centroids) * (1 + change_percent)))

                if target_centroids_count > len(centroids):
                    # Add new unique centroids if the target count is higher
                    additional_centroids = self._generate_unique_centroids(self.dataset_X_train, target_centroids_count - len(centroids))
                    centroids = np.vstack((centroids, additional_centroids))
                elif target_centroids_count < len(centroids):
                    # Remove random centroids if the target count is lower
                    keep_indices = np.random.choice(len(centroids), target_centroids_count, replace=False)
                    centroids = centroids[keep_indices]

            # Ensure centroids remain unique after mutation
            centroids = self._remove_duplicates(centroids)

            if self.debug:
                print(f"Mutating: New Number of Centroids={len(centroids)}, Spread={spread:.4f}")
        return centroids, spread

    def run(self):
        best_train_fitness = 0
        best_test_fitness = 0
        best_accuracy_train = 0
        best_accuracy_test = 0
        best_parameters = None

        for iteration in range(self.max_iterations):
            fitness_train_values, fitness_test_values = self.calculate_fitness()
            best_train_fitness = max(fitness_train_values)
            best_test_fitness = max(fitness_test_values)

            print(f"Iteration {iteration + 1}, Best Fitness Train: {best_train_fitness}, Best Fitness Test: {best_test_fitness}")

            best_index = np.argmax(fitness_test_values)
            best_centroids, best_spread = self.population[best_index]

            rbf = RBF(best_spread, best_centroids, debug=False)
            rbf.fit(self.dataset_X_train, self.dataset_y_train)

            # Training accuracy
            train_predictions = rbf.predict(self.dataset_X_train)
            train_predicted_classes = (train_predictions >= 0.5).astype(int)
            train_correct_predictions = sum(train_predicted_classes.flatten() == self.dataset_y_train)
            accuracy_train = train_correct_predictions / len(self.dataset_y_train)

            # Testing accuracy
            test_predictions = rbf.predict(self.dataset_X_test)
            test_predicted_classes = (test_predictions >= 0.5).astype(int)
            test_correct_predictions = sum(test_predicted_classes.flatten() == self.dataset_y_test)
            accuracy_test = test_correct_predictions / len(self.dataset_y_test)

            print(f"Iteration {iteration + 1}, Train Accuracy: {accuracy_train:.4f}, Test Accuracy: {accuracy_test:.4f}")

            if accuracy_test > best_accuracy_test:
                best_accuracy_train = accuracy_train
                best_accuracy_test = accuracy_test
                best_parameters = {
                    "num_centroids": len(best_centroids),
                    "spread": best_spread
                }

            selected_population = self.select(fitness_test_values)
            new_population = []

            while len(new_population) < self.population_size:
                parent1, parent2 = random.sample(selected_population, 2)
                child1, child2 = self.crossover(parent1, parent2)
                mutated_child1 = self.mutate(child1)
                mutated_child2 = self.mutate(child2)
                new_population.extend([mutated_child1, mutated_child2])

            # Ensure the new population doesn't exceed population_size
            self.population = new_population[:self.population_size]

        print(f"Best Train Accuracy: {best_accuracy_train:.4f}, Best Test Accuracy: {best_accuracy_test:.4f}")
        print(f"Best Parameters: Number of Centroids = {best_parameters['num_centroids']}, Spread = {best_parameters['spread']}")


In [25]:
# Parameters for the Genetic Algorithm
population_size = 500  # jumlah populasi awal
num_selection = 250    # Jumalah seleksi populasi, rekomendasi 50% populasi awal
mutation_rate = 0.4    # Probabilitas centroid mutasi
exposure_rate = 0.25   # Probabilitas chromosome terkena mutasi
max_iterations = 12     # Iterasi
seed = 109          # Seed Random

# Create an instance of the Genetic Algorithm
ga = GeneticAlgorithm(
    population_size=population_size,
    num_selection=num_selection,
    mutation_rate=mutation_rate,
    exposure_rate=exposure_rate,
    max_iterations=max_iterations,
    seed=seed,
    train_dataset=(X_train, y_train),
    test_dataset=(X_test, y_test),
    debug=False
)

# Evaluasi

In [26]:
np.random.seed(seed=109)
ga.run()

Iteration 1, Best Fitness Train: 0.783641160949868, Best Fitness Test: 0.7983539094650206
Iteration 1, Train Accuracy: 0.7707, Test Accuracy: 0.8040
Iteration 2, Best Fitness Train: 0.8430609597924773, Best Fitness Test: 0.8083333333333333
Iteration 2, Train Accuracy: 0.8387, Test Accuracy: 0.8160
Iteration 3, Best Fitness Train: 0.9270833333333333, Best Fitness Test: 0.8898305084745762
Iteration 3, Train Accuracy: 0.9253, Test Accuracy: 0.8960
Iteration 4, Best Fitness Train: 0.9308996088657107, Best Fitness Test: 0.8870292887029289
Iteration 4, Train Accuracy: 0.9293, Test Accuracy: 0.8920
Iteration 5, Best Fitness Train: 0.9322916666666666, Best Fitness Test: 0.8823529411764706
Iteration 5, Train Accuracy: 0.9307, Test Accuracy: 0.8880
Iteration 6, Best Fitness Train: 0.9352331606217616, Best Fitness Test: 0.8786610878661087
Iteration 6, Train Accuracy: 0.9333, Test Accuracy: 0.8840
Iteration 7, Best Fitness Train: 0.9076723016905073, Best Fitness Test: 0.8464730290456433
Iteration 