In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from ucimlrepo import fetch_ucirepo 

adult = fetch_ucirepo(id=2)   #adult dataset 
X = adult.data.features 
y = adult.data.targets 

df = X.copy()
df['y'] = y
 
df=df.dropna()

df['target'] = df['y'].map({'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1})
df = df.drop('y', axis=1)

df_num = df[['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week', 'target']]

def sample_from_group(group):
    return group.sample(min(len(group), 6000), random_state=42)

sampled_df = df_num.groupby('target', group_keys=False).apply(sample_from_group)

sampled_df.reset_index(drop=True, inplace=True)


# MinMaxScaler
scaler = MinMaxScaler()
df_num_scaled = scaler.fit_transform(sampled_df)
df_num = pd.DataFrame(df_num_scaled, columns=df_num.columns)

target_counts = df_num['target'].value_counts()
print(target_counts)

num_observations = df_num.shape[0]
print("Number of Observations:", num_observations)

0.0    6000
1.0    6000
Name: target, dtype: int64
Number of Observations: 12000


In [3]:
# Randomly select 3000 rows
df_X_test = df_num.sample(n=3000, random_state=42)

# Remove the selected rows from the original DataFrame to create the second subset
df_subset_2 = df_num.drop(df_X_test.index)

# df_subset_1 contains 3000 randomly selected observations for test set
# and df_subset_2 contains the rest of the observations from the original subset for train and val

print("Number of rows in df_subset_1:", len(df_X_test))
print("Number of rows in df_subset_2:", len(df_subset_2))

Number of rows in df_subset_1: 3000
Number of rows in df_subset_2: 9000


In [42]:
df_num = df_subset_2.copy()

In [5]:
df_X_test1 = df_X_test.drop('target', axis=1)
df_y_test1 = df_X_test['target']

In [41]:
X = df_num.drop('target', axis=1)
y = df_num['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [43]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7026666666666667


In [44]:
# observations that was misclassified from val set

error_mask = y_test != y_pred
error_features = X_test[error_mask]
error_points = y_test[error_mask]

errors_0 = error_features[y_test[error_mask] == 0]
errors_1 = error_features[y_test[error_mask] == 1]
errors_0_y = error_points[y_test[error_mask] == 0]
errors_1_y = error_points[y_test[error_mask] == 1]

print(len(errors_0))
print(len(errors_1))

353
316


In [78]:
print(errors_0.describe().loc[['min', 'max']])
print(errors_1.describe().loc[['min', 'max']])

          age    fnlwgt  capital-gain  capital-loss  hours-per-week
min  0.082192  0.011991      0.000000      0.000000        0.010204
max  0.739726  0.649530      0.068491      0.459366        0.887755
          age    fnlwgt  capital-gain  capital-loss  hours-per-week
min  0.082192  0.000000       0.00000      0.000000        0.061224
max  0.821918  0.474041       0.04386      0.518365        0.908163


In [79]:
# Ranges for each feature for class 0
feature_ranges_0 = {
    'age': (0.082192, 0.739726),
    'fnlwgt': (0.011991, 0.649530),
    'capital-gain': (0.0, 0.068491),
    'capital-loss': (0.0, 0.459366),
    'hours-per-week': (0.010204,  0.887755)
}

# Ranges for each feature for class 1
feature_ranges_1 = {
    'age': (0.082192, 0.821918),
    'fnlwgt': (0.0, 0.474041),
    'capital-gain': (0.0,  0.04386 ),
    'capital-loss': (0.0, 0.518365),
    'hours-per-week': (0.061224, 0.908163)
}

# Filter class 0
filtered_class_0_with_y = df_num[(df_num['target'] == 0) &
    (df_num['age'].between(feature_ranges_0['age'][0], feature_ranges_0['age'][1])) &
    (df_num['fnlwgt'].between(feature_ranges_0['fnlwgt'][0], feature_ranges_0['fnlwgt'][1])) &
    (df_num['capital-gain'].between(feature_ranges_0['capital-gain'][0], feature_ranges_0['capital-gain'][1])) &
    (df_num['capital-loss'].between(feature_ranges_0['capital-loss'][0], feature_ranges_0['capital-loss'][1])) &
    (df_num['hours-per-week'].between(feature_ranges_0['hours-per-week'][0], feature_ranges_0['hours-per-week'][1]))
]

# Filter class 1
filtered_class_1_with_y = df_num[(df_num['target'] == 1) &
    (df_num['age'].between(feature_ranges_1['age'][0], feature_ranges_1['age'][1])) &
    (df_num['fnlwgt'].between(feature_ranges_1['fnlwgt'][0], feature_ranges_1['fnlwgt'][1])) &
    (df_num['capital-gain'].between(feature_ranges_1['capital-gain'][0], feature_ranges_1['capital-gain'][1])) &
    (df_num['capital-loss'].between(feature_ranges_1['capital-loss'][0], feature_ranges_1['capital-loss'][1])) &
    (df_num['hours-per-week'].between(feature_ranges_1['hours-per-week'][0], feature_ranges_1['hours-per-week'][1]))
]


print(filtered_class_0_with_y)
print(filtered_class_1_with_y)

           age    fnlwgt  capital-gain  capital-loss  hours-per-week  target
1     0.301370  0.254985           0.0       0.00000        0.397959     0.0
6     0.095890  0.155360           0.0       0.00000        0.295918     0.0
9     0.328767  0.036319           0.0       0.00000        0.397959     0.0
11    0.178082  0.072266           0.0       0.00000        0.602041     0.0
16    0.273973  0.021040           0.0       0.00000        0.397959     0.0
...        ...       ...           ...           ...             ...     ...
5990  0.547945  0.134525           0.0       0.00000        0.397959     0.0
5992  0.671233  0.074280           0.0       0.00000        0.500000     0.0
5994  0.438356  0.087531           0.0       0.00000        0.448980     0.0
5996  0.561644  0.202261           0.0       0.00000        0.397959     0.0
5997  0.219178  0.071503           0.0       0.20202        0.397959     0.0

[3505 rows x 6 columns]
            age    fnlwgt  capital-gain  capital-lo

In [45]:
from sklearn.neighbors import NearestNeighbors

# Concatenate errors and their corresponding labels for each class
errors_all = pd.concat([errors_0, errors_1])
errors_all_y = pd.concat([errors_0_y, errors_1_y])

# Calculate density for each data point
k_neighbors = 10  
nn = NearestNeighbors(n_neighbors=k_neighbors)
nn.fit(errors_all)
distances, _ = nn.kneighbors(errors_all)

# Calculate density as the inverse of the mean distance to the k nearest neighbors
density = 1.0 / (distances.mean(axis=1) + 1e-6)  # Add a small value to avoid division by zero

# Sort data points by density
sorted_indices = np.argsort(density)

# Select the top 30 % densest data points
densest_points_indices = sorted_indices[-int(len(errors_all) * 0.3):]

# Extract the densest 30 % points and their corresponding labels
densest_points = errors_all.iloc[densest_points_indices]
densest_points_y = errors_all_y.iloc[densest_points_indices]

# Print the lengths of densest points for each class
print("Number of densest points in class 0:", (densest_points_y == 0).sum())
print("Number of densest points in class 1:", (densest_points_y == 1).sum())

Number of densest points in class 0: 108
Number of densest points in class 1: 92


In [46]:
# Calculate the minimum and maximum values for each feature in the densest 30 % points for class 0
ranges_densest_points_class_0 = densest_points[densest_points_y == 0].agg(['min', 'max'])

# Calculate the minimum and maximum values for each feature in the densest 30 % points for class 1
ranges_densest_points_class_1 = densest_points[densest_points_y == 1].agg(['min', 'max'])

print("Ranges of the densest 50% points for class 0:")
print(ranges_densest_points_class_0)

print("\nRanges of the densest 50% points for class 1:")
print(ranges_densest_points_class_1)

Ranges of the densest 50% points for class 0:
          age    fnlwgt  capital-gain  capital-loss  hours-per-week
min  0.136986  0.028452           0.0           0.0        0.397959
max  0.561644  0.254985           0.0           0.0        0.408163

Ranges of the densest 50% points for class 1:
          age    fnlwgt  capital-gain  capital-loss  hours-per-week
min  0.123288  0.043407           0.0           0.0        0.377551
max  0.561644  0.256510           0.0           0.0        0.397959


In [81]:
# Ranges for each feature for class 0
feature_ranges_0 = {
    'age': (0.136986, 0.561645),
    'fnlwgt': (0.028452, 0.2550),
    'capital-gain': (0.0, 0.0),
    'capital-loss': (0.0, 0.0),
    'hours-per-week': (0.397959, 0.408164)
}

# Ranges for each feature for class 1
feature_ranges_1 = {
    'age': (0.123288, 0.561645),
    'fnlwgt': (0.043407, 0.256510),
    'capital-gain': (0.0, 0.0),
    'capital-loss': (0.0, 0.0),
    'hours-per-week': (0.377550, 0.39796)
}

# Filter class 0
filtered_class_0_with_y = df_num[(df_num['target'] == 0) &
    (df_num['age'].between(feature_ranges_0['age'][0], feature_ranges_0['age'][1])) &
    (df_num['fnlwgt'].between(feature_ranges_0['fnlwgt'][0], feature_ranges_0['fnlwgt'][1])) &
    (df_num['capital-gain'].between(feature_ranges_0['capital-gain'][0], feature_ranges_0['capital-gain'][1])) &
    (df_num['capital-loss'].between(feature_ranges_0['capital-loss'][0], feature_ranges_0['capital-loss'][1])) &
    (df_num['hours-per-week'].between(feature_ranges_0['hours-per-week'][0], feature_ranges_0['hours-per-week'][1]))
]

# Filter class 1
filtered_class_1_with_y = df_num[(df_num['target'] == 1) &
    (df_num['age'].between(feature_ranges_1['age'][0], feature_ranges_1['age'][1])) &
    (df_num['fnlwgt'].between(feature_ranges_1['fnlwgt'][0], feature_ranges_1['fnlwgt'][1])) &
    (df_num['capital-gain'].between(feature_ranges_1['capital-gain'][0], feature_ranges_1['capital-gain'][1])) &
    (df_num['capital-loss'].between(feature_ranges_1['capital-loss'][0], feature_ranges_1['capital-loss'][1])) &
    (df_num['hours-per-week'].between(feature_ranges_1['hours-per-week'][0], feature_ranges_1['hours-per-week'][1]))
]


print(filtered_class_0_with_y)
print(filtered_class_1_with_y)

           age    fnlwgt  capital-gain  capital-loss  hours-per-week  target
1     0.301370  0.254985           0.0           0.0        0.397959     0.0
9     0.328767  0.036319           0.0           0.0        0.397959     0.0
18    0.178082  0.038210           0.0           0.0        0.397959     0.0
34    0.260274  0.085881           0.0           0.0        0.397959     0.0
40    0.150685  0.111286           0.0           0.0        0.397959     0.0
...        ...       ...           ...           ...             ...     ...
5985  0.534247  0.096034           0.0           0.0        0.397959     0.0
5986  0.506849  0.092272           0.0           0.0        0.397959     0.0
5987  0.328767  0.177850           0.0           0.0        0.397959     0.0
5990  0.547945  0.134525           0.0           0.0        0.397959     0.0
5996  0.561644  0.202261           0.0           0.0        0.397959     0.0

[1157 rows x 6 columns]
            age    fnlwgt  capital-gain  capital-lo

In [82]:
iterations = 500

all_predicted_probabilities = []

for i in range(iterations):
    
    additional_samples_class_0 = df_num[(df_num['target'] == 0) & 
                                               (~df_num.index.isin(filtered_class_0_with_y.index))].sample(n=500)
    
    additional_samples_class_1 = df_num[(df_num['target'] == 1) & 
                                               (~df_num.index.isin(filtered_class_1_with_y.index))].sample(n=500)
    
    # Combine selected instances
    combined_df = pd.concat([filtered_class_0_with_y, filtered_class_1_with_y, additional_samples_class_0, additional_samples_class_1])
    
    X = combined_df.drop(columns=['target'])
    y = combined_df['target']
    
    clf = RandomForestClassifier()
    clf.fit(X, y)
    proba = clf.predict_proba(df_X_test1)
    
    all_predicted_probabilities.append(proba)

soft_voting_predictions = np.mean(all_predicted_probabilities, axis=0)

soft_voting_accuracy = accuracy_score(df_y_test1, np.argmax(soft_voting_predictions, axis=1))

print("Soft Voting Accuracy:", soft_voting_accuracy)


Soft Voting Accuracy: 0.688
Hard Voting Accuracy: 0.688


In [73]:
from sklearn.utils import resample

# Bootstrap

n_bootstraps = 500

accuracy_scores = []
all_predicted_probabilities = []

for i in range(n_bootstraps):
    X_train_bootstrap, y_train_bootstrap = resample(X, y, replace=True, random_state=i)
    
    model = RandomForestClassifier()
    model.fit(X_train_bootstrap, y_train_bootstrap)
    
    y_pred = model.predict(df_X_test1)

    all_predicted_probabilities.append(model.predict_proba(df_X_test1))
    
    accuracy = accuracy_score(df_y_test1, y_pred)
    accuracy_scores.append(accuracy)

mean_accuracy = np.mean(accuracy_scores)
print("Mean Accuracy:", mean_accuracy)

soft_voting_predictions = np.mean(all_predicted_probabilities, axis=0)
soft_voting_accuracy = accuracy_score(df_y_test1, np.argmax(soft_voting_predictions, axis=1))
print("Soft Voting Accuracy:", soft_voting_accuracy)


Mean Accuracy: 0.667432
Soft Voting Accuracy: 0.6796666666666666
Hard Voting Accuracy: 0.6796666666666666


In [74]:
# Single random forest

model = RandomForestClassifier()
model.fit(X, y)

y_pred = model.predict(df_X_test1)

accuracy = accuracy_score(df_y_test1, y_pred)
print(accuracy)

0.6686666666666666
