In [1]:
# imports
import numpy as np
import umap

# modelling
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Tree Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# misc
import os

In [2]:
# Function to load and preprocess Quickdraw data
def load_quickdraw_data(dir, category_name, n_samples=10000):
    file = f'full_numpy_bitmap_{category_name}.npy'
    data = np.load(os.path.join(dir, file))
    if n_samples == -1:
        return data
    else:
        return data[:n_samples]

In [3]:
# Data directory
dir = '../data/'

# Load and preprocess data
all_data = []
labels = []

In [4]:
files = os.listdir(dir)
categories = [file.split('_')[-1].split('.')[0] for file in files]

In [5]:
for category_name in categories:
    category_data = load_quickdraw_data(dir, category_name)
    all_data.extend(category_data)
    labels.extend([category_name] * len(category_data))

In [6]:
# Data generator function
def data_generator(data, batch_size=512):
    n_samples = len(data)
    for i in range(0, n_samples, batch_size):
        batch = data[i:i + batch_size]
        yield batch

In [7]:
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(all_data, labels, test_size=0.2, random_state=42)

In [8]:
print(len(all_data))

3450000


In [9]:
# Normalize data
x_train = np.array(x_train).astype(np.float32) / 255
x_test = np.array(x_test).astype(np.float32) / 255

In [10]:
# Add Outliers
num_outliers = 500

# Generate random outliers
outliers = np.random.rand(num_outliers, 784)  # Assuming your data has 784 features

In [11]:
# Append outliers to your training data
x_train_with_outliers = np.vstack((x_train, outliers))

# Create labels for the outliers (e.g., label them as "outlier" or use a different category)
y_train_with_outliers = y_train + ['outlier'] * num_outliers

In [12]:
# Dimensionality reduction using UMAP
n_components = 10  # You can adjust this number as needed
umap_model = umap.UMAP(n_components=n_components)

In [13]:
umap_results_train = None
# Perform UMAP on x_train_with_outliers
for batch in data_generator(x_train_with_outliers):
    batch_umap = umap_model.fit_transform(batch)
    if umap_results_train is None:
        umap_results_train = batch_umap
    else:
        umap_results_train = np.vstack((umap_results_train, batch_umap))

MemoryError: Unable to allocate 15.4 MiB for an array with shape (404992, 10) and data type float32

In [None]:
# Shuffle the dataset after adding synthetic outliers
umap_results_train, y_train_with_outliers = shuffle(umap_results_train, y_train_with_outliers, random_state=42)

In [None]:
# Train Isolation Forest with outliers
model = IsolationForest(contamination=0.02, random_state=42)

train_data_generator = data_generator(umap_results_train)

for batch in train_data_generator:
    model.fit(batch)

In [None]:
# Predict labels for x_train_with_outliers to check if outliers are caught
y_pred_train = np.array([])  # Initialize an empty array to store predictions

for batch in data_generator(umap_results_train):
    batch_pred = model.predict(batch)
    y_pred_train = np.concatenate([y_pred_train, batch_pred])

In [None]:
# Map the predictions: 1 for outliers, 0 for inliers
y_pred_train[y_pred_train == 1] = 0  # Inliers
y_pred_train[y_pred_train == -1] = 1  # Outliers

y_train_numeric = [1 if label == 'outlier' else 0 for label in y_train_with_outliers]

In [None]:
# Evaluate the model's performance on training data
precision_train = precision_score(y_train_numeric, y_pred_train)
recall_train = recall_score(y_train_numeric, y_pred_train)
f1_train = f1_score(y_train_numeric, y_pred_train)

print("Training Data Classification Report:")
print(classification_report(y_train_numeric, y_pred_train, target_names=["inlier", "outlier"]))
print("Training Data Precision:", precision_train)
print("Training Data Recall:", recall_train)
print("Training Data F1 Score:", f1_train)

NameError: name 'precision_score' is not defined

In [None]:
# Plot the data to visualize outliers and inliers using UMAP-transformed data
plt.figure(figsize=(10, 6))
colors = ['blue' if label == 'inlier' else 'red' for label in y_train_with_outliers]
plt.scatter(umap_results_train[:, 0], umap_results_train[:, 1], c=colors)
plt.title("Outliers and Inliers (UMAP)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()

In [None]:
# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_train_numeric, model.decision_function(umap_results_train))
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, marker='.', color='blue')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [None]:
# Predict outliers on the test set
test_data_generator = data_generator(umap_model.transform(x_test))

In [None]:
# Predict outliers on the test set
y_pred = np.array([])
for batch in test_data_generator:
    y_pred_batch = model.predict(batch)
    y_pred = np.concatenate([y_pred, y_pred_batch])

In [None]:
# Map the predictions: 1 for outliers, 0 for inliers
y_pred[y_pred == 1] = 0  # Inliers
y_pred[y_pred == -1] = 1  # Outliers

y_test_numeric = [1 if label == 'outlier' else 0 for label in y_test]

In [None]:
precision = precision_score(y_test_numeric, y_pred)
recall = recall_score(y_test_numeric, y_pred)
f1 = f1_score(y_test_numeric, y_pred)

print("Classification Report:")
print(classification_report(y_test_numeric, y_pred))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Plot the data to visualize outliers and inliers in x_test using UMAP-transformed data
plt.figure(figsize=(10, 6))
colors = ['blue' if label == 'inlier' else 'red' for label in y_test]
plt.scatter(umap_model.transform(x_test)[:, 0], umap_model.transform(x_test)[:, 1], c=colors)
plt.title("Outliers and Inliers in x_test (UMAP)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()