In [1]:
import os
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Define a function for batch processing
def batch_process(model, data, batch_size):
    num_samples = len(data)
    num_batches = num_samples // batch_size

    results = []

    for i in range(num_batches):
        start = i * batch_size
        end = (i + 1) * batch_size
        batch_data = data[start:end]
        batch_result = model.predict(batch_data)
        results.extend(batch_result)

    return results

In [3]:
# Step 1: Load and preprocess the dataset
data_folder = "../data/"  # Update with the path to your dataset folder


In [4]:
def load_quickdraw_data(data_folder, n_samples=10000):
    data = []
    labels = []

    for category_file in os.listdir(data_folder):
        if category_file.endswith(".npy"):
            category = category_file.split("_")[-1][:-4]  # Extract the category from the filename
            category_data = np.load(os.path.join(data_folder, category_file))
            if len(category_data) > 0:  # Check if the loaded data is not empty
                if n_samples == -1:
                    data.extend(category_data)
                    labels.extend([category] * len(category_data))
                else:
                    data.extend(category_data[:n_samples])
                    labels.extend([category] * len(category_data[:n_samples]))
    
    return data, labels
                

In [5]:
# Define a data generator for PCA
def pca_data_generator(data, batch_size):
    num_samples = len(data)
    num_batches = num_samples // batch_size

    for i in range(num_batches):
        start = i * batch_size
        end = (i + 1) * batch_size
        batch_data = data[start:end]
        batch_data = np.array(batch_data)  # Ensure it's a NumPy array
        yield batch_data

In [6]:
data, labels = load_quickdraw_data(data_folder, n_samples=10000)  # Adjust as needed

In [7]:
# Step 2: Data preprocessing
batch_size = 100

preprocessed_data = []
num_samples = len(data)

for start in range(0, num_samples, batch_size):
    end = min(start + batch_size, num_samples)
    batch = np.array(data[start:end], dtype='float32') / 255.0
    preprocessed_data.extend(batch)

In [8]:
# Step 3: Perform PCA
batch_size_pca = 1000  # Adjust as needed
num_components = 50  # Adjust as needed
pca = PCA(n_components=num_components)

data_pca = None

pca_generator = pca_data_generator(data, batch_size_pca)

for batch in pca_generator:
    batch_pca = pca.fit_transform(batch)
    if data_pca is None:
        data_pca = batch_pca
    else:
        data_pca = np.vstack((data_pca, batch_pca))

In [9]:
# Step 4: Split the data
X_train, X_test, y_train, y_test = train_test_split(data_pca, labels, test_size=0.2, random_state=42)

In [10]:
# Step 5: Train the SVM model in batches
svm_model = OneClassSVM(nu=0.05)  # Adjust nu as needed

In [12]:
batch_size = 32  # Adjust as needed
num_batches = len(X_train) // batch_size

for i in range(num_batches):
    start = i * batch_size
    end = (i + 1) * batch_size
    batch_data = X_train[start:end]
    svm_model.fit(batch_data)

In [13]:
# Step 6: Create synthetic data (Example: Add noise to some samples)
synthetic_data = X_test.copy()
num_synthetic_samples = 100  # Adjust as needed
for _ in range(num_synthetic_samples):
    index = np.random.randint(0, len(synthetic_data))
    synthetic_data[index] += np.random.normal(0, 0.1, num_components)

In [19]:
# Step 7: Model evaluation using batch processing
y_pred_test = batch_process(svm_model, X_test, batch_size)
y_pred_synthetic = batch_process(svm_model, synthetic_data, batch_size)

In [20]:
print("Evaluation on the test data:")
print(classification_report(y_test, y_pred_test))

Evaluation on the test data:


ValueError: Found input variables with inconsistent numbers of samples: [690000, 689984]

In [21]:
print("Evaluation on synthetic data (anomalies):")
print(classification_report(np.ones(num_synthetic_samples), y_pred_synthetic))


Evaluation on synthetic data (anomalies):


ValueError: Found input variables with inconsistent numbers of samples: [100, 689984]