In [103]:
from variable_importance.dgp import DataGenerator

# Smoke run: build one generator on a small 10x10 configuration and draw a
# single dataset from it. `dgp` and `data` stay in the notebook namespace.
dgp = DataGenerator(
    num_cols=10,
    num_rows=10,
    num_important=5,
    effects='all',
    num_interaction_terms=2,
    correlation_range=[-1, -0.9],
)
data = dgp.generate_data()
# Uncomment to inspect the generator's `frequencies` attribute:
# dgp.frequencies

In [56]:
from variable_importance.dgp import DataGenerator

# Stress test: construct a generator and draw a dataset 1000 times with a wide
# (100-column), short (10-row) configuration. The loop stops at the first
# exception, which is reported rather than re-raised so the cell still completes.
# NOTE(review): no random seed is set in this cell, so a failing iteration may
# not reproduce on re-run -- confirm how DataGenerator seeds its RNG.
try:
    for i in range(1000):
        dgp = DataGenerator(num_cols=100, num_rows=10, num_important=5, effects='all', num_interaction_terms=20, correlation_range=[-1, -0.9])
        data = dgp.generate_data()
except Exception as e:
    # Include the iteration index and the exception class: the bare message
    # alone does not say where in the 1000 runs the failure happened or what
    # kind of error it was.
    print(f"exception occurred on iteration {i}: {type(e).__name__}: {e}")


In [94]:
import unittest
import numpy as np
from variable_importance.dgp import DataGenerator

class TestNoiseGeneration(unittest.TestCase):
    def setUp(self):
        self.num_cols = 10
        self.generator = DataGenerator(num_cols=self.num_cols, num_rows=100)

    def test_uniform_noise(self):
        noise = self.generator.generate_noise(size=1000, distribution='uniform', scale=5)
        self.assertEqual(len(noise), 1000)
        self.assertTrue(np.all(noise >= -5) and np.all(noise <= 5))

    def test_normal_noise(self):
        noise = self.generator.generate_noise(size=1000, distribution='normal', scale=2)
        self.assertEqual(len(noise), 1000)
        self.assertTrue(np.mean(noise) < 0.5)  # Assuming mean is approximately 0
        self.assertTrue(np.std(noise) > 1.5 and np.std(noise) < 2.5)  # Assuming std deviation is approximately 2

    def test_gamma_noise(self):
        noise = self.generator.generate_noise(size=1000, distribution='gamma', scale=2)
        self.assertEqual(len(noise), 1000)
        self.assertTrue(np.all(noise >= 0))  # Gamma distribution is always non-negative
        self.assertTrue(np.mean(noise) > 0)

    def test_unsupported_distribution(self):
        with self.assertRaises(ValueError):
            self.generator.generate_noise(size=100, distribution='unsupported', scale=1)

# Run the tests inside the notebook kernel. The placeholder argv keeps unittest
# from parsing the kernel's own command-line arguments (the first element is
# treated as the program name and ignored), and exit=False stops unittest from
# calling sys.exit() once the run finishes.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

....
----------------------------------------------------------------------
Ran 4 tests in 0.012s

OK


In [100]:
import unittest
import numpy as np
import pandas as pd

class TestDataGenerator(unittest.TestCase):
    def setUp(self):
        self.generator = DataGenerator(num_cols=10, num_rows=100, num_important=3, noise_scale=1, noise_distribution='normal')

    def test_initialization(self):
        """Test the initialization and settings of DataGenerator parameters."""
        self.assertEqual(self.generator.num_cols, 10)
        self.assertEqual(self.generator.num_rows, 100)
        self.assertEqual(self.generator.num_important, 3)
        self.assertIsNotNone(self.generator.frequencies)
        self.assertEqual(len(self.generator.frequencies), 10)

    def test_noise_generation(self):
        """Test noise generation for uniform and normal distributions."""
        noise_uniform = self.generator.generate_noise(100, 'uniform', 1)
        noise_normal = self.generator.generate_noise(100, 'normal', 1)
        self.assertEqual(len(noise_uniform), 100)
        self.assertEqual(len(noise_normal), 100)
        self.assertTrue(np.all(noise_uniform >= -1) and np.all(noise_uniform <= 1))
        # Check that the noise is normally distributed by checking the mean is close to 0
        self.assertTrue(abs(np.mean(noise_normal)) < 0.5)

    def test_interaction_generation(self):
        """Test the generation of interaction terms."""
        interactions = self.generator.generate_interactions()
        self.assertIsInstance(interactions, dict)
        self.assertTrue(all(isinstance(k, tuple) and len(k) == 2 for k in interactions.values()))

    def test_data_generation(self):
        """Test the overall data generation process."""
        df = self.generator.generate_data()
        self.assertIsInstance(df, pd.DataFrame)
        self.assertEqual(df.shape, (100, 11))  # 10 features + 1 target
        self.assertTrue('target' in df.columns)

    def test_effects_application(self):
        """Test that effects are applied correctly."""
        self.generator.effects = {i: (lambda x: x * 2) for i in range(10)}
        df = self.generator.generate_data()
        for col in range(10):
            with self.subTest(column=col):
                self.assertTrue((df[col] * 2).equals(df[col] * df[col].apply(self.generator.effects[col])))

# Running the tests in Jupyter Notebook.
# The placeholder argv keeps unittest from parsing the kernel's command-line
# arguments, and exit=False prevents a sys.exit() call after the run.
# unittest.main() collects every TestCase defined in __main__, so test classes
# left over from earlier cells run again here -- the recorded output below shows
# 9 tests, although this cell defines only 5.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.

.F......
FAIL: test_initialization (__main__.TestDataGenerator)
Test the initialization and settings of DataGenerator parameters.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_7197/2874237749.py", line 15, in test_initialization
    self.assertEqual(len(self.generator.frequencies), 10)
AssertionError: 100 != 10

----------------------------------------------------------------------
Ran 9 tests in 0.043s

FAILED (failures=1)
