# Init project

In [1]:
import numpy as np
import pandas as pd

## Set random seed

In [2]:
# Fix the seed of NumPy's global random generator so the generated dataset is
# the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

In [3]:
# Number of rows to generate; used as `size` for every random draw below.
n_points: int = 300

In [None]:
# NOTE: the order of the draws matters -- every call consumes NumPy's global
# random state, so reordering these lines changes the generated values.

# Independent normal features: mean 5 / std 1 and mean 2.5 / std 1.
column1 = np.random.normal(5, 1, n_points)
column2 = np.random.normal(2.5, 1.0, n_points)

# Integer feature drawn uniformly from 1..9 (`high` is exclusive).
column3 = np.random.randint(low=1, high=10, size=n_points)

# Noisy copies of column1: column4 follows it, column5 follows its negation.
column4 = column1 + np.random.normal(0, 0.5, n_points)
column5 = -column1 + np.random.normal(0, 0.5, n_points)

# Independent, wider normal feature centred on 0 (std 3).
column6 = np.random.normal(0, 3, n_points)

In [13]:
# Assemble the generated arrays into one frame; column order follows the
# pairing below and each array keeps its own dtype.
data = pd.DataFrame(dict(zip(
    ('Col1', 'Col2', 'Col3', 'Col4', 'Col5', 'Col6'),
    (column1, column2, column3, column4, column5, column6),
)))

In [None]:
# Write the dataset next to the notebook; index=False leaves the row index
# out of the file so it holds only the six generated columns.
data.to_csv(path_or_buf='./artificial_dataset.csv', index=False)
print("The dataset is saved in 'artificial_dataset.csv'.")

The dataset is saved in 'artificial_dataset.csv'.


In [15]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) as a
# quick sanity check of the generated distributions.
data.describe()

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,4.994451,2.47851,4.976667,4.989699,-5.01235,0.049447
std,0.984194,0.961618,2.606165,1.06832,1.063575,2.871818
min,1.758733,0.028355,1.0,1.680367,-9.02924,-8.613848
25%,4.316754,1.795595,3.0,4.244394,-5.685678,-1.87714
50%,5.059219,2.481235,5.0,4.955621,-5.005157,0.106975
75%,5.626658,3.116338,7.0,5.737971,-4.292856,1.90543
max,8.852731,5.578881,9.0,8.63024,-1.954515,7.989547


# Check result

In [16]:
# Maps a human-readable check name to its pass/fail outcome; filled in by the
# check cells below and asserted at the end of the notebook.
results = dict()

### Check that all columns have different means

In [17]:
# One mean per column; the check passes when no two columns share exactly
# the same mean.
means = data.mean(axis=0)
results.update({'Unique Means': means.is_unique})

### Check that each column has a different standard deviation

In [18]:
# One sample standard deviation per column; the check passes when no two
# columns share exactly the same value.
stds = data.std(axis=0)
results.update({'Unique Standard Deviations': stds.is_unique})

### Verify that at least one column contains integers

In [19]:
def _is_whole_valued(series):
    """Return whether every value of `series` equals its integer-cast value.

    The comparison is by value, not by dtype, so a float column that only
    holds whole numbers also counts as an integer column.
    """
    return (series == series.astype(int)).all()


is_integer_column = any(_is_whole_valued(data[col]) for col in data.columns)
results['At Least One Integer Column'] = is_integer_column

### Verify that at least one column contains floats

In [20]:
# Inspect the frame's dtypes directly: the check passes when any column's
# dtype compares equal to Python's `float` (NumPy's float64).
is_float_column = any(column_dtype == float for column_dtype in data.dtypes)
results['At Least One Float Column'] = is_float_column

### Verify that at least one column has a mean close to 2.5

In [21]:
# Scan the columns and stop at the first one whose mean is within the
# np.isclose tolerance of 2.5 (atol=0.1 plus NumPy's default relative term).
mean_close_to_2_5 = False
for col in data.columns:
    if np.isclose(data[col].mean(), 2.5, atol=0.1):
        mean_close_to_2_5 = True
        break
results['Mean Close to 2.5'] = mean_close_to_2_5

### Verify positive correlations (> 0.2)

In [22]:
# Pairwise correlations between all columns; the correlation checks in the
# following cells reuse this matrix.
correlations = data.corr()

# The diagonal holds each column's correlation with itself, which is 1.0 for
# any non-constant column. Leaving it in would make this check pass for every
# dataset, so only off-diagonal entries (pairs of distinct columns) are tested.
off_diagonal = ~np.eye(len(correlations), dtype=bool)
positive_correlation = (correlations.to_numpy()[off_diagonal] > 0.2).any()
results['Positive Correlation close to 0.2 Exist'] = positive_correlation

### Verify negative correlations (< -0.4)

In [23]:
# Passes when any entry of the correlation matrix is below -0.4. The diagonal
# cannot trigger this check because self-correlations are never negative.
negative_correlation = correlations.lt(-0.4).to_numpy().any()
results['Negative Correlation to -0.4 Exist'] = negative_correlation

### Verify correlations close to 0

In [24]:
# Count correlation-matrix entries strictly inside (-0.1, 0.1). The matrix is
# symmetric, so every near-zero pair of columns contributes two entries.
# NOTE(review): the check passes only when that count exceeds the number of
# columns, which is stricter than the "Exists" wording of the result key --
# confirm the threshold is intentional.
near_zero_entries = (correlations.abs() < 0.1).to_numpy().sum()
correlation_close_to_0 = near_zero_entries > len(data.columns)
results['Correlation Close to 0 Exists'] = correlation_close_to_0

# Assert that all checks passed

In [25]:
# Gather every failing check before asserting, so a single run reports all of
# them instead of stopping at the first failure. With exactly one failure the
# message is the same "<name> test failed" text as before.
failed_checks = [key for key, value in results.items() if not value]
assert not failed_checks, "; ".join(f"{key} test failed" for key in failed_checks)