
# Part 1 – Artificial Dataset Generation

**Summary**

This notebook generates a synthetic dataset (300 rows × 6 columns) that satisfies every statistical requirement:

* Unique mean and standard deviation for each column  
* At least one integer and one float column  
* One column with mean close to **2.5**  
* At least one pairwise correlation above 0.2, at least one below −0.4, and at least one near 0  

The notebook **validates** all constraints before exporting `artificial_dataset.csv`.  


In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)  # Seed NumPy's legacy global RNG so the np.random.* draws in later cells are reproducible


In [None]:
n_rows = 300  # sample size shared by every column

# Raw columns. Every statement makes exactly one draw from the global RNG, so
# the order of these lines determines the generated values -- do not reorder.
col_pos_float = np.random.normal(5.0, 1.0, n_rows)                        # float base column, mean ≈ 5
col_mean_2_5 = np.random.normal(2.5, 0.8, n_rows)                         # float column, mean ≈ 2.5
col_integers = np.random.randint(low=1, high=10, size=n_rows)             # integers in 1..9 (high is exclusive)
col_pos_corr = np.random.normal(scale=0.5, size=n_rows) + col_pos_float   # base + jitter  -> positive correlation with base
col_neg_corr = np.random.normal(scale=0.5, size=n_rows) - col_pos_float   # jitter - base  -> negative correlation with base
col_noise = np.random.normal(0.0, 3.0, n_rows)                            # independent wide noise, mean ≈ 0


Unnamed: 0,pos_float,around_2.5,integers,pos_corr,neg_corr,noise
0,5.496714,1.836804,4,5.263989,-5.791096,0.710884
1,4.861736,2.051855,2,5.304994,-4.85709,-1.364461
2,5.647689,3.097835,3,4.945109,-5.578017,-2.407478
3,6.52303,2.988296,3,6.773547,-6.296762,3.896434
4,4.765847,2.483279,4,4.244604,-4.832637,-2.942752


In [None]:
# Pair each output column name with its generated array; the order here is the
# column order of the resulting frame.
data = pd.DataFrame(
    dict(
        zip(
            ['pos_float', 'around_2.5', 'integers', 'pos_corr', 'neg_corr', 'noise'],
            [col_pos_float, col_mean_2_5, col_integers, col_pos_corr, col_neg_corr, col_noise],
        )
    )
)

# Preview the first rows as the cell output.
data.head()

In [None]:
def _has_integer_column(df):
    return any((df[col] == df[col].astype(int)).all() for col in df.columns)

def _has_float_column(df):
    return any(df.dtypes[col].kind == 'f' for col in df.columns)

def validate_dataset(df):
    """Assert that ``df`` meets every statistical requirement of the dataset.

    Checks, in order:

    * column means are pairwise distinct, and so are the standard deviations;
    * at least one integer column and at least one float column exist;
    * at least one column has a mean ≈ 2.5 (``np.isclose`` with ``atol=0.1``);
    * among the correlations between *distinct* columns there is at least one
      above 0.2, at least one below -0.4, and at least one strictly between
      -0.1 and 0.1.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to check.

    Raises
    ------
    AssertionError
        On the first requirement that is not met; the message names it.

    Prints "Validation checks passed." and returns None when everything holds.
    """
    assert df.mean().is_unique, "means are not unique"
    assert df.std().is_unique, "std deviations are not unique"

    assert _has_integer_column(df), "no integer column found"
    assert _has_float_column(df), "no float column found"

    assert any(np.isclose(df[col].mean(), 2.5, atol=0.1) for col in df.columns), "no mean ≈ 2.5"

    corr = df.corr(numeric_only=True)
    # Keep the strict upper triangle only (k=1): every column pair once and
    # never the diagonal. A column's correlation with itself is always 1.0, so
    # including the diagonal would make the "> 0.2" check pass unconditionally.
    pair_corr = corr.to_numpy()[np.triu_indices(len(corr), k=1)]

    assert (pair_corr > 0.2).any(), "no positive correlation > 0.2"
    assert (pair_corr < -0.4).any(), "no negative correlation < -0.4"
    assert ((pair_corr > -0.1) & (pair_corr < 0.1)).any(), "no correlation near 0"

    print("Validation checks passed.")

# Run the checks on the generated frame; a failed requirement raises
# AssertionError here, before the export cell further down is reached.
validate_dataset(data)


Validation checks passed.


In [5]:
# Write the dataset next to the notebook; index=False keeps the row index
# out of the file so the CSV holds only the six data columns.
data.to_csv(path_or_buf='artificial_dataset.csv', index=False)
print("Dataset exported to 'artificial_dataset.csv'")


Dataset exported to 'artificial_dataset.csv'


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) for a
# visual cross-check of the means and standard deviations.
data.describe()

Unnamed: 0,pos_float,around_2.5,integers,pos_corr,neg_corr,noise
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,4.994451,2.482808,4.976667,4.989699,-5.01235,0.049447
std,0.984194,0.769294,2.606165,1.06832,1.063575,2.871818
min,1.758733,0.522684,1.0,1.680367,-9.02924,-8.613848
25%,4.316754,1.936476,3.0,4.244394,-5.685678,-1.87714
50%,5.059219,2.484988,5.0,4.955621,-5.005157,0.106975
75%,5.626658,2.99307,7.0,5.737971,-4.292856,1.90543
max,8.852731,4.963105,9.0,8.63024,-1.954515,7.989547
