### Part 1: Artificial Dataset Generation 


The goal of this first part is to generate a numerical dataset with 300 rows and 6 columns, saved to a file named artificial_dataset.csv.

# Requirements:

- Each column must have a different mean.
- Each column must have a different standard deviation.
- At least one column should contain integers.
- At least one column should contain floats.
- One column must have a mean close to 2.5.
- Some columns must be positively correlated.
- Some columns must be negatively correlated.
- Some columns must have a correlation close to 0.

In [7]:
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
from typing import Tuple

# Seed NumPy's global random number generator so that the np.random.* calls
# made while building the dataset draw the same values on every run.
np.random.seed(57)

# Function to generate artificial dataset
def generate_artificial_dataset(num_rows: int = 300) -> DataFrame:
    """Build the six-column artificial dataset.

    The columns are constructed explicitly so that each stated requirement
    holds by design rather than by chance:

    ========  =====  ===========  ========  ================================
    Column    dtype  target mean  target sd  relationship
    ========  =====  ===========  ========  ================================
    Column_1  float  2.5          0.5       reference signal
    Column_2  float  10           2         ~ +0.8 correlation with Column_1
    Column_3  float  20           3         ~ -0.8 correlation with Column_1
    Column_4  float  40           5         independent (correlation ~ 0)
    Column_5  int    60           ~8        independent (correlation ~ 0)
    Column_6  int    ~49.5        ~29       independent (correlation ~ 0)
    ========  =====  ===========  ========  ================================

    All draws come from NumPy's global random state, so calling
    ``np.random.seed`` beforehand makes the result reproducible.

    Args:
        num_rows: Number of rows to generate. Defaults to 300.

    Returns:
        A DataFrame with ``num_rows`` rows and the six columns
        ``Column_1`` ... ``Column_6``.

    Raises:
        ValueError: If ``num_rows`` is negative.
    """
    if num_rows < 0:
        raise ValueError("num_rows must be non-negative")

    # Standard-normal signal shared by the correlated columns.
    latent = np.random.normal(0.0, 1.0, size=num_rows)

    def _correlated(rho: float) -> np.ndarray:
        """Return a unit-variance series with correlation ~rho to `latent`."""
        noise = np.random.normal(0.0, 1.0, size=num_rows)
        # Mixing weights rho and sqrt(1 - rho^2) keep the variance at 1
        # while fixing the expected correlation with `latent` at rho.
        return rho * latent + np.sqrt(1.0 - rho ** 2) * noise

    # Each column is "mean + sd * unit-variance series", so the requested
    # mean and standard deviation are what the data actually exhibits.
    dataset: DataFrame = pd.DataFrame({
        # Float column whose mean is close to 2.5.
        'Column_1': 2.5 + 0.5 * latent,
        # Positively correlated with Column_1.
        'Column_2': 10.0 + 2.0 * _correlated(0.8),
        # Negatively correlated with Column_1.
        'Column_3': 20.0 + 3.0 * _correlated(-0.8),
        # Independent draws: correlation with the other columns is near 0.
        'Column_4': np.random.normal(40.0, 5.0, size=num_rows),
        # Integer column: rounded after scaling so the dtype stays integral.
        'Column_5': np.rint(
            np.random.normal(60.0, 8.0, size=num_rows)
        ).astype(np.int64),
        # Second integer column, uniform over 0..99.
        'Column_6': np.random.randint(0, 100, size=num_rows).astype(np.int64),
    })

    return dataset

# Build the dataset in memory.
artificial_data: DataFrame = generate_artificial_dataset()

# Persist it as CSV; index=False keeps the row index out of the file.
artificial_data.to_csv(path_or_buf='artificial_dataset.csv', index=False)


Pol-Antoine Loiseau - Florent Rossignol