In [6]:
import numpy as np
import pandas as pd

# Generate X and Y values for the first time and y_values will be used in another function
def generate_data(correlation, n_samples=1000, min_value=-10, max_value=10, noise=0.1, z_score=2):
    # Generate random values for x within the specified range
    x_values = np.random.uniform(min_value, max_value, n_samples)
    # Calculate the mean and standard deviation of x
    x_mean = np.mean(x_values)
    x_std = np.std(x_values)
    # Generate random noise
    noise_values = np.random.normal(0, noise, n_samples)
    # Generate random values for y within the specified range
    y_values = np.random.uniform(min_value, max_value, n_samples)
    # Calculate the mean and standard deviation of y
    y_mean = np.mean(y_values)
    y_std = np.std(y_values)
    # Calculate the slope and intercept based on the desired correlation
    slope = correlation * (y_std / x_std)
    intercept = y_mean - slope * x_mean
    # Generate y values based on the linear relationship with x and add noise
    y_values = slope * (x_values - x_mean) / x_std + intercept + noise_values
    # Identify and remove outliers based on the z-score threshold
    z_scores = np.abs((y_values - np.mean(y_values)) / np.std(y_values))
    outliers = z_scores > z_score
    x_values = x_values[~outliers]
    y_values = y_values[~outliers]
    outliers_mask = ~outliers
    return x_values, y_values, outliers_mask

def generate_x_from_y(correlation, y_values, n_samples=1000, min_value=-10, max_value=10, noise=0.1, z_score=2):
    # Generate random values for x within the specified range
    x_values = np.random.uniform(min_value, max_value, n_samples)
    # Calculate the mean and standard deviation of x
    x_mean = np.mean(x_values)
    x_std = np.std(x_values)
    # Generate random noise
    noise_values = np.random.normal(0, noise, len(y_values))
    # Calculate the slope and intercept based on the desired correlation
    slope = correlation * (np.std(y_values) / x_std)
    intercept = np.mean(y_values) - slope * x_mean
    # Generate new x values based on the linear relationship with y and add noise
    new_x_values = (y_values - intercept) / slope * x_std + x_mean + noise_values
    # Identify and remove outliers based on the z-score threshold
    z_scores = np.abs((new_x_values - np.mean(new_x_values)) / np.std(new_x_values))
    outliers = z_scores > z_score
    new_x_values = new_x_values[~outliers]
    y_values = y_values[~outliers]
    return new_x_values, y_values

# Set random seed for reproducibility
np.random.seed(42)

# Generate initial data for dsp and eth
dsp_values, eth_values, _ = generate_data(correlation=0.8, n_samples=1000, min_value=0, max_value=75, noise=5, z_score=3)

# Generate other features based on dsp and eth
humidity_values, _ = generate_x_from_y(correlation=0.6, y_values=eth_values, min_value=0, max_value=100, noise=5, z_score=3)
temperature_values, _ = generate_x_from_y(correlation=-0.7, y_values=eth_values, min_value=0, max_value=55, noise=3, z_score=3)
sun_exposure_values, _ = generate_x_from_y(correlation=-0.5, y_values=eth_values, min_value=12, max_value=19, noise=1, z_score=3)
ph_level_values, _ = generate_x_from_y(correlation=0.4, y_values=eth_values, min_value=0, max_value=14, noise=0.5, z_score=3)
nutrient_values, _ = generate_x_from_y(correlation=-0.3, y_values=eth_values, min_value=0, max_value=4, noise=0.3, z_score=3)
sun_intensity_values, _ = generate_x_from_y(correlation=-0.6, y_values=eth_values, min_value=800, max_value=100000, noise=1000, z_score=3)
wavelength_values, _ = generate_x_from_y(correlation=0.5, y_values=eth_values, min_value=380, max_value=741, noise=10, z_score=3)

# Create a DataFrame with the generated data
data = pd.DataFrame({
    'Day since planted': dsp_values,
    'ETH (days)': eth_values,
    'Humidity (%)': humidity_values,
    'Temperature (°C)': temperature_values,
    'Daily Sun Exposure (hours)': sun_exposure_values,
    'Water pH Level': ph_level_values,
    'Nutrient Level': nutrient_values,
    'Sun Exposure Intensity (Lux)': sun_intensity_values,
    'Visible Light Wavelength (nm)': wavelength_values
})

# Display the first few rows of the generated dataset
data.head()

"""
Humedity become in negitive and it should be from 0 to 1
tempreture should be from 0 to 55
sun exposure should be from 12 to 19
ph level should be from 0 to 14

"""


Unnamed: 0,Day since planted,ETH (days),Humidity (%),Temperature (°C),Daily Sun Exposure (hours),Water pH Level,Nutrient Level,Sun Exposure Intensity (Lux),Visible Light Wavelength (nm)
0,28.090509,8.476996,1484.505148,462.573288,43.797148,37.024527,4.567976,1404275000.0,58821.246431
1,71.303573,2.471601,-176.633658,918.10285,54.187114,-12.757784,9.767417,3020311000.0,33156.606084
2,54.899546,10.457188,2038.05079,316.238128,42.659031,51.562533,2.83176,871412400.0,67321.183499
3,44.899386,11.248158,2249.026506,264.921971,40.232027,59.016508,2.040814,658564100.0,70689.482876
4,11.701398,9.795857,1845.062622,365.154201,43.636557,47.794146,3.211992,1049376000.0,64475.586112
