**This cell contains the base code for the Xgboost Regressor model**

In [None]:
%%time

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values (if any)
data.dropna(inplace=True)

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y = data['Decimal_Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost regression model with hyperparameters
model = XGBRegressor(
    objective='reg:squarederror',  # Use reg:squarederror instead of reg:linear
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42
)


model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Round the predicted values to the closest integers
rounded_y_pred = [int(round(val)) for val in y_pred]

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, rounded_y_pred)
print("Accuracy:", accuracy)

**This cell contains the code to generate the synthetic dataset on which the Xgboost Regressor model is tested**

In [None]:
%%time

import csv
import random

def generate_random_values(num_rows):
    rows = []
    for _ in range(num_rows):
        pH = round(random.uniform(5.4, 6.9), 2)
        EC = round(random.uniform(0.3, 1.5), 2)
        W_Temp = round(random.uniform(16, 28), 2)
        A_Temp = round(random.uniform(16, 28), 2)
        Humid = round(random.uniform(50, 95), 1)
        CO2 = random.randint(300, 1100)
        Light_on_off = random.randint(0, 1)

        rows.append([pH, EC, W_Temp, A_Temp, Humid, CO2, Light_on_off])

    return rows

def apply_conditions(rows):
    for row in rows:
        # Additional columns
        W_Temp = row[2]
        A_Temp = row[3]
        Humid = row[4]
        CO2 = row[5]

        row += [1 if W_Temp > 22 else 0,
                1 if A_Temp > 24 else 0,
                1 if CO2 > 1000 else 0,
                1 if Humid > 80 else 0,
                1 if Humid < 60 else 0,
                1 if row[0] > 6.5 else 0,
                1 if row[0] < 5.8 else 0,
                1 if row[1] < 0.6 else 0,
                1 if row[1] > 1.2 else 0]

def convert_to_binary_decimal(rows):
    for row in rows:
        binary = "".join(str(val) for val in row[7:15])  # Exclude Light_on_off column
        decimal_label = int(binary, 2)
        row += [binary, decimal_label]

if __name__ == "__main__":
    num_rows = 50000
    data_rows = generate_random_values(num_rows)
    apply_conditions(data_rows)
    convert_to_binary_decimal(data_rows)

    with open('Sample_Data_50000.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2', 'Light_on_off',
                             'Water_cooler', 'Air_Cooler', 'Air_Vent', 'Dehumidifier',
                             'Humidifier', 'Acid_Dozer', 'Base_Dozer', 'Nutrient_Dozer',
                             'Distilled_Water_Dozer', 'Binary', 'Decimal_Label'])
        csv_writer.writerows(data_rows)

**This cell contains the code which performs Cross-Validation on the Xgboost Regressor model to determine the optimal hyperparameters**

In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values (if any)
data.dropna(inplace=True)

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y = data['Decimal_Label']

# Create the XGBoost regression model
xgb_model = XGBRegressor(random_state=42)

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],      # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],   # Step size shrinkage
    'max_depth': [3, 5, 7],              # Maximum depth of each tree
    'min_child_weight': [1, 3, 5],       # Minimum sum of instance weight needed in a child
    'gamma': [0, 0.1, 0.2],              # Minimum loss reduction required to make a split
    'subsample': [0.8, 1.0],             # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0],      # Subsample ratio of columns when constructing a tree
    'reg_alpha': [0, 0.1, 0.5],          # L1 regularization term on weights
    'reg_lambda': [0, 1, 10]             # L2 regularization term on weights
}

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1,  # Use all available cores for parallel computation
    verbose=2,  # Show progress during fitting
    cv=5        # Number of cross-validation folds
)

# Perform the grid search to find the best hyperparameters
grid_search.fit(X, y)

# Get the best hyperparameters and best accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Hyperparameters:")
print(best_params)
print("Best Accuracy:", best_accuracy)

**This cell contains the code which performs K-Fold Cross-Validation on the Xgboost Regressor model to determine the average accuracy of the model**

In [12]:
%%time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd                # Importing pandas for data manipulation and analysis
from xgboost import XGBRegressor   # Importing XGBRegressor from xgboost library for building the XGBoost regression model
from xgboost import plot_tree
from xgboost import to_graphviz
from sklearn.model_selection import train_test_split, KFold, cross_val_score  # Importing functions for data splitting and cross-validation
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error  # Importing accuracy_score metric for evaluation (not used in this regression scenario)

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')  # Reading the CSV file and storing the data in a pandas DataFrame 'data'

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')  # Converting non-numeric values to NaN (Not-a-Number)

# Drop rows with NaN values (if any)
data.dropna(inplace=True)  # Removing rows containing NaN values from the DataFrame

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]  # Extracting the features (independent variables) from 'data'
y = data['Decimal_Label']  # Extracting the target variable (dependent variable) from 'data'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Splitting the data into training and testing sets using 80% for training and 20% for testing.
# The random_state is set to 42 to ensure reproducibility.

# Train the XGBoost regression model with hyperparameters
model = XGBRegressor(
    objective='reg:squarederror',  # Objective function to use for regression, 'reg:squarederror' minimizes the mean squared error
    n_estimators=1000,  # Number of boosting rounds (trees) to build
    learning_rate=0.1,   # Step size shrinkage used to prevent overfitting
    max_depth=3,         # Maximum depth of a tree. Increasing it may lead to overfitting.
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    gamma=0,             # Minimum loss reduction required to make a further partition on a leaf node
    subsample=1,         # Subsample ratio of the training instance. Lower values prevent overfitting.
    colsample_bytree=1,  # Subsample ratio of columns when constructing each tree. Lower values prevent overfitting.
    reg_alpha=0,         # L1 regularization term on weights
    reg_lambda=1,        # L2 regularization term on weights
    random_state=42      # Seed for random number generator for reproducibility
)

model = model.fit(X_train, y_train)

def k_fold_cross_validation(X, y, model, k=10):
    n_samples = len(X)
    fold_size = n_samples // k
    accuracies = []

    for i in range(k):
        start_idx = i * fold_size
        end_idx = (i + 1) * fold_size if i < k - 1 else n_samples

        X_test = X[start_idx:end_idx]
        y_test = y[start_idx:end_idx]

        X_train = np.concatenate((X[:start_idx], X[end_idx:]), axis=0)
        y_train = np.concatenate((y[:start_idx], y[end_idx:]), axis=0)

        model.fit(X_train, y_train)
        #y_pred = model.predict(X_test)
        y_pred_rounded = model.predict(X_test).round().astype(int)

        accuracy = np.mean(y_pred_rounded == y_test)
        accuracies.append(accuracy)

    avg_accuracy = np.mean(accuracies)
    return avg_accuracy

avg_accuracy = k_fold_cross_validation(X.values, y.values, model)

print("Average Cross-Validation Accuracy: ", avg_accuracy)
# xgb_tree = to_graphviz(mine, num_trees=-1, rankdir='UT')
# xgb_tree.view()

# plt.figure(figsize=(20, 15), dpi=90)
# plot_tree(mine, num_trees=-1, ax=plt.gca())  # You can specify the tree index you want to plot
# plt.show()

Average Cross-Validation Accuracy:  0.9991
CPU times: total: 50.4 s
Wall time: 50.7 s


**This cell contains the code which tests the model trained above with a new dataset and determines the accuracy with which the model can predict the control actions as well as export the Xgboost Regressor model as a .json file**

In [6]:
%%time

# Importing the required libraries
import pandas as pd                # Pandas for data manipulation and analysis
from xgboost import XGBRegressor   # XGBoost regressor for building the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error  # Metrics for evaluation
import random  # For random sampling


# Load the new dataset
new_data = pd.read_csv('Sample_Data_50000.csv')


# Preprocess the new dataset
# Convert non-numeric values to numeric by replacing them with NaN (Not-a-Number)
new_data = new_data.apply(pd.to_numeric, errors='coerce')

# Drop rows with any missing (NaN) values, if any
new_data.dropna(inplace=True)


# Split the new dataset into features (X_new) and target variable (y_new)
# X_new contains the features (independent variables) and y_new contains the target variable (dependent variable)
X_new = new_data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y_new = new_data['Decimal_Label']


# Make predictions on the new dataset using the trained model
predictions = model.predict(X_new)


# Round the predicted values to the nearest integer
y_pred_rounded = [int(round(val)) for val in predictions]



# Initialize a counter to keep track of incorrectly predicted data points
count = 0


# Print the actual and predicted values for the 50 random data points
# print("Random 50 Data Points - Actual vs. Predicted Values:")
for i in range(len(X_new)):
    actual_value = y_new.iloc[i]
    predicted_value = y_pred_rounded[i]
    if actual_value != predicted_value:
        count += 1

        
# Calculate the mean squared error (MSE) on the new dataset
mse = mean_squared_error(y_new, y_pred_rounded)


# Optionally, calculate and print other metrics like R-squared and Mean Absolute Error (MAE)
r2 = r2_score(y_new, y_pred_rounded)
mae = mean_absolute_error(y_new, y_pred_rounded)


# Calculate the percentage of correctly predicted control actions
percentage_correct = 100 - (count / len(X_new)) * 100


# Print the evaluation metrics
print("Mean Squared Error (MSE) on the new dataset:", mse)
print("R-squared on the new dataset:", r2)
print("Mean Absolute Error (MAE) on the new dataset:", mae)
print(f"Percentage of Correctly Predicted Control Actions: {percentage_correct:.2f}%")

# Saves the Xgboost Regressor model as a .json file which can later be imported to run the model independently
model.save_model('xgboost_regressor_model.json')

Mean Squared Error (MSE) on the new dataset: 0.64874
R-squared on the new dataset: 0.9998740299246831
Mean Absolute Error (MAE) on the new dataset: 0.04042
Percentage of Correctly Predicted Control Actions: 98.90%
CPU times: total: 1.78 s
Wall time: 1.79 s


**This cell contains code to run the Xgboost Regressor model from the .json file such that it can be imported to other devices to run the model independently**

In [7]:
import random
import xgboost as xgb
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error, mean_squared_error

# Load the trained XGBoost Regressor model
loaded_model = XGBRegressor()
loaded_model.load_model('xgboost_regressor_model.json')

# Load the new dataset from a CSV file (assuming 'Sample_Data_50000.csv' contains your data)
new_data = pd.read_csv('Sample_Data_50000.csv')

# Preprocess the new dataset by converting non-numeric values to numeric by replacing them with NaN (Not-a-Number)
new_data = new_data.apply(pd.to_numeric, errors='coerce')

# Drop rows with any missing (NaN) values, if any
new_data.dropna(inplace=True)

# Split the new dataset into features (X_new) and target variable (y_new)
# X_new contains the features (independent variables) and y_new contains the target variable (dependent variable)
X_new = new_data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y_new = new_data['Decimal_Label']

# Make predictions on the new dataset using the trained model
predictions = loaded_model.predict(X_new)

# Round the predicted values to the nearest integer
y_pred_rounded = [int(round(val)) for val in predictions]


# Initialize a counter to keep track of incorrectly predicted data points
count = 0

# Print the actual and predicted values for the 50 random data points
# For idx in random_indices:
for i in range(len(X_new)):
    actual_value = y_new.iloc[i]
    predicted_value = y_pred_rounded[i]
    if actual_value != predicted_value:
        count += 1

# Calculate the mean squared error (MSE) on the new dataset
mse = mean_squared_error(y_new, y_pred_rounded)

# Optionally, calculate and print other metrics like R-squared and Mean Absolute Error (MAE)
r2 = r2_score(y_new, y_pred_rounded)
mae = mean_absolute_error(y_new, y_pred_rounded)

# Calculate the percentage of correctly predicted control actions
percentage_correct = 100 - (count / len(X_new)) * 100

# Print the evaluation metrics
print("Mean Squared Error (MSE) on the new dataset:", mse)
print("R-squared on the new dataset:", r2)
print("Mean Absolute Error (MAE) on the new dataset:", mae)
print(f"Percentage of Correctly Predicted Control Actions: {percentage_correct:.2f}%")

Mean Squared Error (MSE) on the new dataset: 0.64874
R-squared on the new dataset: 0.9998740299246831
Mean Absolute Error (MAE) on the new dataset: 0.04042
Percentage of Correctly Predicted Control Actions: 98.90%
