**Cell containing the main code for the Xgboost Regressor model**

In [7]:
%%time

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values (if any)
data.dropna(inplace=True)

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y = data['Decimal_Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost regression model with hyperparameters
model = XGBRegressor(
    objective='reg:squarederror',  # Use reg:squarederror instead of reg:linear
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42
)


model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Round the predicted values to the closest integers
rounded_y_pred = [int(round(val)) for val in y_pred]

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, rounded_y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.987
CPU times: total: 2.2 s
Wall time: 2.21 s


**This cell contains the code to generate the synthetic dataset on which the Xgboost Regressor model is tested**

In [11]:
%%time

import csv
import random

def generate_random_values(num_rows):
    rows = []
    for _ in range(num_rows):
        pH = round(random.uniform(5.4, 6.9), 2)
        EC = round(random.uniform(0.3, 1.5), 2)
        W_Temp = round(random.uniform(16, 28), 2)
        A_Temp = round(random.uniform(16, 28), 2)
        Humid = round(random.uniform(50, 95), 1)
        CO2 = random.randint(300, 1100)
        Light_on_off = random.randint(0, 1)

        rows.append([pH, EC, W_Temp, A_Temp, Humid, CO2, Light_on_off])

    return rows

def apply_conditions(rows):
    for row in rows:
        # Additional columns
        W_Temp = row[2]
        A_Temp = row[3]
        Humid = row[4]
        CO2 = row[5]

        row += [1 if W_Temp > 22 else 0,
                1 if A_Temp > 24 else 0,
                1 if CO2 > 1000 else 0,
                1 if Humid > 80 else 0,
                1 if Humid < 60 else 0,
                1 if row[0] > 6.5 else 0,
                1 if row[0] < 5.8 else 0,
                1 if row[1] < 0.6 else 0,
                1 if row[1] > 1.2 else 0]

def convert_to_binary_decimal(rows):
    for row in rows:
        binary = "".join(str(val) for val in row[7:15])  # Exclude Light_on_off column
        decimal_label = int(binary, 2)
        row += [binary, decimal_label]

if __name__ == "__main__":
    num_rows = 50000
    data_rows = generate_random_values(num_rows)
    apply_conditions(data_rows)
    convert_to_binary_decimal(data_rows)

    with open('Sample_Data_50000.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2', 'Light_on_off',
                             'Water_cooler', 'Air_Cooler', 'Air_Vent', 'Dehumidifier',
                             'Humidifier', 'Acid_Dozer', 'Base_Dozer', 'Nutrient_Dozer',
                             'Distilled_Water_Dozer', 'Binary', 'Decimal_Label'])
        csv_writer.writerows(data_rows)

CPU times: total: 922 ms
Wall time: 939 ms


**This cell contains the code which performs Cross-Validation on the Xgboost Regressor model to determine the optimal hyperparameters**

In [16]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values (if any)
data.dropna(inplace=True)

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y = data['Decimal_Label']

# Create the XGBoost regression model
xgb_model = XGBRegressor(random_state=42)

# Define the hyperparameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],      # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],   # Step size shrinkage
    'max_depth': [3, 5, 7],              # Maximum depth of each tree
    'min_child_weight': [1, 3, 5],       # Minimum sum of instance weight needed in a child
    'gamma': [0, 0.1, 0.2],              # Minimum loss reduction required to make a split
    'subsample': [0.8, 1.0],             # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0],      # Subsample ratio of columns when constructing a tree
    'reg_alpha': [0, 0.1, 0.5],          # L1 regularization term on weights
    'reg_lambda': [0, 1, 10]             # L2 regularization term on weights
}

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1,  # Use all available cores for parallel computation
    verbose=2,  # Show progress during fitting
    cv=5        # Number of cross-validation folds
)

# Perform the grid search to find the best hyperparameters
grid_search.fit(X, y)

# Get the best hyperparameters and best accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Hyperparameters:")
print(best_params)
print("Best Accuracy:", best_accuracy)

Fitting 5 folds for each of 8748 candidates, totalling 43740 fits


KeyboardInterrupt: 

**This cell contains the code which performs K-Fold Cross-Validation on the Xgboost Regressor model to determine the average accuracy of the model**

In [10]:
%%time

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score

# Load CSV data
data = pd.read_csv('Sample_Data_5000.csv')

# Convert non-numeric values to numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values (if any)
data.dropna(inplace=True)

# Split the data into features (X) and target variable (y)
X = data[['pH', 'EC', 'W_Temp', 'A_Temp', 'Humid', 'CO2']]
y = data['Decimal_Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost regression model with hyperparameters
model = XGBRegressor(
    objective='reg:squarederror',  # Use reg:squarederror instead of reg:linear
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42
)

k_folds = KFold(n_splits = 10)

scores = cross_val_score(model, X, y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.99999819 0.99999643 0.99998805 0.99999791 0.99999816 0.99999826
 0.99767785 0.99879116 0.99945554 0.99972819]
Average CV Score:  0.9995629741430427
Number of CV Scores used in Average:  10
CPU times: total: 22.2 s
Wall time: 22.3 s
