In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


In [15]:
# Load the aquaculture dataset from the working directory and preview the
# first rows as the cell's output.
csv_path = "aquaculture.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,id,temperatura,oxigeno,ph,turbidez,hour,day,month,temperatura_scaled,oxigeno_scaled
0,49,1.00175,0.089269,0.937644,-0.008761,19,17,1,27.041814,7.968205
1,55,1.044218,0.621644,0.936319,-0.043164,19,24,1,27.330717,8.190253
2,88,0.981081,0.234554,0.997134,0.027833,20,25,1,26.901205,8.028802
3,106,0.990495,-0.71527,0.946473,0.017791,20,27,1,26.965247,7.632639
4,113,1.03142,-0.611011,0.90677,0.006865,20,5,1,27.243652,7.676124


In [16]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

id                    0
temperatura           0
oxigeno               0
ph                    0
turbidez              0
hour                  0
day                   0
month                 0
temperatura_scaled    0
oxigeno_scaled        0
dtype: int64

In [17]:
# STEP 3: Define Features and Target
# `target` is the column to predict; `features` are the predictor columns.
target = 'turbidez'
features = ['temperatura', 'oxigeno', 'ph', 'hour', 'day', 'month']

# X is the predictor frame, y the response series, both taken from df.
X, y = df[features], df[target]


In [18]:
# STEP 4: Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [19]:
# STEP 5: Feature Scaling
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)


In [20]:
# STEP 6: Model Training
# Three regressors are fit on the same training split so their test metrics
# can be compared directly in the evaluation step.

# Linear Regression -- fit on the standardized features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Random Forest -- fit on the unscaled features.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# XGBoost baseline -- fit on the unscaled features.
# XGBRegressor is imported in the notebook's import cell so this cell works on
# a fresh kernel. random_state is pinned to match the other seeded estimators
# in this notebook.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
# STEP 7: Model Evaluation
def evaluate_model(y_true, y_pred, name):
    """Print the R² score and RMSE of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        name: Label printed in the report header.

    Returns:
        None. The report is written to stdout, followed by a 30-dash rule.
    """
    print(f"{name} Results:")

    r2 = r2_score(y_true, y_pred)
    print(f"R² Score: {r2:.3f}")

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse:.4f}")

    print("-" * 30)

# Predict and evaluate
# Each entry pairs a report label with a fitted model and the test matrix in
# the representation that model was trained on (scaled for the linear model,
# unscaled for the tree-based models).
baseline_runs = [
    ("Linear Regression", lr, X_test_scaled),
    ("Random Forest", rf, X_test),
    ("XGBoost", xgb, X_test),
]
for label, model, X_eval in baseline_runs:
    evaluate_model(y_test, model.predict(X_eval), label)


Linear Regression Results:
R² Score: 0.375
RMSE: 0.1678
------------------------------
Random Forest Results:
R² Score: 0.797
RMSE: 0.0956
------------------------------
XGBoost Results:
R² Score: 0.810
RMSE: 0.0924
------------------------------


In [22]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Basic setup
# A separate name is used for the untuned estimator so the fitted baseline
# bound to `xgb` in the training cell is not replaced by an unfitted model;
# otherwise re-running the evaluation cell after this one would call
# predict() on an estimator that was never fit.
xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define parameter grid (2 * 3 * 3 * 2 * 2 = 72 candidates)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid search with 5-fold cross-validation.
# The scorer is the *negated* RMSE, so higher is better for the search and the
# sign is flipped back when the best score is reported below.
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit on training data only; the test split is not touched during tuning.
grid_search.fit(X_train, y_train)

# Best parameters and score
print("✅ Best Parameters:", grid_search.best_params_)
print("🔢 Best CV RMSE:", -grid_search.best_score_)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
✅ Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
🔢 Best CV RMSE: 0.09069893975540391


In [23]:
# Take the estimator that the grid search selected.
best_xgb = grid_search.best_estimator_

# Score it on the held-out test split, using the same report format as the
# baseline models.
y_pred_best = best_xgb.predict(X_test)
evaluate_model(y_true=y_test, y_pred=y_pred_best, name="XGBoost (Tuned)")


XGBoost (Tuned) Results:
R² Score: 0.814
RMSE: 0.0914
------------------------------


In [24]:
import joblib

# Persist the tuned XGBoost model and the fitted scaler, in that order.
# Note: the grid search was fit on the unscaled X_train, so the scaler is only
# needed for models trained on the scaled features (e.g. the linear model).
artifacts = {
    "best_xgb_turbidez_model.pkl": best_xgb,
    "scaler_turbidez.pkl": scaler,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("✅ Model and scaler saved successfully.")


✅ Model and scaler saved successfully.
