In [2]:
# 5_Conclusion_and_Report.ipynb

import joblib
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# --- Artefacts -------------------------------------------------------------
# The split file unpacks into four objects in train/test, features/target order.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files that
# were produced by this project's own notebooks.
X_train, X_test, y_train, y_test = joblib.load('data/split_data.pkl')

# The model path is hard-coded to the XGBoost artefact.
# NOTE(review): presumably the previous notebook selected XGBoost as the best
# model — confirm, since nothing here checks that choice.
best_model = joblib.load('models/xgboost_model.pkl')

# --- Test-set metrics ------------------------------------------------------
y_pred = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred)
r2_best = r2_score(y_test, y_pred)

# --- Report ----------------------------------------------------------------
print("### Conclusion and Insights")
# The trailing empty item plus print's own newline yields the blank line that
# separates the metrics from the commentary below.
print(
    "Final Model Performance on Test Set:",
    f"Mean Squared Error = {mse_best:.2f}, R² Score = {r2_best:.2f}",
    "",
    sep="\n",
)
# NOTE(review): this sentence is printed regardless of the metric values above.
print(
    "These metrics suggest that the model is performing well in predicting the maximum temperature. "
    "Consider the following insights and potential improvements."
)

# Static discussion of limitations and next steps; the leading and trailing
# empty items reproduce the blank lines around the two sections.
print("\n".join([
    "",
    "### Limitations:",
    "- The model’s performance depends heavily on the quality and quantity of the data used for training.",
    "- Model interpretability might vary, especially with more complex models like XGBoost or Gradient Boosting.",
    "",
    "### Areas for Improvement:",
    "- Explore additional data features or perform feature engineering.",
    "- Conduct more comprehensive hyperparameter tuning (e.g., using RandomizedSearchCV or Bayesian Optimization).",
    "",
]))

# --- Persist ---------------------------------------------------------------
# Re-save the loaded model under a stable "final" filename for later use.
joblib.dump(best_model, 'models/final_best_model.pkl')

# Example of using the best model for prediction
def predict_max_temp(input_data, model=None, preprocessor=None):
    """Predict the maximum temperature for raw, un-preprocessed input rows.

    The input is passed through the fitted preprocessor's ``transform`` and the
    result is handed to the model's ``predict``.

    Parameters
    ----------
    input_data
        Rows to score, in whatever form the preprocessor's ``transform``
        accepts (presumably a DataFrame with the raw feature columns used
        during training — confirm against the preprocessing notebook).
    model : optional
        Object exposing ``predict``. Defaults to the notebook-level
        ``best_model`` so existing one-argument calls behave as before.
    preprocessor : optional
        Object exposing ``transform``. When omitted it is loaded from
        ``data/preprocessor.pkl`` on every call; pass an already-loaded
        instance to avoid the repeated disk read when scoring many inputs.

    Returns
    -------
    Whatever ``model.predict`` returns for the transformed input (one
    prediction per input row for the usual scikit-learn-style estimators).
    """
    if preprocessor is None:
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # artefacts produced by this project.
        preprocessor = joblib.load('data/preprocessor.pkl')
    if model is None:
        # Fall back to the model loaded earlier in the notebook.
        model = best_model

    input_preprocessed = preprocessor.transform(input_data)
    return model.predict(input_preprocessed)

# Example usage: score a single hand-written observation.
# Each key becomes a column of a one-row DataFrame; wrapping every value in a
# one-element list gives the constructor the same dict-of-lists as before.
# NOTE(review): the output recorded for this sample (25.77°C) is below the
# supplied 'min' value of 30 — presumably 'min' is the minimum temperature, so
# confirm that these sample values are realistic for the training data.
input_data = pd.DataFrame({
    column: [value]
    for column, value in {
        'province': 'Hanoi',
        'wind_d': 'NNE',
        'min': 30,
        'wind': 14,
        'rain': 0.0,
        'humidi': 100,
        'cloud': 80,
        'pressure': 900,
    }.items()
})

predicted_temp = predict_max_temp(input_data)
# Only one row was submitted, so report the first (and only) prediction.
print(f"Predicted Maximum Temperature: {predicted_temp[0]:.2f}°C")


### Conclusion and Insights
Final Model Performance on Test Set:
Mean Squared Error = 1.54, R² Score = 0.93

These metrics suggest that the model is performing well in predicting the maximum temperature. Consider the following insights and potential improvements.

### Limitations:
- The model’s performance depends heavily on the quality and quantity of the data used for training.
- Model interpretability might vary, especially with more complex models like XGBoost or Gradient Boosting.

### Areas for Improvement:
- Explore additional data features or perform feature engineering.
- Conduct more comprehensive hyperparameter tuning (e.g., using RandomizedSearchCV or Bayesian Optimization).

Predicted Maximum Temperature: 25.77°C
