In [1]:
# Load multiple CSV files from a dataset folder.
# Train three models: Linear Regression, Random Forest, and LSTM.
# Evaluate their performance.
# Plot the actual vs. predicted nivel_rio.
# %pip install pandas numpy scikit-learn matplotlib tensorflow


In [2]:
# Import required libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

2024-11-28 22:32:34.620028: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-28 22:32:35.437573: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-28 22:32:36.178711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732843956.786428    8331 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732843956.953116    8331 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 22:32:38.536031: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [3]:
# Set the number of CSV files to import
NUM_FILES = 5  # Adjust this to the number of files you want to import

# Load all CSV files from the "dataset" folder
def load_data(num_files):
    all_data = []
    for i in range(num_files):
        file_path = f'dataset/data_log_{i}.csv'
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            all_data.append(df)
        else:
            print(f"File not found: {file_path}")
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data

In [4]:
# Load and preprocess data
data = load_data(NUM_FILES)

# Drop rows with missing values
data = data.dropna()

# Extract features (X) and target (y)
X = data.drop(columns=['nivel_rio'])
y = data['nivel_rio']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# --- Linear Regression Model ---
print("Training Linear Regression model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

In [None]:
# Evaluate Linear Regression
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
print(f"Linear Regression - MSE: {lr_mse}, MAE: {lr_mae}")

In [None]:
# --- Random Forest Model ---
print("Training Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

In [None]:
# Evaluate Random Forest
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
print(f"Random Forest - MSE: {rf_mse}, MAE: {rf_mae}")

In [None]:
# --- LSTM Model ---
# Prepare data for LSTM (reshape required for LSTM input)
X_train_lstm = np.expand_dims(X_train.values, axis=1)
X_test_lstm = np.expand_dims(X_test.values, axis=1)

print("Training LSTM model...")
lstm_model = Sequential([
    LSTM(64, activation='relu', input_shape=(1, X_train.shape[1])),
    Dense(1)
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
lstm_model.fit(X_train_lstm, y_train, epochs=20, batch_size=32, verbose=1)

In [None]:
# Make predictions with LSTM
lstm_predictions = lstm_model.predict(X_test_lstm)

# Evaluate LSTM
lstm_mse = mean_squared_error(y_test, lstm_predictions)
lstm_mae = mean_absolute_error(y_test, lstm_predictions)
print(f"LSTM - MSE: {lstm_mse}, MAE: {lstm_mae}")

In [None]:
# --- Plot Predictions ---
plt.figure(figsize=(15, 5))
plt.plot(y_test.values, label='Actual nivel_rio', color='blue')
plt.plot(lr_predictions, label='Linear Regression Predictions', color='green')
plt.plot(rf_predictions, label='Random Forest Predictions', color='orange')
plt.plot(lstm_predictions, label='LSTM Predictions', color='red')
plt.legend()
plt.title('Model Predictions vs Actual nivel_rio')
plt.xlabel('Sample Index')
plt.ylabel('nivel_rio')
plt.show()