In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Modified example dataset with NaN values
data = {
    'Fecha': pd.date_range(start='1972-01-01', periods=10),
    'VIRREY': [0.0, 0.0, np.nan, 0.0, 0.0, np.nan, 0.0, np.nan, 0.0, np.nan],
    'MIRAFLORES': [0.0, 0.0, 0.4, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01],
    'PIURA': [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    'MORROPON': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    'HUANCABAMBA': [6.7, 0.1, 2.6, 0.9, 0.01, 0.0, 2.9, 0.1, 0.4, 1.7],
    'CANCHAQUE': [6.8, 1.4, 14.0, 1.4, 1.9, 0.0, 0.5, 0.0, 0.1, 0.8],
    'SANTO DOMINGO': [0.0, 6.4, 18.4, 4.6, 5.6, 0.8, 0.9, 0.7, 2.6, 3.1],
    'HUARMACA': [3.7, 0.01, 4.1, 2.4, 0.01, 4.1, 0.0, 0.01, 3.1, 1.1]
}
df = pd.DataFrame(data)
# Function to perform train-test split and complete missing data using regression
def complete_missing_data(df, target_station):
    # Drop rows with NaN values in the target station
    # Create a copy of the dropped rows
    dropped_df = df[df[target_station].isna()].copy()

    # Drop the rows with NaN values in 'target_station' from the original DataFrame
    remaining_df = df.dropna(subset=[target_station])
    
    #df.dropna(subset=[target_station], inplace=True)

    # Train-test split (70% train, 30% test)
    #train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
    train_df, test_df = train_test_split(remaining_df, test_size=0.3, random_state=42)

    # Initialize regression models
    linear_reg = LinearRegression()
    random_forest_reg = RandomForestRegressor()

    # Define performance metric (RMSE in this example)
    def rmse(y_true, y_pred):
        return mean_squared_error(y_true, y_pred, squared=False)

    # Train models on the training set
    X_train = train_df.drop([target_station, 'Fecha'], axis=1)
    y_train = train_df[target_station]
    linear_reg.fit(X_train, y_train)
    random_forest_reg.fit(X_train, y_train)

    # Evaluate models on the test set
    X_test = test_df.drop([target_station, 'Fecha'], axis=1)
    y_test = test_df[target_station]
    linear_reg_rmse = rmse(y_test, linear_reg.predict(X_test))
    random_forest_rmse = rmse(y_test, random_forest_reg.predict(X_test))

    # Choose the best model based on performance (lower RMSE in this case)
    if linear_reg_rmse < random_forest_rmse:
        selected_model = linear_reg
        selected_model_name = 'Linear Regression'
        selected_model_rmse = linear_reg_rmse
    else:
        selected_model = random_forest_reg
        selected_model_name = 'Random Forest'
        selected_model_rmse = random_forest_rmse

    # Use the selected model to complete missing values in the target station
    for index, row in dropped_df.iterrows():
        if pd.isnull(row[target_station]):
            X_pred = row.drop([target_station, 'Fecha'])
            predicted_value = selected_model.predict([X_pred])[0]
            dropped_df.loc[index, target_station] = predicted_value

    # Merge them back together
    merged_df = pd.concat([dropped_df, remaining_df], ignore_index=True)
    return merged_df, selected_model_name, selected_model_rmse

# Example usage: Complete data for 'VIRREY' station and get the selected model name and RMSE
completed_df, model_name, model_rmse = complete_missing_data(df, 'VIRREY')
print(f"Selected Model: {model_name}")
print(f"RMSE: {model_rmse}")
print(completed_df)

Selected Model: Random Forest
RMSE: 0.0
       Fecha  VIRREY  MIRAFLORES  PIURA  MORROPON  HUANCABAMBA  CANCHAQUE  \
0 1972-01-03     0.0        0.40    0.0       0.0         2.60       14.0   
1 1972-01-06     0.0        0.00    0.0       0.0         0.00        0.0   
2 1972-01-08     0.0        0.00    0.0       0.0         0.10        0.0   
3 1972-01-10     0.0        0.01    0.0       0.0         1.70        0.8   
4 1972-01-01     0.0        0.00    0.0       0.0         6.70        6.8   
5 1972-01-02     0.0        0.00    0.0       0.0         0.10        1.4   
6 1972-01-04     0.0        1.50    2.0       0.0         0.90        1.4   
7 1972-01-05     0.0        0.00    0.0       0.0         0.01        1.9   
8 1972-01-07     0.0        0.00    0.0       0.0         2.90        0.5   
9 1972-01-09     0.0        0.00    0.0       0.0         0.40        0.1   

   SANTO DOMINGO  HUARMACA  
0           18.4      4.10  
1            0.8      4.10  
2            0.7      0.0

