In [60]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the source measurements into a DataFrame
file_path = 'Complete.csv'
df = pd.read_csv(file_path)

# Target columns whose gaps will be imputed (per the original note, picked
# for their correlation with Phytoplankton (cells/ml))
columns_to_fill = [
    'pH (units)',
    'Ammonia (mg/L)',
    'Nitrate (mg/L)',
    'Inorganic Phosphate (mg/L)',
    'BOD (mg/l)',
    'Dissolved Oxygen (mg/l)',
]

# Feature columns handed to the model as inputs
predictors = [
    'Phytoplankton (cells/ml)',
    'Temperature',
    'Humidity',
    'Wind Speed',
]

def random_forest_imputation(df, target_column, predictors):
    """Fill missing values of ``target_column`` in place with a Random Forest.

    A ``RandomForestRegressor`` is fitted on the rows where ``target_column``
    is present, using ``predictors`` as features, and its predictions are
    written into the rows where ``target_column`` is missing.

    Args:
        df: DataFrame to modify in place.
        target_column: Name of the column whose missing values are filled.
        predictors: List of column names used as model features.

    Returns:
        None. ``df`` is left untouched when the column has no missing values
        or when it has no observed values to train on.
    """
    # Compute the mask once so training rows and filled rows cannot drift apart
    missing_mask = df[target_column].isnull()

    if not missing_mask.any():  # Nothing to impute
        return

    if missing_mask.all():
        # No observed targets means nothing to fit on; skip rather than let
        # the regressor fail on an empty training set (same policy as the
        # grouped variant, which skips groups without complete data).
        return

    complete_data = df[~missing_mask]

    # Fixed seed keeps the imputed values reproducible between runs
    rf = RandomForestRegressor(n_estimators=100, random_state=0)
    rf.fit(complete_data[predictors], complete_data[target_column])

    # Write predictions back into exactly the rows that were missing
    df.loc[missing_mask, target_column] = rf.predict(df.loc[missing_mask, predictors])

# Impute each target column in turn with the shared predictor set
for column in columns_to_fill:
    random_forest_imputation(df, column, predictors)

# Report how many missing values remain in each target column
remaining_missing = df[columns_to_fill].isnull().sum()
print(remaining_missing)

# Write the imputed table to a new CSV, leaving out the index
df.to_csv('Complete_RF_Imputed.csv', index=False)


pH (units)                    0
Ammonia (mg/L)                0
Nitrate (mg/L)                0
Inorganic Phosphate (mg/L)    0
BOD (mg/l)                    0
Dissolved Oxygen (mg/l)       0
dtype: int64


In [58]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the source measurements into a DataFrame
file_path = 'Complete.csv'
df = pd.read_csv(file_path)

# Target columns whose gaps will be imputed (per the original note, picked
# for their correlation with Phytoplankton (cells/ml))
columns_to_fill = [
    'pH (units)',
    'Ammonia (mg/L)',
    'Nitrate (mg/L)',
    'Inorganic Phosphate (mg/L)',
    'BOD (mg/l)',
    'Dissolved Oxygen (mg/l)',
]

# Feature columns handed to the model as inputs
predictors = [
    'Phytoplankton (cells/ml)',
    'Temperature',
    'Humidity',
    'Wind Speed',
]

# Columns that define the groups; one model is fitted per group
group_columns = ['Month', 'Year']

def random_forest_imputation_grouped(df, target_column, predictors, group_columns):
    """Impute ``target_column`` in place, fitting one Random Forest per group.

    Rows are grouped by ``group_columns``. Within each group a
    ``RandomForestRegressor`` is trained on the rows where ``target_column``
    is present and used to predict the rows where it is missing. A group is
    skipped (its missing values stay missing) when it has nothing to fill or
    nothing to train on.

    Args:
        df: DataFrame to modify in place.
        target_column: Name of the column whose missing values are filled.
        predictors: List of column names used as model features.
        group_columns: Column name(s) passed to ``df.groupby``.

    Returns:
        None.
    """
    for _, subset in df.groupby(group_columns):
        is_missing = subset[target_column].isnull()
        rows_to_fill = subset[is_missing]
        training_rows = subset[~is_missing]

        # Guard clause: need both something to learn from and something to fill
        if training_rows.empty or rows_to_fill.empty:
            continue

        # Fresh model per group, seeded for reproducible output
        model = RandomForestRegressor(n_estimators=100, random_state=0)
        model.fit(training_rows[predictors], training_rows[target_column])

        # Predictions are written back via the original row labels
        predicted = model.predict(rows_to_fill[predictors])
        df.loc[rows_to_fill.index, target_column] = predicted

# Impute each target column in turn, one model per Month/Year group
for column in columns_to_fill:
    random_forest_imputation_grouped(df, column, predictors, group_columns)

# Report how many missing values remain in each target column
remaining_missing = df[columns_to_fill].isnull().sum()
print(remaining_missing)

# Write the updated table to a new CSV, leaving out the index
df.to_csv('Complete_RF_Imputed_Grouped.csv', index=False)


pH (units)                    222
Ammonia (mg/L)                411
Nitrate (mg/L)                438
Inorganic Phosphate (mg/L)    381
BOD (mg/l)                    192
Dissolved Oxygen (mg/l)        60
dtype: int64
