In [None]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from joblib import load, dump

def calculate_common_uncommon_columns(existing_model_columns, new_data_columns):
    """Split the new data's columns by whether the existing model has them.

    Returns a pair ``(common, uncommon)``: ``common`` holds the columns
    present in both collections, ``uncommon`` holds the columns that appear
    only in ``new_data_columns``.
    """
    only_in_new_data = new_data_columns - existing_model_columns
    shared = existing_model_columns.intersection(new_data_columns)
    return shared, only_in_new_data

# Load existing model.
# joblib.load unpickles the file, which can execute arbitrary code; only load
# model files that come from a trusted source.
existing_model_filename = '/Users/kunalpathak9826/CatBoost_model.pkl'
model = load(existing_model_filename)

# Folder containing the new CSV files. Pressing Enter at the prompt selects
# the default folder below.
default_folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017'
folder_path = (
    input(f"Folder with new CSV files [{default_folder_path}]: ").strip()
    or default_folder_path
)

# List all CSV files in the folder (sorted so every run processes the files
# in the same order).
csv_files_folder = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
if not csv_files_folder:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Feature names the model was trained on, in training order.
# scikit-learn estimators expose `feature_names_in_`; CatBoost models expose
# `feature_names_`, so fall back to that when the first one is absent.
model_feature_names = getattr(model, 'feature_names_in_', None)
if model_feature_names is None:
    model_feature_names = model.feature_names_
model_feature_names = list(model_feature_names)
existing_model_columns = set(model_feature_names)

# Name of the target variable expected in every CSV file
target_column = 'precipitationCal'

# Per-file frames restricted to the model's features plus the target
frames = []

# Iterate through files in folder path
for csv_file in csv_files_folder:
    # Load data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Load columns of new data
    new_data_columns = set(df.columns)

    # Calculate common and uncommon columns
    common_columns, uncommon_columns = calculate_common_uncommon_columns(
        existing_model_columns, new_data_columns
    )

    # Report model features this file does not provide; after concatenation
    # they show up as NaN (or are absent altogether if no file has them).
    missing_features = [c for c in model_feature_names if c not in common_columns]
    if missing_features:
        print(f"Warning: {csv_file} is missing model features: {missing_features}")

    # Keep only common columns and the target variable. The columns follow the
    # model's feature order because iterating a set yields an arbitrary order.
    ordered_common_columns = [c for c in model_feature_names if c in common_columns]
    df_common = df[ordered_common_columns + [target_column]]

    frames.append(df_common)

# Concatenate all data from folder
concatenated_data = pd.concat(frames, ignore_index=True)

# Extract features (X) and target (y) from concatenated data for combined set.
# Re-apply the model's feature order: concatenating frames with differing
# columns can change the column order.
feature_columns = [c for c in model_feature_names if c in concatenated_data.columns]
X_combined = concatenated_data[feature_columns]
y_combined = concatenated_data[target_column]

# Make predictions on the combined set using the existing model
y_pred_combined = model.predict(X_combined)

# Calculate the mean squared error for the combined set
mse_combined = mean_squared_error(y_combined, y_pred_combined)
print(f"MSE on Combined Data: {mse_combined:.4f}")

# Save the model back to the file it was loaded from.
# NOTE(review): the model is only evaluated above, never refitted, so this
# rewrites the same model object; confirm whether a training step is missing
# or whether this save should be dropped.
dump(model, existing_model_filename)
print(f"Updated model saved as {existing_model_filename}")
