In [24]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Initialize variables to store information about the data trained before
trained_data_info = []

# Function to print data trained before
def print_trained_data_info():
    print("Data trained before:")
    for folder, files in trained_data_info:
        print(f"Folder: {folder}, Files: {files}")

# Function to calculate and print evaluation metrics
def print_evaluation_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R-squared: {r_squared:.4f}")

# Load initial dataset
def load_initial_data(data_dir, encoding='iso-8859-1'):
    data = pd.concat([pd.read_csv(os.path.join(data_dir, file), encoding=encoding) for file in os.listdir(data_dir)], ignore_index=True)
    return data

# Handle missing values by imputing or removing rows with missing values
def handle_missing_values(data):
    # Impute missing values for numerical columns with mean
    numerical_cols = data.select_dtypes(include='number').columns
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())
    
    # Impute missing values for categorical columns with mode
    categorical_cols = data.select_dtypes(exclude='number').columns
    mode_values = data[categorical_cols].mode()
    print("Mode Values:")
    print(mode_values)
    if not mode_values.empty:
        data[categorical_cols] = data[categorical_cols].fillna(mode_values.iloc[0])
    else:
        print("No mode values found.")
    
    return data

# Update models with new data and validate
def update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, initial_data_dir, new_data_dir=None):
    if new_data_dir:
        data_dir = new_data_dir
    else:
        data_dir = initial_data_dir
    
    for year_folder in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, year_folder)):
            print(f"\nTraining models for year {year_folder}:")
            trained_data_info.append((year_folder, []))
            year_folder_path = os.path.join(data_dir, year_folder)
            for idx, file_name in enumerate(os.listdir(year_folder_path)):
                if idx % 10 == 0:
                    print_trained_data_info()
                if idx >= 10:
                    break
                file_path = os.path.join(year_folder_path, file_name)
                new_data = pd.read_csv(file_path)
                new_data = handle_missing_values(new_data)
                X_new = new_data[X_train.columns]
                y_new = new_data['precipitationCal']

                for model, model_name in zip(models, model_names):
                    # Update model
                    model.fit(X_train, y_train)
                    print(f"\n{model_name}:")
                    # Evaluate on validation data
                    print_evaluation_metrics(y_test, model.predict(X_test))

                # Append trained file info to trained_data_info
                trained_data_info[-1][1].append(file_name)

# Main function
def main(new_data_dir=None, initial_data_dir=None, batch_size=10):
    if initial_data_dir is None:
        initial_data_dir = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated_CSV'

    # Load initial dataset
    data = load_initial_data(initial_data_dir)

    # Handle missing values in initial dataset
    data = handle_missing_values(data)
    data_with_precipitation = data[data['precipitationCal'].notna()]

    # Split features and target variable
    X_columns = [col for col in data.columns if 'IMG_TIR1' in col or 'IMG_TIR2' in col or 'IMG_WV' in col]
    X = data[X_columns]
    y = data['precipitationCal']

    # Split initial data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

    # Initialize models
    rf_model = RandomForestRegressor()
    xgb_model = XGBRegressor()
    lgbm_model = LGBMRegressor()
    catboost_model = CatBoostRegressor(verbose=0)
    lasso_model = Lasso(alpha=0.1)
    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    rnn_model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])

    models = [rf_model, xgb_model, lgbm_model, catboost_model, lasso_model, cnn_model, rnn_model]
    model_names = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Lasso', 'CNN', 'RNN']

    # Update models with new data and validate
    update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, initial_data_dir, new_data_dir)

if __name__ == "__main__":
    new_data_dir = ''  # Provide the new data directory path here if needed
    initial_data_dir = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated_CSV'  # Define the initial data directory
    main(new_data_dir, initial_data_dir)


Mode Values:
Empty DataFrame
Columns: [Unnamed: 0]
Index: []
No mode values found.


In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize variables to store information about the data trained before
trained_data_info = []

# Function to print data trained before
def print_trained_data_info():
    print("Data trained before:")
    for folder, files in trained_data_info:
        print(f"Folder: {folder}, Files: {files}")

# Function to calculate and print evaluation metrics
def print_evaluation_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R-squared: {r_squared:.4f}")

# Load initial dataset
def load_initial_data(data_dir, encoding='iso-8859-1'):
    data = pd.concat([pd.read_csv(os.path.join(data_dir, file), encoding=encoding) for file in os.listdir(data_dir)], ignore_index=True)
    return data


# Handle missing values by imputing or removing rows with missing values
def handle_missing_values(data):
    # Impute missing values for numerical columns with mean
    numerical_cols = data.select_dtypes(include='number').columns
    imputer_num = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])

    # Scale the numerical columns
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Impute missing values for categorical columns with mode
    categorical_cols = data.select_dtypes(exclude='number').columns
    for col in categorical_cols:
        if data[col].isnull().any():
            imputer_cat = SimpleImputer(strategy='most_frequent')
            data[col] = imputer_cat.fit_transform(data[[col]])

    return data


# Update models with new data and validate
def update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names,initial_data_dir, new_data_dir=None):
    if new_data_dir:
        data_dir = new_data_dir
    else:
        data_dir = initial_data_dir
    
    for year_folder in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, year_folder)):
            print(f"\nTraining models for year {year_folder}:")
            trained_data_info.append((year_folder, []))
            year_folder_path = os.path.join(data_dir, year_folder)
            for idx, file_name in enumerate(os.listdir(year_folder_path)):
                if idx % 10 == 0:
                    print_trained_data_info()
                if idx >= 10:
                    break
                file_path = os.path.join(year_folder_path, file_name)
                new_data = pd.read_csv(file_path)
                new_data = handle_missing_values(new_data)
                X_new = new_data[X_train.columns]
                y_new = new_data['precipitationCal']

                for model, model_name in zip(models, model_names):
                    # Update model
                    model.fit(X_train, y_train)
                    print(f"\n{model_name}:")
                    # Evaluate on validation data
                    print_evaluation_metrics(y_test, model.predict(X_test))

                # Append trained file info to trained_data_info
                trained_data_info[-1][1].append(file_name)

# Main function
def main(new_data_dir=None, batch_size=10):
    # Define directory paths
    initial_data_dir = '/Users/kunalpathak9826/Desktop/ISRO/Data/2018'

    # Load initial dataset
    data = load_initial_data(initial_data_dir)

    # Handle missing values in initial dataset
    data = handle_missing_values(data)
    data_with_precipitation = data[data['precipitationCal'].notna()]

    # Split features and target variable
    X_columns = [col for col in data.columns if 'IMG_TIR1' in col or 'IMG_TIR2' in col or 'IMG_WV' in col]
    X = data[X_columns]
    y = data['precipitationCal']

    # Split initial data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

    # Initialize models
    rf_model = RandomForestRegressor()
    xgb_model = XGBRegressor()
    lgbm_model = LGBMRegressor()
    catboost_model = CatBoostRegressor(verbose=0)
    lasso_model = Lasso(alpha=0.1)
    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    rnn_model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])

    models = [rf_model, xgb_model, lgbm_model, catboost_model, lasso_model, cnn_model, rnn_model]
    model_names = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Lasso', 'CNN', 'RNN']

    # Update models with new data and validate
    update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir)

if __name__ == "__main__":
    new_data_dir = '/Users/kunalpathak9826/Desktop/ISRO/Data/New_Interpolated_CSV'  # Provide the new data directory path here if needed
    main(new_data_dir)


In [11]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize variables to store information about the data trained before
trained_data_info = []

# Function to set the initial data directory
def set_initial_data_dir(initial_data_dir):
    global initial_data_dir_global
    initial_data_dir_global = initial_data_dir

# Function to set the new data directory
def set_new_data_dir(new_data_dir):
    global new_data_dir_global
    new_data_dir_global = new_data_dir

# Function to print data trained before
def print_trained_data_info():
    print("\nData trained before:")
    for folder, files in trained_data_info:
        print(f"Folder: {folder}, Files: {files}")

# Function to calculate and print evaluation metrics
def print_evaluation_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    print(f"\nModel: {model_name}")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R-squared: {r_squared:.4f}")

# Load initial dataset
def load_initial_data(encoding='iso-8859-1'):
    data = pd.concat([pd.read_csv(os.path.join(initial_data_dir_global, file), encoding=encoding) for file in os.listdir(initial_data_dir_global)], ignore_index=True)
    return data

# Load new dataset
def load_new_data(data_dir, encoding='iso-8859-1'):
    data = pd.concat([pd.read_csv(os.path.join(data_dir, file), encoding=encoding) for file in os.listdir(data_dir)], ignore_index=True)
    return data

# Handle missing values by imputing or removing rows with missing values
def handle_missing_values(data):
    # Impute missing values for numerical columns with mean
    numerical_cols = data.select_dtypes(include='number').columns
    imputer_num = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])

    # Scale the numerical columns
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Impute missing values for categorical columns with mode
    categorical_cols = data.select_dtypes(exclude='number').columns
    for col in categorical_cols:
        if data[col].isnull().any():
            imputer_cat = SimpleImputer(strategy='most_frequent')
            data[col] = imputer_cat.fit_transform(data[[col]])

    return data


# Update models with new data and validate
def update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir=None):
    if new_data_dir:
        data_dir = new_data_dir
    else:
        data_dir = initial_data_dir_global
    
    for year_folder in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, year_folder)):
            print(f"\nTraining models for year {year_folder}:")
            trained_data_info.append((year_folder, []))
            year_folder_path = os.path.join(data_dir, year_folder)
            for idx, file_name in enumerate(os.listdir(year_folder_path)):
                if idx % 10 == 0:
                    print_trained_data_info()
                if idx >= 10:
                    break
                file_path = os.path.join(year_folder_path, file_name)
                new_data = pd.read_csv(file_path)
                new_data = handle_missing_values(new_data)
                X_new = new_data[X_train.columns]
                y_new = new_data['precipitationCal']

                for model, model_name in zip(models, model_names):
                    # Update model
                    model.fit(X_train, y_train)
                    print(f"\nTraining {model_name} on file: {file_name}")
                    # Evaluate on validation data
                    y_pred = model.predict(X_test)
                    print_evaluation_metrics(model_name, y_test, y_pred)

                # Append trained file info to trained_data_info
                trained_data_info[-1][1].append(file_name)

# Main function
def main(initial_data_dir=None, new_data_dir=None, batch_size=10):
    if initial_data_dir:
        set_initial_data_dir(initial_data_dir)
    if new_data_dir:
        set_new_data_dir(new_data_dir)

    # Load initial dataset
    data = load_initial_data()

    # Handle missing values in initial dataset
    data = handle_missing_values(data)
    data_with_precipitation = data[data['precipitationCal'].notna()]

    # Split features and target variable
    X_columns = [col for col in data.columns if 'IMG_TIR1' in col or 'IMG_TIR2' in col or 'IMG_WV' in col]
    X = data[X_columns]
    y = data['precipitationCal']

    # Split initial data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

    # Initialize models
    rf_model = RandomForestRegressor()
    xgb_model = XGBRegressor()
    lgbm_model = LGBMRegressor()
    catboost_model = CatBoostRegressor(verbose=0)
    lasso_model = Lasso(alpha=0.1)
    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    rnn_model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])

    models = [rf_model, xgb_model, lgbm_model, catboost_model, lasso_model, cnn_model, rnn_model]
    model_names = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Lasso', 'CNN', 'RNN']

    # Update models with new data and validate
    update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir)

if __name__ == "__main__":
    main(initial_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017', new_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018')


In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize variables to store information about the data trained before
trained_data_info = []

# Function to set the initial data directory
def set_initial_data_dir(initial_data_dir):
    global initial_data_dir_global
    initial_data_dir_global = initial_data_dir

# Function to set the new data directory
def set_new_data_dir(new_data_dir):
    global new_data_dir_global
    new_data_dir_global = new_data_dir

# Function to print data trained before
def print_trained_data_info():
    print("\nData trained before:")
    for folder, files in trained_data_info:
        print(f"Folder: {folder}, Files: {files}")

# Function to calculate and print evaluation metrics
def print_evaluation_metrics(model_name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    print(f"\nModel: {model_name}")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R-squared: {r_squared:.4f}")

# Load initial dataset
def load_initial_data(encoding='iso-8859-1'):
    print("Loading initial data...")
    data = pd.concat([pd.read_csv(os.path.join(initial_data_dir_global, file), encoding=encoding) for file in os.listdir(initial_data_dir_global)], ignore_index=True)
    return data

# Load new dataset
def load_new_data(data_dir, encoding='iso-8859-1'):
    print("Loading new data...")
    data = pd.concat([pd.read_csv(os.path.join(data_dir, file), encoding=encoding) for file in os.listdir(data_dir)], ignore_index=True)
    return data

# Handle missing values by imputing or removing rows with missing values
def handle_missing_values(data):
    print("Handling missing values...")
    # Impute missing values for numerical columns with mean
    numerical_cols = data.select_dtypes(include='number').columns
    imputer_num = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])

    # Scale the numerical columns
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Impute missing values for categorical columns with mode
    categorical_cols = data.select_dtypes(exclude='number').columns
    for col in categorical_cols:
        if data[col].isnull().any():
            imputer_cat = SimpleImputer(strategy='most_frequent')
            data[col] = imputer_cat.fit_transform(data[[col]])

    return data


# Update models with new data and validate
def update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir=None):
    if new_data_dir:
        data_dir = new_data_dir
    else:
        data_dir = initial_data_dir_global
    
    print("Updating models with new data and validating...")
    for year_folder in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, year_folder)):
            print(f"\nTraining models for year {year_folder}:")
            trained_data_info.append((year_folder, []))
            year_folder_path = os.path.join(data_dir, year_folder)
            for idx, file_name in enumerate(os.listdir(year_folder_path)):
                if idx % 10 == 0:
                    print_trained_data_info()
                if idx >= 10:
                    break
                print(f"\nTraining on file: {file_name}")
                file_path = os.path.join(year_folder_path, file_name)
                new_data = pd.read_csv(file_path)
                new_data = handle_missing_values(new_data)
                X_new = new_data[X_train.columns]
                y_new = new_data['precipitationCal']

                for model, model_name in zip(models, model_names):
                    # Update model
                    model.fit(X_new, y_new)
                    print(f"\nTraining {model_name} on file: {file_name}")
                    # Evaluate on validation data
                    y_pred = model.predict(X_new)
                    print_evaluation_metrics(model_name, y_new, y_pred)

                # Append trained file info to trained_data_info
                trained_data_info[-1][1].append(file_name)

# Main function
def main(initial_data_dir=None, new_data_dir=None, batch_size=10):
    if initial_data_dir:
        set_initial_data_dir(initial_data_dir)
    if new_data_dir:
        set_new_data_dir(new_data_dir)

    # Load initial dataset
    print("Starting main function...")
    data = load_initial_data()

    # Handle missing values in initial dataset
    data = handle_missing_values(data)
    data_with_precipitation = data[data['precipitationCal'].notna()]

    # Split features and target variable
    X_columns = [col for col in data.columns if 'IMG_TIR1' in col or 'IMG_TIR2' in col or 'IMG_WV' in col]
    X = data[X_columns]
    y = data['precipitationCal']

    # Split initial data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

    # Initialize models
    rf_model = RandomForestRegressor()
    xgb_model = XGBRegressor()
    lgbm_model = LGBMRegressor()
    catboost_model = CatBoostRegressor(verbose=0)
    lasso_model = Lasso(alpha=0.1)
    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    rnn_model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])

    models = [rf_model, xgb_model, lgbm_model, catboost_model, lasso_model, cnn_model, rnn_model]
    model_names = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Lasso', 'CNN', 'RNN']

     # Update models with new data and validate
    update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir)

if __name__ == "__main__":
    main(initial_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017', new_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018')


Starting main function...
Loading initial data...
Handling missing values...
Updating models with new data and validating...


In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize variables to store information about the data trained before
trained_data_info = []
initial_data_dir_global = None 
new_data_dir_global = None  

# Function to set the initial data directory
def set_initial_data_dir(initial_data_dir):
    global initial_data_dir_global
    initial_data_dir_global = initial_data_dir

# Function to set the new data directory
def set_new_data_dir(new_data_dir):
    global new_data_dir_global
    new_data_dir_global = new_data_dir

# Function to print data trained before
def print_trained_data_info():
    print("\nData trained before:")
    for folder, files in trained_data_info:
        print(f"Folder: {folder}, Files: {files}")

# Function to calculate and print evaluation metrics
def print_evaluation_metrics(model_name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r_squared = r2_score(y_true, y_pred)
    print(f"\nModel: {model_name}")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R-squared: {r_squared:.4f}")

# Load initial dataset
def load_initial_data(encoding='iso-8859-1'):
    print("Loading initial data...")
    data = pd.concat([pd.read_csv(os.path.join(initial_data_dir_global, file), encoding=encoding) for file in os.listdir(initial_data_dir_global)], ignore_index=True)
    return data

# Load new dataset
def load_new_data(data_dir, encoding='iso-8859-1'):
    print("Loading new data...")
    data = pd.concat([pd.read_csv(os.path.join(data_dir, file), encoding=encoding) for file in os.listdir(data_dir)], ignore_index=True)
    return data

# Handle missing values by imputing or removing rows with missing values
def handle_missing_values(data):
    print("Handling missing values...")
    # Impute missing values for numerical columns with mean
    numerical_cols = data.select_dtypes(include='number').columns
    imputer_num = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])

    # Scale the numerical columns
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Impute missing values for categorical columns with mode
    categorical_cols = data.select_dtypes(exclude='number').columns
    for col in categorical_cols:
        if data[col].isnull().any():
            imputer_cat = SimpleImputer(strategy='most_frequent')
            data[col] = imputer_cat.fit_transform(data[[col]])

    return data


def update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir=None):
    if new_data_dir:
        data_dir = new_data_dir
    else:
        data_dir = initial_data_dir_global
    
    print("Updating models with new data and validating...")
    print("Data directory:", data_dir)  # Print data directory
    
    # Combine initial and new data
    combined_X = pd.concat([X_train, X_test], axis=0)
    combined_y = pd.concat([y_train, y_test], axis=0)
    
    for year_folder in os.listdir(data_dir):
        if os.path.isdir(os.path.join(data_dir, year_folder)):
            print(f"\nTraining models for year {year_folder}:")
            trained_data_info.append((year_folder, []))
            year_folder_path = os.path.join(data_dir, year_folder)
            for idx, file_name in enumerate(os.listdir(year_folder_path)):
                print(f"\nTraining on file: {file_name}")
                file_path = os.path.join(year_folder_path, file_name)
                new_data = pd.read_csv(file_path)
                new_data = handle_missing_values(new_data)
                X_new = new_data[X_train.columns]
                y_new = new_data['precipitationCal']

                print(f"New data shape: {X_new.shape}, {y_new.shape}")  # Print new data shape
                
                # Combine new data with existing data
                combined_X = pd.concat([combined_X, X_new], axis=0)
                combined_y = pd.concat([combined_y, y_new], axis=0)
                
                for model, model_name in zip(models, model_names):
                    # Update model
                    model.fit(X_train, y_train)  # Fit on combined data
                    print(f"\nTraining {model_name} on file: {file_name}")
                    # Evaluate on new data
                    y_pred = model.predict(X_test)
                    print_evaluation_metrics(model_name, y_test, y_pred)

                # Append trained file info to trained_data_info
                trained_data_info[-1][1].append(file_name)



# Main function
def main(initial_data_dir=None, new_data_dir=None, batch_size=10):
    if initial_data_dir:
        set_initial_data_dir(initial_data_dir)
    if new_data_dir:
        set_new_data_dir(new_data_dir)

    # Load initial dataset
    print("Starting main function...")
    data = load_initial_data()

    # Handle missing values in initial dataset
    data = handle_missing_values(data)
    data_with_precipitation = data[data['precipitationCal'].notna()]

    # Split features and target variable
    X_columns = [col for col in data.columns if 'IMG_TIR1' in col or 'IMG_TIR2' in col or 'IMG_WV' in col]
    X = data[X_columns]
    y = data['precipitationCal']

    # Split initial data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)
    print("X_train size:", X_train.size)
    print("X_test size:", X_test.size)
    print("y_train size:", y_train.size)
    print("y_test size:", y_test.size)


    # Initialize models
    rf_model = RandomForestRegressor()
    xgb_model = XGBRegressor()
    lgbm_model = LGBMRegressor()
    catboost_model = CatBoostRegressor(verbose=0)
    lasso_model = Lasso(alpha=0.1)
    cnn_model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    rnn_model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])

    models = [rf_model, xgb_model, lgbm_model, catboost_model, lasso_model, cnn_model, rnn_model]
    model_names = ['Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Lasso', 'CNN', 'RNN']

     # Update models with new data and validate
    update_models_with_new_data_and_validate(X_train, X_test, y_train, y_test, models, model_names, new_data_dir)

if __name__ == "__main__":
    main(initial_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017')#, new_data_dir='/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018')


Starting main function...
Loading initial data...
Handling missing values...
X_train size: 248850000
X_test size: 106650000
y_train size: 175000
y_test size: 75000
Updating models with new data and validating...
Data directory: /Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
import numpy as np

X = np.random.rand(100, 1)
y = 2 * X.squeeze() + np.random.randn(100)
print("X:", X)
print("y:", y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)
print("X_train size:", X_train.size)
print("X_test size:", X_test.size)
print("y_train size:", y_train.size)
print("y_test size:", y_test.size)

model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
print("model:", model)

# Fitting the model on the entire training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Final MSE: {mse:.4f}")


X: [[8.38181607e-01]
 [7.40191107e-01]
 [8.49364521e-01]
 [3.85454260e-01]
 [6.87884540e-01]
 [8.47988012e-01]
 [6.99003804e-01]
 [3.86299011e-01]
 [7.78082562e-01]
 [9.04354156e-01]
 [6.44373166e-02]
 [9.35996291e-01]
 [6.60155290e-01]
 [4.02882508e-02]
 [5.56640809e-01]
 [9.17078665e-01]
 [1.35881883e-01]
 [4.17283779e-01]
 [2.95392848e-01]
 [1.37612806e-01]
 [2.03337222e-01]
 [4.55249434e-01]
 [9.33910632e-01]
 [9.05238748e-01]
 [8.50795709e-01]
 [4.13012769e-01]
 [8.06816930e-01]
 [6.78973614e-01]
 [2.69273694e-01]
 [4.77465219e-01]
 [5.55743229e-03]
 [3.82213505e-01]
 [5.42730111e-01]
 [4.59294803e-01]
 [1.72751282e-01]
 [3.95672077e-01]
 [7.38890718e-01]
 [4.09550056e-01]
 [8.72827098e-01]
 [2.47000752e-02]
 [1.12642301e-01]
 [9.53502331e-02]
 [2.39771563e-01]
 [1.33606646e-01]
 [7.51006622e-01]
 [3.59075844e-01]
 [5.14342489e-01]
 [9.59657162e-01]
 [7.63467728e-04]
 [3.03920458e-01]
 [1.85762779e-01]
 [5.25003325e-01]
 [5.90588745e-01]
 [4.61664815e-01]
 [4.19498181e-01]
 [2.973

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Loop through each CSV file
for csv_file in csv_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X = df.drop(columns=['precipitationCal'])  # Adjust 'precipitationCal' to your target column name
    y = df['precipitationCal']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=42)

    # Create and train your model
    model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE for {csv_file}: {mse:.4f}")


MSE for interpolated_insat_on_imerg_20170109.csv: 253984880455632230893499550091182080.0000
MSE for interpolated_insat_on_imerg_20170108.csv: 327540727142447713759345235420250112.0000
MSE for interpolated_insat_on_imerg_20170101.csv: 263161542804419823748713252160274432.0000
MSE for interpolated_insat_on_imerg_20170103.csv: 190461397001886902001456359235649536.0000
MSE for interpolated_insat_on_imerg_20170102.csv: 189969750731698592919319278086258688.0000
MSE for interpolated_insat_on_imerg_20170106.csv: 127546023269829882025434495013879808.0000
MSE for interpolated_insat_on_imerg_20170107.csv: 1013536208962113048387980810518528.0000
MSE for interpolated_insat_on_imerg_20170105.csv: 82501416541071600488745311443877888.0000
MSE for interpolated_insat_on_imerg_20170104.csv: 166396601923241961520893336936775680.0000
MSE for interpolated_insat_on_imerg_20170110.csv: 246280042441083642643090214743965696.0000


In [5]:
import os
import pandas as pd

# Specify the directory where your CSV files are located
directory = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# Get a list of all CSV files in the directory
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

# Iterate through each CSV file and count the columns
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)
    num_columns = len(df.columns)
    print(f"File: {file}, Number of columns: {num_columns}")


File: interpolated_insat_on_imerg_20170109.csv, Number of columns: 144
File: interpolated_insat_on_imerg_20170108.csv, Number of columns: 147
File: interpolated_insat_on_imerg_20170101.csv, Number of columns: 147
File: interpolated_insat_on_imerg_20170103.csv, Number of columns: 147
File: interpolated_insat_on_imerg_20170102.csv, Number of columns: 138
File: interpolated_insat_on_imerg_20170106.csv, Number of columns: 144
File: interpolated_insat_on_imerg_20170107.csv, Number of columns: 147
File: interpolated_insat_on_imerg_20170105.csv, Number of columns: 147
File: interpolated_insat_on_imerg_20170104.csv, Number of columns: 144
File: interpolated_insat_on_imerg_20170110.csv, Number of columns: 147


In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

total_mse = 0  # Variable to accumulate MSE for all CSV files

# Loop through each CSV file in the training set
for csv_file in train_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X = df.drop(columns=['precipitationCal'])  # Adjust 'precipitationCal' to your target column name
    y = df['precipitationCal']

    # Create and train your model
    model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
    model.fit(X, y)

    # Loop through each CSV file in the testing set
for csv_file in test_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X_test = df.drop(columns=['precipitationCal'])  # Adjust 'precipitationCal' to your target column name
    y_test = df['precipitationCal']

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the mean squared error for the current CSV file
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE for {csv_file}: {mse:.4f}")

    # Accumulate MSE for all CSV files
    total_mse += mse

# Calculate average MSE
average_mse = total_mse / len(test_files)
print(f"Average MSE for all CSV files in the testing set: {average_mse:.4f}")


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- 1015 IMG_TIR1
- 1015 IMG_TIR2
- 1015 IMG_WV


In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

total_mse = 0  # Variable to accumulate MSE for all CSV files

# Initialize sets to store all feature column names
all_feature_columns = set()

# Loop through each CSV file in the training set to collect feature column names
for csv_file in train_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    
    # Extract all feature columns
    feature_columns = set(df.columns) - {'precipitationCal'}  # Exclude target column
    all_feature_columns.update(feature_columns)

# Convert the set of all feature column names to a list
all_feature_columns = list(all_feature_columns)

# Loop through each CSV file in the training set again to train the model
for csv_file in train_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X = df[all_feature_columns]  # Select all feature columns
    y = df['precipitationCal']

    # Create and train your model
    model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
    model.fit(X, y)

# Loop through each CSV file in the testing set
for csv_file in test_files:
    # Load the data from CSV file
    df = pd.read_csv(os.path.join(folder_path, csv_file))

    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X_test = df[all_feature_columns]  # Select all feature columns
    y_test = df['precipitationCal']

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the mean squared error for the current CSV file
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE for {csv_file}: {mse:.4f}")

    # Accumulate MSE for all CSV files
    total_mse += mse

# Calculate average MSE
average_mse = total_mse / len(test_files)
print(f"Average MSE for all CSV files in the testing set: {average_mse:.4f}")


ValueError: could not convert string to float: '2017-01-01'

In [13]:
import pandas as pd
import os

# Define the folder path and CSV file name
folder_path = "/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/"
#csv_file = "your_data.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(os.path.join(folder_path, csv_file))

# Print the column names in the DataFrame
print("Column Names:", df.columns)

# Double-check that all feature columns are present in the DataFrame
all_feature_columns = ['column1', 'column2', 'column3']  # Replace with your actual feature column names

missing_columns = [col for col in all_feature_columns if col not in df.columns]
if missing_columns:
    print("Missing Columns:", missing_columns)
    # Handle missing columns as needed
else:
    print("All feature columns present.")

# Now you can proceed with selecting features and target
X = df[all_feature_columns]
y = df['precipitationCal']

# Proceed with model training


Column Names: Index(['longitude', 'latitude', '20170106 0015 IMG_TIR1',
       '20170106 0015 IMG_TIR2', '20170106 0015 IMG_WV',
       '20170106 0045 IMG_TIR1', '20170106 0045 IMG_TIR2',
       '20170106 0045 IMG_WV', '20170106 0115 IMG_TIR1',
       '20170106 0115 IMG_TIR2',
       ...
       '20170106 2245 IMG_TIR1', '20170106 2245 IMG_TIR2',
       '20170106 2245 IMG_WV', '20170106 2315 IMG_TIR1',
       '20170106 2315 IMG_TIR2', '20170106 2315 IMG_WV',
       '20170106 2345 IMG_TIR1', '20170106 2345 IMG_TIR2',
       '20170106 2345 IMG_WV', 'precipitationCal'],
      dtype='object', length=144)
Missing Columns: ['column1', 'column2', 'column3']


KeyError: "None of [Index(['column1', 'column2', 'column3'], dtype='object')] are in the [columns]"

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
import re

# Define the folder path
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Define a function to extract the common part of column names using regular expressions
def extract_variable_name(column_name):
    match = re.match(r'\d+\s+\d+\s+IMG_(.*)', column_name)
    if match:
        return match.group(1)
    else:
        return None

# Create a mapping of old column names to new standardized names
rename_mapping = {}

# Loop through each CSV file to extract variable names and create the mapping
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    variable_names = df.columns.map(extract_variable_name).unique()
    for variable_name in variable_names:
        if variable_name not in rename_mapping.values():
            standard_name = f"IMG_{variable_name}"
            rename_mapping[variable_name] = standard_name

total_mse = 0  # Variable to accumulate MSE for all CSV files

# Training set and Testing set
split_index = int(len(csv_files) * 0.7)
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Loop through each CSV file in the training set
for csv_file in train_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    df.rename(columns=rename_mapping, inplace=True)
    # Assuming your data is formatted appropriately, extract features (X) and target (y)
    X = df.drop(columns=['precipitationCal'])  # Adjust 'precipitationCal' to your target column name
    y = df['precipitationCal']

    # Create and train your model
    model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
    model.fit(X, y)

    # Loop through each CSV file in the testing set
    for csv_file_test in test_files:
        df_test = pd.read_csv(os.path.join(folder_path, csv_file_test))
        df_test.rename(columns=rename_mapping, inplace=True)
        # Assuming your data is formatted appropriately, extract features (X) and target (y)
        X_test = df_test.drop(columns=['precipitationCal'])  # Adjust 'precipitationCal' to your target column name
        y_test = df_test['precipitationCal']

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Calculate the mean squared error for the current CSV file
        mse = mean_squared_error(y_test, y_pred)
        print(f"MSE for {csv_file_test}: {mse:.4f}")

        # Accumulate MSE for all CSV files
        total_mse += mse

# Calculate average MSE
average_mse = total_mse / (len(test_files) * len(train_files))
print(f"Average MSE for all CSV files in the testing set: {average_mse:.4f}")
