In [28]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Initialize an empty DataFrame to store concatenated data
concatenated_data = pd.DataFrame(columns=list(common_columns) + ['precipitationCal'])

# Loop through each CSV file in the folder
for csv_file in csv_files:
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

# Extract features (X) and target (y) from concatenated data
X = concatenated_data.drop(columns=['precipitationCal'])
y = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y, pd.DataFrame):
    y = y.values.reshape(-1)            #y.iloc[:, 0]
    #y = y.iloc[:, 0]
    # Remove duplicate samples from y
    y = y[:len(X)]
    
print("Length of X:", len(X))
print("Length of y:", len(y))
    
# Ensure X and y have the same number of samples
if len(X) != len(y):
    raise ValueError("Number of samples in X and y are not consistent")


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train your model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")


Length of X: 250000
Length of y: 250000
Mean Squared Error on Test Set: 106400162754806805597051091170274285402041772367308390400.0000


In [10]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# Initialize common columns with columns from the first CSV file
first_csv_file = os.listdir(folder_path)[0]
common_columns = set(pd.read_csv(os.path.join(folder_path, os.listdir(folder_path)[0])).columns)

# Initialize the model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(os.listdir(folder_path)):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file), encoding = 'latin1')
    df = df[list(common_columns) + ['precipitationCal']]

    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']
    
    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)            #y.iloc[:, 0]
        #y = y.iloc[:, 0]
        # Remove duplicate samples from y
        y = y[:len(X)]

    # Fit the model on the new data
    model.partial_fit(X, y)

    # Print old and new file names
    if idx > 0:
        old_file = os.listdir(folder_path)[idx - 1]
        print(f"Old File Name: {old_file}")
    new_file = csv_file
    print(f"New File Name: {new_file}")

    # Make predictions on the entire dataset
    y_pred_all = model.predict(X)
    mse_all = mean_squared_error(y, y_pred_all)
    print(f"Mean Squared Error on All Data (including new data): {mse_all:.4f}")

    # Make predictions on the new data only
    y_pred_new = model.predict(X[-1:])
    mse_new = mean_squared_error(y[-1:], y_pred_new)
    print(f"Mean Squared Error on New Data: {mse_new:.4f}")

    # Make predictions on the data model has in memory
    y_pred_memory = model.predict(X[:-1])
    mse_memory = mean_squared_error(y[:-1], y_pred_memory)
    print(f"Mean Squared Error on Data in Model's Memory: {mse_memory:.4f}")

    print("\n")

# Final prediction after processing all data
y_pred = model.predict(X)

# Calculate the mean squared error for the entire dataset
mse = mean_squared_error(y, y_pred)
print(f"Mean Squared Error on Entire Dataset: {mse:.4f}")


New File Name: interpolated_insat_on_imerg_20170109.csv
Mean Squared Error on All Data (including new data): 472991466205821518038888200472377189177829655620179984384.0000
Mean Squared Error on New Data: 472991466217742921480935521986214316448650752965707038720.0000
Mean Squared Error on Data in Model's Memory: 472991466205821082477458541671143956058332142956869320704.0000


Old File Name: interpolated_insat_on_imerg_20170109.csv
New File Name: interpolated_insat_on_imerg_20170108.csv
Mean Squared Error on All Data (including new data): 461475094201108891080363107272529068348001309391119712256.0000
Mean Squared Error on New Data: 461475094216427238112319419604351294579923667896512806912.0000
Mean Squared Error on Data in Model's Memory: 461475094201108281294361584950802541980704791662484783104.0000




KeyError: "None of [Index(['0615 IMG_TIR2', '1045 IMG_WV', '1415 IMG_TIR1', '1215 IMG_TIR1',\n       '1615 IMG_TIR2', '1315 IMG_TIR1', '0945 IMG_TIR2', '1145 IMG_TIR1',\n       '1645 IMG_TIR2', '0715 IMG_WV',\n       ...\n       '1745 IMG_TIR1', '0315 IMG_TIR2', '0645 IMG_WV', '0045 IMG_TIR2',\n       '1015 IMG_TIR1', '1615 IMG_WV', '0315 IMG_WV', '1215 IMG_TIR2',\n       '0945 IMG_TIR1', 'precipitationCal'],\n      dtype='object', length=146)] are in the [columns]"

In [13]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Print the list of files in the training and testing sets
print("Files in Training Set:")
print(train_files)
print("\nFiles in Testing Set:")
print(test_files)

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty set to store all unique columns
all_columns = set()

# Loop through each CSV file in the folder to find all unique columns
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    all_columns.update(df.columns)

# Find the uncommon variables
uncommon_variables = all_columns - common_columns

# Print the uncommon variables
print("\nUncommon Variables:")
print(uncommon_variables)

# Initialize an empty DataFrame to store concatenated data
concatenated_data = pd.DataFrame(columns=list(common_columns) + ['precipitationCal'])

# Loop through each CSV file in the folder
for csv_file in csv_files:
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

# Extract features (X) and target (y) from concatenated data
X = concatenated_data.drop(columns=['precipitationCal'])
y = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y, pd.DataFrame):
    y = y.values.reshape(-1)            #y.iloc[:, 0]
    #y = y.iloc[:, 0]
    # Remove duplicate samples from y
    y = y[:len(X)]
    
print("Length of X:", len(X))
print("Length of y:", len(y))
    
# Ensure X and y have the same number of samples
if len(X) != len(y):
    raise ValueError("Number of samples in X and y are not consistent")


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train your model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")


Files in Training Set:
['interpolated_insat_on_imerg_20170103.csv', 'interpolated_insat_on_imerg_20170109.csv', 'interpolated_insat_on_imerg_20170104.csv', 'interpolated_insat_on_imerg_20170101.csv', 'interpolated_insat_on_imerg_20170102.csv', 'interpolated_insat_on_imerg_20170106.csv', 'interpolated_insat_on_imerg_20170107.csv']

Files in Testing Set:
['interpolated_insat_on_imerg_20170105.csv', 'interpolated_insat_on_imerg_20170110.csv', 'interpolated_insat_on_imerg_20170108.csv']

Common Variables:
{'1045 IMG_WV', '1415 IMG_TIR1', '1215 IMG_TIR1', '1315 IMG_TIR1', '0945 IMG_TIR2', '1145 IMG_TIR1', '1645 IMG_TIR2', '0715 IMG_WV', '2245 IMG_TIR2', '1815 IMG_WV', '1715 IMG_WV', '1315 IMG_TIR2', '1815 IMG_TIR1', '2145 IMG_WV', '0345 IMG_WV', '1115 IMG_TIR1', '2315 IMG_TIR1', '1145 IMG_WV', '2145 IMG_TIR1', '2015 IMG_TIR1', '0745 IMG_TIR1', '0045 IMG_WV', '0545 IMG_WV', '1245 IMG_TIR1', '0145 IMG_TIR2', '0715 IMG_TIR1', '1345 IMG_TIR2', 'longitude', '0445 IMG_TIR2', '0845 IMG_TIR1', '154

In [33]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty set to store all unique columns
all_columns = set()

# Loop through each CSV file in the folder to find all unique columns
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    all_columns.update(df.columns)

# Find the uncommon variables
uncommon_variables = all_columns - common_columns

# Print the uncommon variables
print("\nUncommon Variables:")
print(uncommon_variables)


# Initialize the model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(csv_files):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    if idx == 0:
        concatenated_data = df
    else:
        concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']

    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)
        y = y[:len(X)]
        #y = y.iloc[:, 0]

    # Fit the model on the new data
    model.partial_fit(X, y)
    


# Extract features (X) and target (y) from concatenated data for test set
X_test = concatenated_data.drop(columns=['precipitationCal'])
y_test = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values.reshape(-1)
    y_test = y_test[:len(X_test)]

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")



Common Variables:
{'1045 IMG_WV', '1415 IMG_TIR1', '1215 IMG_TIR1', '1315 IMG_TIR1', '0945 IMG_TIR2', '1145 IMG_TIR1', '1645 IMG_TIR2', '0715 IMG_WV', '2245 IMG_TIR2', '1815 IMG_WV', '1715 IMG_WV', '1315 IMG_TIR2', '1815 IMG_TIR1', '2145 IMG_WV', '0345 IMG_WV', '1115 IMG_TIR1', '2315 IMG_TIR1', '1145 IMG_WV', '2145 IMG_TIR1', '2015 IMG_TIR1', '0745 IMG_TIR1', '0045 IMG_WV', '0545 IMG_WV', '1245 IMG_TIR1', '0145 IMG_TIR2', '0715 IMG_TIR1', '1345 IMG_TIR2', 'longitude', '0445 IMG_TIR2', '0845 IMG_TIR1', '1545 IMG_WV', '1515 IMG_TIR2', '1215 IMG_WV', '0115 IMG_TIR1', '0815 IMG_TIR2', '1915 IMG_WV', '0245 IMG_TIR2', '2145 IMG_TIR2', '1045 IMG_TIR2', '1045 IMG_TIR1', '1445 IMG_TIR2', '2215 IMG_TIR2', '0115 IMG_TIR2', '1245 IMG_TIR2', '1845 IMG_WV', '1745 IMG_TIR2', '0245 IMG_WV', 'Date', '0345 IMG_TIR2', '1445 IMG_WV', '0645 IMG_TIR1', '0815 IMG_TIR1', '1415 IMG_TIR2', '1115 IMG_TIR2', '0515 IMG_TIR2', '1445 IMG_TIR1', '1245 IMG_WV', 'precipitationCal', '1545 IMG_TIR1', '2215 IMG_WV', '034

In [39]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from joblib import dump

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty set to store all unique columns
all_columns = set()

# Loop through each CSV file in the folder to find all unique columns
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    all_columns.update(df.columns)

# Find the uncommon variables
uncommon_variables = all_columns - common_columns

# Print the uncommon variables
print("\nUncommon Variables:")
print(uncommon_variables)

# Initialize an empty list to store the files in memory
files_in_memory = []

# Initialize the model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(csv_files):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    if idx == 0:
        concatenated_data = df
    else:
        concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']

    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)
        y = y[:len(X)]
        #y = y.iloc[:, 0]

    # Fit the model on the new data
    model.partial_fit(X, y)
    
    # Print old and new files
    print(f"Old files: {files_in_memory}")
    print(f"New file: {csv_file}")
    
    # Update files in memory
    files_in_memory.append(csv_file)

# Print all files in memory
print(f"All files in memory: {files_in_memory}")

# Extract features (X) and target (y) from concatenated data for test set
X_test = concatenated_data.drop(columns=['precipitationCal'])
y_test = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values.reshape(-1)
    y_test = y_test[:len(X_test)]

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")

# Save the trained model as a .pkl file
model_filename = 'incremental_model.pkl'
dump(model, model_filename)
print(f"Model saved as {model_filename}")



Common Variables:
{'0615 IMG_TIR2', '1045 IMG_WV', '1415 IMG_TIR1', '1615 IMG_TIR2', '1315 IMG_TIR1', '0945 IMG_TIR2', '1145 IMG_TIR1', '1645 IMG_TIR2', '0715 IMG_WV', '2245 IMG_TIR2', '1815 IMG_WV', '1715 IMG_WV', '1315 IMG_TIR2', '1815 IMG_TIR1', '2145 IMG_WV', '0345 IMG_WV', '1115 IMG_TIR1', '2315 IMG_TIR1', '1145 IMG_WV', '2145 IMG_TIR1', '2015 IMG_TIR1', '0745 IMG_TIR1', '0045 IMG_WV', '0545 IMG_WV', '1245 IMG_TIR1', '0145 IMG_TIR2', '0715 IMG_TIR1', '1345 IMG_TIR2', 'longitude', '0615 IMG_TIR1', '0445 IMG_TIR2', '1545 IMG_WV', '1515 IMG_TIR2', '0115 IMG_TIR1', '1915 IMG_WV', '0245 IMG_TIR2', '2145 IMG_TIR2', '1045 IMG_TIR2', '1045 IMG_TIR1', '1445 IMG_TIR2', '2215 IMG_TIR2', '0115 IMG_TIR2', '1245 IMG_TIR2', '1845 IMG_WV', '1745 IMG_TIR2', 'Date', '0245 IMG_WV', '0345 IMG_TIR2', '1445 IMG_WV', '0645 IMG_TIR1', '1415 IMG_TIR2', '1115 IMG_TIR2', '1445 IMG_TIR1', '1245 IMG_WV', 'precipitationCal', '1545 IMG_TIR1', '2215 IMG_WV', '0345 IMG_TIR1', 'latitude', '1945 IMG_TIR1', '0545 I

In [7]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from joblib import dump, load

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty list to store the files in memory
files_in_memory = []

# Initialize the model
model = SGDRegressor(learning_rate='constant', eta0=0.01, random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(csv_files):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']

    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)
        y = y[:len(X)]
        #y = y.iloc[:, 0]

    # Fit the model on the new data
    model.partial_fit(X, y)
    
    # Print old and new files
    print(f"Old files: {files_in_memory}")
    print(f"New file: {csv_file}")
    
    # Update files in memory
    files_in_memory.append(csv_file)

# Print all files in memory
print(f"All files in memory: {files_in_memory}")

# Save the trained model as a .pkl file
model_filename = 'incremental_model.pkl'
dump(model, model_filename)
print(f"Model saved as {model_filename}")

# Now, let's update the model with new data from a folder
new_data_folder = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2017/'

# Load the saved model
loaded_model = load(model_filename)

# Iterate through each CSV file in the folder
for file_name in os.listdir(new_data_folder):
    if file_name.endswith('.csv'):
        # Load the CSV file
        file_path = os.path.join(new_data_folder, file_name)
        new_data = pd.read_csv(file_path)
        
        # Preprocess the new data (keep only common columns)
        new_data_common = new_data[list(common_columns)]
        
        # Extract features (X) and target (y) from the new data
        X_new = new_data_common.drop(columns=['precipitationCal'])
        y_new = new_data_common['precipitationCal']

        # Ensure y contains only one column
        if isinstance(y_new, pd.DataFrame):
            y_new = y_new.values.reshape(-1)
            y_new = y_new[:len(X_new)]

        # Update the model with the new data
        loaded_model.partial_fit(X_new, y_new)

# Save the updated model
updated_model_filename = 'updated_incremental_model.pkl'
dump(loaded_model, updated_model_filename)
print(f"Updated model saved as {updated_model_filename}")



Common Variables:
{'1715 IMG_TIR2', '1645 IMG_TIR1', '0715 IMG_TIR2', '2015 IMG_TIR1', '0215 IMG_TIR1', '2215 IMG_TIR1', '0015 IMG_TIR2', '0015 IMG_WV', '1015 IMG_TIR2', '2145 IMG_WV', '0615 IMG_WV', '0115 IMG_TIR1', '1045 IMG_TIR1', '1145 IMG_WV', '1345 IMG_TIR1', '1715 IMG_TIR1', '1115 IMG_TIR2', '0115 IMG_WV', '0645 IMG_TIR1', '1145 IMG_TIR1', '0615 IMG_TIR2', '1315 IMG_TIR1', '1415 IMG_TIR2', '1845 IMG_WV', '1545 IMG_TIR2', '1915 IMG_WV', '2115 IMG_TIR2', '0145 IMG_TIR2', '0315 IMG_WV', '2315 IMG_TIR2', '1815 IMG_TIR2', '1245 IMG_TIR2', '0145 IMG_TIR1', '0615 IMG_TIR1', '1515 IMG_TIR1', '0745 IMG_TIR1', '2045 IMG_TIR1', '1745 IMG_WV', '0445 IMG_WV', 'longitude', '0245 IMG_TIR2', 'precipitationCal', '1315 IMG_WV', '2215 IMG_TIR2', '0545 IMG_TIR1', '1915 IMG_TIR1', '1345 IMG_WV', '2245 IMG_TIR2', '0715 IMG_TIR1', '1815 IMG_TIR1', '1045 IMG_TIR2', '0145 IMG_WV', '1115 IMG_TIR1', '1615 IMG_WV', '2215 IMG_WV', '1245 IMG_WV', '2245 IMG_TIR1', '0245 IMG_WV', '1445 IMG_TIR1', '0315 IMG_TI

KeyError: "['0215 IMG_TIR1', '0615 IMG_WV', '0615 IMG_TIR2', '0615 IMG_TIR1', '1615 IMG_WV', '1615 IMG_TIR2', '1615 IMG_TIR1', '0215 IMG_WV', '0215 IMG_TIR2'] not in index"

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty set to store all unique columns
all_columns = set()

# Loop through each CSV file in the folder to find all unique columns
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    all_columns.update(df.columns)

# Find the uncommon variables
uncommon_variables = all_columns - common_columns

# Print the uncommon variables
print("\nUncommon Variables:")
print(uncommon_variables)

# Initialize an empty list to store the files in memory
files_in_memory = []

# Initialize the model (Random Forest Regressor)
model = RandomForestRegressor(random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(csv_files):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    if idx == 0:
        concatenated_data = df
    else:
        concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']

    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)
        y = y[:len(X)]
        #y = y.iloc[:, 0]

    # Fit the model on the new data
    model.fit(X, y)
    
    # Print old and new files
    print(f"Old files: {files_in_memory}")
    print(f"New file: {csv_file}")
    
    # Update files in memory
    files_in_memory.append(csv_file)

# Print all files in memory
print(f"All files in memory: {files_in_memory}")

# Extract features (X) and target (y) from concatenated data for test set
X_test = concatenated_data.drop(columns=['precipitationCal'])
y_test = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values.reshape(-1)
    y_test = y_test[:len(X_test)]

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")

# Save the trained model as a .pkl file
model_filename = 'random_forest_model.pkl'
dump(model, model_filename)
print(f"Model saved as {model_filename}")



Common Variables:
{'1915 IMG_TIR2', '0215 IMG_TIR2', '1245 IMG_TIR2', '0215 IMG_WV', '0645 IMG_TIR1', '1015 IMG_TIR1', '1545 IMG_TIR1', '1445 IMG_WV', '1515 IMG_TIR1', '0445 IMG_TIR1', '0245 IMG_TIR1', '1845 IMG_TIR1', '1945 IMG_TIR2', '1915 IMG_WV', '1815 IMG_TIR2', '1915 IMG_TIR1', '0245 IMG_TIR2', '1315 IMG_TIR1', '2115 IMG_TIR2', '0015 IMG_WV', '0445 IMG_TIR2', '0115 IMG_WV', '1415 IMG_TIR1', '0245 IMG_WV', '0715 IMG_WV', '1345 IMG_WV', '0345 IMG_WV', '2115 IMG_WV', '0215 IMG_TIR1', '2045 IMG_TIR1', '2015 IMG_TIR1', '0715 IMG_TIR2', '1315 IMG_TIR2', '0545 IMG_WV', '1415 IMG_WV', '2315 IMG_TIR2', '1745 IMG_TIR2', '1015 IMG_TIR2', '1615 IMG_WV', '0645 IMG_TIR2', '1345 IMG_TIR2', '1815 IMG_TIR1', '0045 IMG_TIR2', '0015 IMG_TIR1', 'latitude', '1145 IMG_TIR2', '1315 IMG_WV', '2245 IMG_TIR1', '2245 IMG_TIR2', '2015 IMG_WV', '2215 IMG_TIR1', '0445 IMG_WV', 'Date', '0145 IMG_TIR2', '1645 IMG_WV', '1415 IMG_TIR2', '1615 IMG_TIR2', '0345 IMG_TIR2', '1245 IMG_TIR1', '1645 IMG_TIR2', '0645 IM

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from joblib import dump

# Directory containing your CSV files
folder_path = '/Users/kunalpathak9826/Desktop/ISRO/Data/Interpolated CSV/2018/'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Shuffle the list of CSV files
import random
random.shuffle(csv_files)

# Calculate the index to split files into training and testing sets
split_index = int(len(csv_files) * 0.7)

# Training set and Testing set
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]

# Initialize common columns with columns from the first CSV file
common_columns = set(pd.read_csv(os.path.join(folder_path, csv_files[0])).columns)

# Loop through each CSV file in the folder to find common columns
for csv_file in csv_files[1:]:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    common_columns = common_columns.intersection(df.columns)

# Print the common variables found in all files
print("\nCommon Variables:")
print(common_columns)

# Initialize an empty set to store all unique columns
all_columns = set()

# Loop through each CSV file in the folder to find all unique columns
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    all_columns.update(df.columns)

# Find the uncommon variables
uncommon_variables = all_columns - common_columns

# Print the uncommon variables
print("\nUncommon Variables:")
print(uncommon_variables)

# Initialize an empty list to store the files in memory
files_in_memory = []

# Initialize the model
model = XGBRegressor(random_state=42)

# Loop through each CSV file in the folder
for idx, csv_file in enumerate(csv_files):
    # Load the data from CSV file, keeping only the common columns
    df = pd.read_csv(os.path.join(folder_path, csv_file))[list(common_columns) + ['precipitationCal']]
    
    # Concatenate the data to the DataFrame
    if idx == 0:
        concatenated_data = df
    else:
        concatenated_data = pd.concat([concatenated_data, df], ignore_index=True)

    # Extract features (X) and target (y) from the data
    X = df.drop(columns=['precipitationCal'])
    y = df['precipitationCal']

    # Ensure y contains only one column
    if isinstance(y, pd.DataFrame):
        y = y.values.reshape(-1)
        y = y[:len(X)]
        #y = y.iloc[:, 0]

    # Fit the model on the new data
    model.fit(X, y)
    
    # Print old and new files
    print(f"Old files: {files_in_memory}")
    print(f"New file: {csv_file}")
    
    # Update files in memory
    files_in_memory.append(csv_file)

# Print all files in memory
print(f"All files in memory: {files_in_memory}")

# Extract features (X) and target (y) from concatenated data for test set
X_test = concatenated_data.drop(columns=['precipitationCal'])
y_test = concatenated_data['precipitationCal']

# Ensure y contains only one column
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values.reshape(-1)
    y_test = y_test[:len(X_test)]

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error for the testing set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse:.4f}")

# Save the trained model as a .pkl file
model_filename = 'XGBoost_model.pkl'
dump(model, model_filename)
print(f"Model saved as {model_filename}")



Common Variables:
{'1145 IMG_TIR1', '0715 IMG_WV', '1745 IMG_TIR1', '1115 IMG_TIR2', '1815 IMG_TIR2', '2315 IMG_WV', '1445 IMG_TIR1', '1915 IMG_TIR1', '1045 IMG_TIR2', '2015 IMG_TIR2', '1515 IMG_TIR1', '0545 IMG_WV', '2315 IMG_TIR2', '1815 IMG_TIR1', '0715 IMG_TIR1', 'precipitationCal', '1015 IMG_TIR1', '1845 IMG_WV', '1745 IMG_TIR2', '0745 IMG_TIR1', '1945 IMG_WV', '1515 IMG_WV', 'latitude', '1445 IMG_TIR2', '1845 IMG_TIR2', '0945 IMG_TIR1', '1015 IMG_TIR2', '1645 IMG_TIR1', '1715 IMG_TIR1', '1445 IMG_WV', '1145 IMG_TIR2', '2145 IMG_TIR2', '1245 IMG_TIR1', '1415 IMG_TIR2', '0015 IMG_WV', '1415 IMG_TIR1', '2345 IMG_TIR1', '0015 IMG_TIR1', '1345 IMG_WV', '0245 IMG_WV', '2115 IMG_WV', 'longitude', '0315 IMG_TIR1', '1345 IMG_TIR2', '0245 IMG_TIR2', '0045 IMG_WV', 'Date', '2145 IMG_TIR1', '0445 IMG_TIR1', '0645 IMG_TIR2', '2245 IMG_TIR2', '1915 IMG_WV', '0545 IMG_TIR2', '2045 IMG_TIR2', '1615 IMG_TIR1', '1015 IMG_WV', '0215 IMG_WV', '0115 IMG_TIR1', '0545 IMG_TIR1', '2215 IMG_TIR1', '1645