In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('./BUCKET/DSCOVR_KP_PER_INTERVAL.csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,TIMESTAMP,KP_GRAL,GSE_X,GSE_Y,GSE_Z,CUADRADO,RAW_4,RAW_5,RAW_6,RAW_7,...,RAW_15,RAW_16,RAW_17,RAW_18,RAW_19,RAW_20,RAW_21,RAW_22,RAW_23,RAW_24
0,2018-06-19 13:20:00,1.0,-3.34036,0.898664,2.03659,4.014138,,,,,...,,,,340.457,338.058,412.506,366.075,458.03,489.964,538.543
1,2018-06-19 13:21:00,1.0,-3.37082,0.875381,2.01067,4.021382,,,,,...,,,,340.685,339.708,413.791,366.942,459.124,491.393,540.319
2,2018-06-19 13:22:00,1.0,-3.32255,0.826042,2.1232,4.028605,,,,,...,,,,340.983,339.971,413.128,366.345,458.459,491.094,538.637
3,2018-06-19 13:23:00,1.0,-3.3095,0.9186,2.1213,4.036896,,,,,...,,,,339.2,338.11,412.093,365.506,457.153,488.371,534.105
4,2018-06-19 13:24:00,1.0,-2.95364,1.51061,2.2569,4.012422,,,,,...,,,,340.938,338.628,411.932,365.709,453.664,477.827,517.41


In [4]:
# Extract columns of interest
input_columns = ["GSE_X", "GSE_Y", "GSE_Z", "CUADRADO"] + [col for col in data.columns if "RAW_" in col]
target_columns = ["KP_GRAL"]

# Extract data
input_data = data[input_columns]
target_data = data[target_columns]

# Check for missing values in 'raw_*' columns
missing_values = input_data.isnull().sum()

missing_values

GSE_X         17442
GSE_Y         17442
GSE_Z         17442
CUADRADO      17442
RAW_4       1757636
RAW_5       1695129
RAW_6       1556408
RAW_7       1467449
RAW_8       1355341
RAW_9       1268964
RAW_10      1148804
RAW_11      1053137
RAW_12       921593
RAW_13       853886
RAW_14       732934
RAW_15       682399
RAW_16       623358
RAW_17       554684
RAW_18       506961
RAW_19       472456
RAW_20       425829
RAW_21       425497
RAW_22       434438
RAW_23       521356
RAW_24       573915
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Drop columns with more than 80% missing values
threshold = 0.8 * len(input_data)
input_data = input_data.dropna(axis=1, thresh=threshold)

# Impute missing values using median
imputer = SimpleImputer(strategy='median')
input_data_imputed = imputer.fit_transform(input_data)

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(input_data_imputed, target_data, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2621952, 13), (655488, 13), (2621952, 1), (655488, 1))

In [6]:
# Left over columns (To be used on training data)
input_data.columns

Index(['GSE_X', 'GSE_Y', 'GSE_Z', 'CUADRADO', 'RAW_16', 'RAW_17', 'RAW_18',
       'RAW_19', 'RAW_20', 'RAW_21', 'RAW_22', 'RAW_23', 'RAW_24'],
      dtype='object')

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize the linear regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict on test set
lr_predictions = lr_model.predict(X_test)

# Calculate MSE for each target column
lr_mse = mean_squared_error(y_test, lr_predictions, multioutput='raw_values')

lr_mse

array([1.07019034])

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the decision tree regressor
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predict on test set
dt_predictions = dt_model.predict(X_test)

# Calculate MSE for each target column
dt_mse = mean_squared_error(y_test, dt_predictions, multioutput='raw_values')

dt_mse

array([0.55748592])

In [14]:
# Save models
from joblib import dump, load
dump(dt_model, 'BUCKET/output/dt_model.joblib') 
dump(lr_model, 'BUCKET/output/lr_model.joblib') 

['BUCKET/output/lr_model.joblib']

In [6]:
# Load Models
from joblib import dump, load
dt_model = load('BUCKET/output/dt_model.joblib') 
lr_model = load('BUCKET/output/lr_model.joblib')

In [12]:
from scipy.optimize import minimize

# Define a function to compute the ensemble predictions and the corresponding MSE
def ensemble_mse(weight, y_preds, y_true):
    weight_lr = weight[0]
    weight_dt = 1 - weight_lr
    y_pred_ensemble = weight_lr * y_preds[0] + weight_dt * y_preds[1]
    return mean_squared_error(y_true, y_pred_ensemble)

In [None]:
# Pack the predictions of both models into a list
y_preds = [lr_predictions, dt_predictions]

# Use the minimize function to find the optimal weight
result = minimize(fun=ensemble_mse, x0=[0.5], args=(y_preds, y_test), bounds=[(0, 1)])

# Extract the optimal weight from the result
optimal_weight_lr = result.x[0]
optimal_weight_dt = 1 - optimal_weight_lr

optimal_weight_lr, optimal_weight_dt

In [13]:
from sklearn.metrics import mean_absolute_error

# Calculate the MAE for each model
lr_mae = mean_absolute_error(y_test, lr_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)

lr_mae, dt_mae

(0.8840601312708873, 0.29618156044332355)