In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('../Datasets/Merged_Dataset.csv')
df1 = df.copy()

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 30 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  1108 non-null   object 
 1   Source     1108 non-null   int64  
 2   Bt-med     1108 non-null   float64
 3   Bt-min     1108 non-null   float64
 4   Bt-max     1108 non-null   float64
 5   Bx-med     1108 non-null   float64
 6   Bx-min     1108 non-null   float64
 7   Bx-max     1108 non-null   float64
 8   By-med     1108 non-null   float64
 9   By-min     1108 non-null   float64
 10  By-max     1108 non-null   float64
 11  Bz-med     1108 non-null   float64
 12  Bz-min     1108 non-null   float64
 13  Bz-max     1108 non-null   float64
 14  Phi-mean   1108 non-null   float64
 15  Phi-min    1108 non-null   float64
 16  Phi-max    1108 non-null   float64
 17  Theta-med  1108 non-null   float64
 18  Theta-min  1108 non-null   float64
 19  Theta-max  1108 non-null   float64
 20  Dens-med

In [5]:
print(df1.describe())

            Source        Bt-med        Bt-min        Bt-max        Bx-med  \
count  1108.000000   1108.000000   1108.000000   1108.000000   1108.000000   
mean      0.998195   -173.829350   -175.541977   -172.457879   -180.260866   
std       0.060084   4246.916048   4246.842839   4246.975038   4246.642889   
min      -1.000000 -99999.000000 -99999.000000 -99999.000000 -99999.000000   
25%       1.000000      4.980000      3.645000      5.780000     -2.805000   
50%       1.000000      5.940000      4.690000      6.995000      0.740000   
75%       1.000000      7.492500      5.840000      9.142500      3.182500   
max       1.000000     32.440000     28.460000     33.990000     10.920000   

             Bx-min        Bx-max        By-med        By-min        By-max  \
count   1108.000000   1108.000000   1108.000000   1108.000000   1108.000000   
mean    -184.039675   -176.684233   -180.786173   -184.882987   -176.389305   
std     4246.481880   4246.794863   4246.621208   4246.44658

In [6]:
print(df1.iloc[:, 1:].corr()['Kp'])

Source      -0.007173
Bt-med      -0.004234
Bt-min      -0.004403
Bt-max      -0.004116
Bx-med      -0.004494
Bx-min      -0.004747
Bx-max      -0.004315
By-med      -0.004663
By-min      -0.004894
By-max      -0.004341
Bz-med      -0.004822
Bz-min      -0.005019
Bz-max      -0.004407
Phi-mean    -0.002292
Phi-min     -0.007209
Phi-max     -0.002063
Theta-med   -0.006445
Theta-min   -0.006769
Theta-max   -0.004858
Dens-med    -0.004262
Dens-min    -0.004460
Dens-max    -0.003942
Speed-med    0.000860
Speed-min    0.000172
Speed-max    0.001234
Temp-med     0.060189
Temp-min    -0.017895
Temp-max     0.069590
Kp           1.000000
Name: Kp, dtype: float64


#### X and Y

In [7]:
x = df1.iloc[:, 1:-1]
y = df1.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [15]:
x.to_csv('../Code_only/train-tests/x.csv', index = False)
y.to_csv('../Code_only/train-tests/y.csv', index = False)

#### Model definition

Multiple Linear Regression

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

multi = LinearRegression().fit(x_train, y_train)
y_pred = multi.predict(x_test)
predicted = y_pred.reshape(len(y_pred),1)

print(y_test)
y_pred = [item for sublist in predicted for item in sublist]

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

1050    2.333
982     3.333
766     2.000
491     0.667
457     0.333
        ...  
841     1.333
95      0.667
113     2.000
725     0.667
37      0.667
Name: Kp, Length: 333, dtype: float64
Mean Absolute Error (MAE): 0.6790486276752579
Mean Squared Error (MSE): 0.7640443306957228
Root Mean Squared Error (RMSE): 0.8740962937203902
R-squared (R2) Score: 0.5195210365442081


Polynomial Regression

In [37]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(x_train)
reg = LinearRegression().fit(x_poly, y_train)

y_pred = reg.predict(poly.fit_transform(x_test))
predicted = y_pred.reshape(len(y_pred),1)

y_pred = [item for sublist in predicted for item in sublist]

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")


Mean Absolute Error (MAE): 8.442562673452013
Mean Squared Error (MSE): 744.1389348409579
Root Mean Squared Error (RMSE): 27.27891007428555
R-squared (R2) Score: -466.96120292380056


Decision Tree Regression

In [38]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
predicted = y_pred.reshape(len(y_pred),1)

y_pred = [item for sublist in predicted for item in sublist]

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")


Mean Absolute Error (MAE): 0.9559879879879881
Mean Squared Error (MSE): 1.5292567387387386
Root Mean Squared Error (RMSE): 1.2366312056303361
R-squared (R2) Score: 0.03830751284038392


Random Forest Regression

In [40]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
predicted = y_pred.reshape(len(y_pred),1)

y_pred = [item for sublist in predicted for item in sublist]

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")


Mean Absolute Error (MAE): 0.7335405405405405
Mean Squared Error (MSE): 0.8653285311861862
Root Mean Squared Error (RMSE): 0.9302303645797563
R-squared (R2) Score: 0.4558271830451668


In [None]:
# SVM
# ARIMA model (NN)
# LSTM
# Multivariate LSTM (link on slack)
# Multistep LSTM (link sent in slack)
# GRU model (Gated Recurrent Unit)
# Attention Mechanism
# Vanilla LSTM

# Ananya's research papers also included Deep GPR (Gaussian Process Regression)

In [None]:
import datetime
log_file_path = '../Logs/models_log.txt'
def log_output_to_file(output_text, log_file_path):
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    divider = '-' * 40  # Line divider
    
    with open(log_file_path, 'a') as log_file:
        log_file.write(f'{timestamp}\n')
        log_file.write(divider + '\n')
        log_file.write(output_text)
        log_file.write('\n\n')
