In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as transforms
import imageio as Image
from torchvision.utils import make_grid
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df[['date_part', 'time_part']] = df['Timestamp   '].str.split(' ', 1, expand=True)

In [4]:
df.head()

Unnamed: 0,Timestamp,Source,Bt-med,Bt-min,Bt-max,Bx-med,Bx-min,Bx-max,By-med,By-min,...,Dens-min,Dens-max,Speed-med,Speed-min,Speed-max,Temp-med,Temp-min,Temp-max,date_part,time_part
0,9/9/2023 0:00,1,5.78,5.6,6.07,4.63,3.95,5.21,-2.27,-4.1,...,0.71,1.91,402.1,328.9,407.1,54427,5000,81376,9/9/2023,0:00
1,9/9/2023 0:30,1,5.84,5.6,6.04,4.9,3.94,5.29,-2.86,-4.26,...,0.62,1.94,402.8,330.8,411.2,47657,6116,124309,9/9/2023,0:30
2,9/9/2023 1:00,1,5.68,5.59,6.02,3.41,2.25,5.2,-1.66,-4.5,...,1.46,3.02,378.5,366.9,402.9,85220,46291,129434,9/9/2023,1:00
3,9/9/2023 1:30,1,5.7,5.22,5.98,2.81,0.92,4.19,-4.4,-5.54,...,0.7,3.4,380.4,343.0,402.3,62622,13372,108784,9/9/2023,1:30
4,9/9/2023 2:00,1,5.83,5.21,5.97,3.58,1.22,4.03,-2.66,-4.51,...,0.92,3.38,385.2,376.3,401.0,59709,23345,109656,9/9/2023,2:00


In [5]:
df['time_part'] = pd.to_datetime(df['time_part'], format='%H:%M').dt.time

In [6]:
filtered_df = df[df['time_part'].apply(lambda x: x.hour % 3 == 0 and x.minute == 0)]

In [7]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221 entries, 0 to 1320
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Timestamp        221 non-null    object 
 1           Source   221 non-null    int64  
 2      Bt-med        221 non-null    float64
 3      Bt-min        221 non-null    float64
 4    Bt-max          221 non-null    float64
 5    Bx-med          221 non-null    float64
 6    Bx-min          221 non-null    float64
 7     Bx-max         221 non-null    float64
 8     By-med         221 non-null    float64
 9     By-min         221 non-null    float64
 10    By-max         221 non-null    float64
 11     Bz-med        221 non-null    float64
 12     Bz-min        221 non-null    float64
 13      Bz-max       221 non-null    float64
 14   Phi-mean        221 non-null    float64
 15     Phi-min       221 non-null    float64
 16    Phi-max        221 non-null    float64
 17   Theta-med     

In [8]:
kp = pd.read_csv("KP.csv")

In [9]:
def string_to_float_array(string_value):
    number_strings = string_value.split()
    return [float(num_str) for num_str in number_strings]

# Apply the function to the DataFrame column
kp['K-indices'] = kp['  K-indices'].apply(string_to_float_array)

In [10]:
kp = kp.iloc[2:]

In [11]:
kp_array = np.concatenate(kp['K-indices'].values)
kp_array = kp_array.tolist()

In [12]:
len(kp_array)

224

In [13]:
filtered_df['KP'] = kp_array[:-3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['KP'] = kp_array[:-3]


In [14]:
filtered_df.to_csv("kp_data.csv", index=False)


In [16]:
formated_data = pd.read_csv("kp_data.csv")
formated_data.to_csv("kp_data.csv", index=False)

# Modeling

In [19]:
kp_data = pd.read_csv("kp_data.csv")

In [20]:
kp_data.columns

Index(['Timestamp', 'Source', 'Bt-med', 'Bt-min', 'Bt-max', 'Bx-med', 'Bx-min',
       'Bx-max', 'By-med', 'By-min', 'By-max', 'Bz-med', 'Bz-min', 'Bz-max',
       'Phi-mean', 'Phi-min', 'Phi-max', 'Theta-med', 'Theta-min', 'Theta-max',
       'Dens-med', 'Dens-min', 'Dens-max', 'Speed-med', 'Speed-min',
       'Speed-max', 'Temp-med', 'Temp-min', 'Temp-max', 'date_part',
       'time_part', 'KP'],
      dtype='object')

In [21]:
kp_data = kp_data[['Source', 'Bt-med', 'Bt-min', 'Bt-max', 'Bx-med', 'Bx-min',
       'Bx-max', 'By-med', 'By-min', 'By-max', 'Bz-med', 'Bz-min', 'Bz-max',
       'Phi-mean', 'Phi-min', 'Phi-max', 'Theta-med', 'Theta-min', 'Theta-max',
       'Dens-med', 'Dens-min', 'Dens-max', 'Speed-med', 'Speed-min',
       'Speed-max', 'Temp-med', 'Temp-min', 'Temp-max', 'KP']]

In [22]:
y = kp_data['KP']
X = kp_data.drop(columns=['KP'])

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
from sklearn.linear_model import LinearRegression

# Create and fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [28]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.7179218992038141
R-squared: 0.3722116437266312
