In [None]:
pip install keras-tuner

In [1]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import tensorflow as tf
print(tf.__version__) 
import keras_tuner as kt
print(kt.__version__)

2.17.0
1.4.7


In [2]:
nRowsRead = 1000 # specify 'None' if want to read whole file
# 06162020_111957.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df1 = pd.read_csv('FLY009.csv', delimiter=',', nrows = nRowsRead)
df1.dataframeName = 'p.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 276 columns


Let's take a quick look at what the data looks like:

In [None]:
df1.head(5)

In [3]:
import pandas as pd

# Assuming the column is already converted to datetime
df1['GPS:dateTimeStamp'] = pd.to_datetime(df1['GPS:dateTimeStamp'], errors='coerce')

# Strip any timezone information to make sure the datetime is timezone-naive
df1['GPS:dateTimeStamp'] = df1['GPS:dateTimeStamp'].dt.tz_localize(None)

# Define the reference date as timezone-naive (1970-01-01)
reference_date = pd.Timestamp('1970-01-01')

# Convert date to float (days since the reference date)
df1['date_float'] = (df1['GPS:dateTimeStamp'] - reference_date).dt.days

# Convert time to float (fraction of a day, based on seconds since midnight)
df1['time_float'] = (
    df1['GPS:dateTimeStamp'].dt.hour * 3600 + 
    df1['GPS:dateTimeStamp'].dt.minute * 60 + 
    df1['GPS:dateTimeStamp'].dt.second
) / 86400  # 86400 seconds in a day

# Display the updated DataFrame with float values for date and time
print(df1[['date_float', 'time_float']])


     date_float  time_float
0           NaN         NaN
1       17381.0    0.756042
2       17381.0    0.756042
3       17381.0    0.756042
4       17381.0    0.756042
..          ...         ...
995     17381.0    0.756435
996     17381.0    0.756435
997     17381.0    0.756435
998     17381.0    0.756435
999     17381.0    0.756435

[1000 rows x 2 columns]


In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 278 entries, Clock:Tick# to time_float
dtypes: datetime64[ns](1), float64(258), int64(1), object(18)
memory usage: 2.1+ MB


In [5]:
df1=df1.drop('GPS:dateTimeStamp',axis=1)  # axis=1 indicates dropping a column

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 277 entries, Clock:Tick# to time_float
dtypes: float64(258), int64(1), object(18)
memory usage: 2.1+ MB


Null values

In [7]:
#pip install missingno
n=df1.isnull().sum().sum()
print(n)


15109


In [None]:
import missingno as msno

# Visualize missing data
msno.matrix(df1)

In [8]:
# Check the number of missing values in each column
missing_data = df1.isnull().sum()

# Display columns with missing values and their count
print(missing_data[missing_data > 0])

IMU_ATTI(0):Longitude             88
IMU_ATTI(0):Latitude              88
IMU_ATTI(0):press:D                1
IMU_ATTI(0):alti:D                 1
IMU_ATTI(0):relativeHeight:C    1000
                                ... 
Attribute|Value                  987
ConvertDatV3                    1000
4.2.7                           1000
date_float                         1
time_float                         1
Length: 275, dtype: int64


In [9]:
# Check the percentage of missing values in each column
missing_percentage = (df1.isnull().sum() / len(df1)) * 100
print(missing_percentage[missing_percentage == 100])

IMU_ATTI(0):relativeHeight:C    100.0
IMU_ATTI(0):absoluteHeight:C    100.0
IMU_ATTI(0):distanceHP:C        100.0
MVO:velZ                        100.0
MVO:posZ                        100.0
MVO:height                      100.0
osd_data:lowVoltage             100.0
ConvertDatV3                    100.0
4.2.7                           100.0
dtype: float64


In [10]:
df1.info()
n=df1.isnull().sum().sum()
print(n)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 277 entries, Clock:Tick# to time_float
dtypes: float64(258), int64(1), object(18)
memory usage: 2.1+ MB
15109


In [11]:
# Drop columns with more than 50% missing values
threshold = 0.5  # 50%
df1 = df1.loc[:, df1.isnull().mean() < threshold]
df1.info()
n=df1.isnull().sum().sum()
print(n)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 263 entries, Clock:Tick# to time_float
dtypes: float64(247), int64(1), object(15)
memory usage: 2.0+ MB
1187


In [12]:
df1 = df1.select_dtypes(exclude=['object'])
df1.info()
n=df1.isnull().sum().sum()
print(n)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 248 entries, Clock:Tick# to time_float
dtypes: float64(247), int64(1)
memory usage: 1.9 MB
1172


1. Forward Fill (ffill):
How it works: This method fills missing values with the last known valid value from earlier in the time series.
Scenario: When a sensor temporarily stops sending data but is expected to continue the trend or pattern from the last valid reading, forward fill can be appropriate.
Use case in drone trajectory:
Example: If a GPS signal is lost briefly, it makes sense to use the last known position until new data is received. The drone's trajectory is likely to continue from its last known state unless there is a sudden change in velocity or direction.
Advantages: It is a safe assumption in many real-time systems where missing data represents a temporary dropout, and the last known state provides a reasonable approximation.
Drawbacks: If there is a rapid change in the trajectory (e.g., a sudden turn or acceleration), forward fill might not capture it, and relying too much on old data can lead to inaccuracies.
python
Copy code
# Forward fill
df1.fillna(method='ffill', inplace=True)
When to use forward fill:

When sensor readings are continuous and missing values represent temporary gaps.
When the system is expected to follow the same trend until new data arrives (e.g., in the absence of GPS data, assume the drone continues along the same path).
If the missing data is sparse and spread over time.

Alternative Approach: Interpolation
In some cases, using interpolation (estimating missing values based on trends between previous and future data points) might give more accurate results than both forward and backward fill:

python
Copy code
# Linear interpolation to estimate missing values
df1.interpolate(method='linear', inplace=True)
This can work well if the drone’s motion is expected to follow a smooth trajectory. Interpolation will estimate the missing data points based on nearby values, creating a continuous trajectory.

In [13]:
# Linear interpolation to estimate missing values
df1.interpolate(method='linear', inplace=True)



In [14]:
df1.info()
n=df1.isnull().sum().sum()
print(n)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 248 entries, Clock:Tick# to time_float
dtypes: float64(247), int64(1)
memory usage: 1.9 MB
1172


In [15]:
# Forward fill to handle leading nulls
df1.ffill()

Unnamed: 0,Clock:Tick#,Clock:offsetTime,IMU_ATTI(0):Longitude,IMU_ATTI(0):Latitude,IMU_ATTI(0):press:D,IMU_ATTI(0):alti:D,IMU_ATTI(0):numSats,IMU_ATTI(0):roll:C,IMU_ATTI(0):pitch:C,IMU_ATTI(0):yaw:C,...,BatteryInfo:cap_per:D,BatteryInfo:temp:D,BatteryInfo:right:D,BatteryInfo:l_cell:D,BatteryInfo:dyna_cnt:D,BatteryInfo:f_cap:D,BatteryInfo:out_ctl:D,BatteryInfo:out_ctl_f:D,date_float,time_float
0,0,0.000,,,,,,,,,...,,,,,,,,,,
1,165975344,36.883,,,2485.1616,2485.1409,11.0,6.234888,6.825402,-118.591656,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
2,166128252,36.917,,,2485.6301,2485.1426,11.0,6.234325,6.832766,-118.588835,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
3,166285657,36.952,,,2485.7026,2485.1420,11.0,6.235835,6.832857,-118.584263,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
4,166436086,36.986,,,2485.5525,2485.1426,11.0,6.237282,6.826503,-118.582474,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,321578101,71.462,-106.216296,39.961314,2485.8950,2485.4062,13.0,6.173902,6.811557,-116.178303,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
996,321738281,71.497,-106.216296,39.961314,2486.0796,2485.4048,13.0,6.173905,6.808102,-116.176482,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
997,321896242,71.533,-106.216296,39.961314,2485.9944,2485.4050,13.0,6.170927,6.816829,-116.174930,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
998,322053933,71.568,-106.216296,39.961314,2486.1640,2485.4010,13.0,6.171981,6.818774,-116.173554,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435


In [16]:
df1.info()
n=df1.isnull().sum().sum()
print(n)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 248 entries, Clock:Tick# to time_float
dtypes: float64(247), int64(1)
memory usage: 1.9 MB
1172


In [17]:
# Backward fill to handle trailing nulls
df1.bfill()

Unnamed: 0,Clock:Tick#,Clock:offsetTime,IMU_ATTI(0):Longitude,IMU_ATTI(0):Latitude,IMU_ATTI(0):press:D,IMU_ATTI(0):alti:D,IMU_ATTI(0):numSats,IMU_ATTI(0):roll:C,IMU_ATTI(0):pitch:C,IMU_ATTI(0):yaw:C,...,BatteryInfo:cap_per:D,BatteryInfo:temp:D,BatteryInfo:right:D,BatteryInfo:l_cell:D,BatteryInfo:dyna_cnt:D,BatteryInfo:f_cap:D,BatteryInfo:out_ctl:D,BatteryInfo:out_ctl_f:D,date_float,time_float
0,0,0.000,-106.216294,39.961316,2485.1616,2485.1409,11.0,6.234888,6.825402,-118.591656,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
1,165975344,36.883,-106.216294,39.961316,2485.1616,2485.1409,11.0,6.234888,6.825402,-118.591656,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
2,166128252,36.917,-106.216294,39.961316,2485.6301,2485.1426,11.0,6.234325,6.832766,-118.588835,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
3,166285657,36.952,-106.216294,39.961316,2485.7026,2485.1420,11.0,6.235835,6.832857,-118.584263,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
4,166436086,36.986,-106.216294,39.961316,2485.5525,2485.1426,11.0,6.237282,6.826503,-118.582474,...,100.0,35.0,1.0,4274.0,0.0,5914.0,0.0,0.0,17381.0,0.756042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,321578101,71.462,-106.216296,39.961314,2485.8950,2485.4062,13.0,6.173902,6.811557,-116.178303,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
996,321738281,71.497,-106.216296,39.961314,2486.0796,2485.4048,13.0,6.173905,6.808102,-116.176482,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
997,321896242,71.533,-106.216296,39.961314,2485.9944,2485.4050,13.0,6.170927,6.816829,-116.174930,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435
998,322053933,71.568,-106.216296,39.961314,2486.1640,2485.4010,13.0,6.171981,6.818774,-116.173554,...,100.0,36.0,1.0,4270.0,0.0,5914.0,0.0,0.0,17381.0,0.756435


In [18]:
n=df1.isnull().sum().sum()
print(n)


1172


In [19]:
# Assuming df1 is your DataFrame

# Select all string (object) columns
string_columns = df1.select_dtypes(include=['object']).columns
print(string_columns)




Index([], dtype='object')


In [23]:
# Check the number of missing values in each column
missing_data = df1.isnull().sum()

# Display columns with missing values and their count
print(missing_data[missing_data > 0])

IMU_ATTI(0):Longitude      88
IMU_ATTI(0):Latitude       88
IMU_ATTI(0):press:D         1
IMU_ATTI(0):alti:D          1
IMU_ATTI(0):numSats         1
                           ..
BatteryInfo:f_cap:D         1
BatteryInfo:out_ctl:D       1
BatteryInfo:out_ctl_f:D     1
date_float                  1
time_float                  1
Length: 246, dtype: int64


In [32]:
# Check the percentage of missing values in each column
missing_percentage = (df1.isnull().sum() / len(df1)) * 100
print(missing_percentage[missing_percentage >0])

IMU_ATTI(0):Longitude      8.8
IMU_ATTI(0):Latitude       8.8
IMU_ATTI(0):press:D        0.1
IMU_ATTI(0):alti:D         0.1
IMU_ATTI(0):numSats        0.1
                          ... 
BatteryInfo:f_cap:D        0.1
BatteryInfo:out_ctl:D      0.1
BatteryInfo:out_ctl_f:D    0.1
date_float                 0.1
time_float                 0.1
Length: 246, dtype: float64


In [20]:
df1.columns.tolist()

['Clock:Tick#',
 'Clock:offsetTime',
 'IMU_ATTI(0):Longitude',
 'IMU_ATTI(0):Latitude',
 'IMU_ATTI(0):press:D',
 'IMU_ATTI(0):alti:D',
 'IMU_ATTI(0):numSats',
 'IMU_ATTI(0):roll:C',
 'IMU_ATTI(0):pitch:C',
 'IMU_ATTI(0):yaw:C',
 'IMU_ATTI(0):accelX',
 'IMU_ATTI(0):accelY',
 'IMU_ATTI(0):accelZ',
 'IMU_ATTI(0):accelComposite:C',
 'IMU_ATTI(0):gyroX',
 'IMU_ATTI(0):gyroY',
 'IMU_ATTI(0):gyroZ',
 'IMU_ATTI(0):gyroComposite:C',
 'IMU_ATTI(0):velN',
 'IMU_ATTI(0):velE',
 'IMU_ATTI(0):velD',
 'IMU_ATTI(0):velComposite:C',
 'IMU_ATTI(0):velH:C',
 'IMU_ATTI(0):magX',
 'IMU_ATTI(0):magY',
 'IMU_ATTI(0):magZ',
 'IMU_ATTI(0):magMod:C',
 'IMU_ATTI(0):temperature',
 'IMU_ATTI(0):yawUnWrapped:C',
 'IMU_ATTI(0):tiltInclination:C',
 'IMU_ATTI(0):tiltDirectionEarthFrame:C',
 'IMU_ATTI(0):tiltDirectionBodyFrame:C',
 'IMU_ATTI(0):yaw360:C',
 'IMU_ATTI(0):magYaw:C',
 'IMU_ATTI(0):Yaw-magYaw:C',
 'IMUCalcs(0):PosN:C',
 'IMUCalcs(0):PosE:C',
 'IMUCalcs(0):PosD:C',
 'IMUCalcs(0):height:C',
 'IMUCalcs(0):velN

In [22]:
# Calculate correlation matrix
correlation_matrix = df1.corr()
print(correlation_matrix)


                         Clock:Tick#  Clock:offsetTime  IMU_ATTI(0):Longitude  \
Clock:Tick#                 1.000000          1.000000              -0.765691   
Clock:offsetTime            1.000000          1.000000              -0.765691   
IMU_ATTI(0):Longitude      -0.765691         -0.765691               1.000000   
IMU_ATTI(0):Latitude       -0.990898         -0.990898               0.714458   
IMU_ATTI(0):press:D         0.427665          0.427665              -0.303923   
...                              ...               ...                    ...   
BatteryInfo:f_cap:D              NaN               NaN                    NaN   
BatteryInfo:out_ctl:D            NaN               NaN                    NaN   
BatteryInfo:out_ctl_f:D          NaN               NaN                    NaN   
date_float                       NaN               NaN                    NaN   
time_float                  0.999583          0.999583              -0.764938   

                         IM

In [None]:
# Visualize the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()


In [None]:
df1.columns

In [None]:
'''For trajectory prediction of a drone using LSTM, your target feature(s) should be the drone's future position. Specifically, the features that represent the drone's location or movement over time are important to predict, such as latitude (lat), longitude (lon), and altitude (alt).

Potential Target Features for Trajectory Prediction:
lat (Latitude): The geographical latitude of the drone.
lon (Longitude): The geographical longitude of the drone.
alt (Altitude): The altitude of the drone.
In a typical drone trajectory prediction task, you'd aim to predict these features based on the previous time steps.

Example Target Feature(s):
If you want to predict the next position of the drone in terms of latitude, longitude, and altitude, you would set:
Target (y): ['lat', 'lon', 'alt']
These would be the dependent variables (targets) you're trying to predict.
Model Setup for Multiple Targets:
Since you're predicting three target variables (latitude, longitude, and altitude), your LSTM model's output layer should have 3 output units (one for each feature).

Example Setup:
Input (Features): The features you provided (like x_gyro, y_gyro, z_gyro, pitch, yaw, etc.) will be used as input data to the LSTM model.
Input (X) = ['x_gyro', 'y_gyro', 'z_gyro', 'x_acc', 'y_acc', 'z_acc', 'north', 'east', 'down', 'pitch', 'yaw', 'roll', 'wind_speed', 'wind_direction', 'year', 'month', 'day', 'hour', 'minute', 'second']
Output (Target): You want to predict the future lat, lon, and alt:
Output (y) = ['lat', 'lon', 'alt']'''

In [None]:
'''Approach 2: Splitting First, Then Standardizing (Recommended)
Here’s why this is generally preferred: By splitting first, you avoid data leakage, where information from the test set could influence the training process. 
It’s better to ensure that the scaling parameters (like mean and standard deviation) are derived only from the training set and then applied to the test set.
Explanation of the Steps:
Split your dataset first using train_test_split().
Fit the StandardScaler on X_train to compute the mean and standard deviation based only on the training data.
Transform both X_train and X_test using the same scaler object:
fit_transform(X_train) to compute the parameters and scale X_train.
transform(X_test) to apply the scaling without recomputing the parameters, ensuring no leakage of information from the test set.
Why this is better:
Avoids data leakage: The test set is never used during training, so by splitting first and then scaling, you ensure the model doesn't have access to the test set's information during training.
Realistic evaluation: You simulate a real-world scenario where the model will not know anything about unseen data before it's scaled or processed.
So, to answer your question: yes, you can use train_test_split after standardization, but it's usually better to split first, then standardize the training and test sets separately.
'''

In [None]:
'''Key Steps for Building and Training an LSTM Model:
-Reshape the data for time series input.
-Define and build the LSTM model.
-Train the model.
-Evaluate the model on the test set.'''

In [None]:
'''1. Data Preparation for LSTM
An LSTM model expects the input data to be in 3D format:

(samples, timesteps, features), where:
-samples is the number of data samples,
-timesteps is the number of past time steps to look back for each prediction,
-features is the number of features (e.g., latitude, longitude, altitude, etc.).
-First, you need to reshape your data into sequences that LSTM can process.'''

In [13]:
dep=['lat', 'lon', 'alt']
ind=['x_gyro', 'y_gyro', 'z_gyro', 'x_acc', 'y_acc', 'z_acc', 'north', 'east', 'down', 'pitch', 'yaw', 'roll', 'wind_speed', 'wind_direction', 'year', 'month', 'day', 'hour', 'minute', 'second']
y= df1[dep]
x=df1[ind]

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [15]:
# Split the dataset first
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)


In [None]:
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

In [17]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Use the same scaler to transform the test set (but don't fit again!)
X_test_scaled = scaler.transform(X_test)

# Now X_train_scaled and X_test_scaled are standardized


In [None]:
print(f"Shape of X_train_scaled : {X_train_scaled .shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")


In [None]:
'''from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Initialize the model
model = LinearRegression()

# Initialize RFE with the model
rfe = RFE(estimator=model, n_features_to_select=5)  # Select top 5 features
rfe.fit(x, y)  # Make sure to pass DataFrames to RFE

# Get selected features
selected_features = [ind[i] for i in range(len(ind)) if rfe.support_[i]]
print("Selected features:", selected_features)

# Use selected features for further modeling
X_selected = x[selected_features]  # Subset the DataFrame with selected features'''

In [None]:
# Select only the features you want to scale
features_to_scale = [ 'x_gyro', 'y_gyro', 'z_gyro', 'year', 'month', 'day', 'hour', 'minute', 'second']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler and transform the data
X = scaler.fit_transform(x)
print(x)
# Print the scaled dataframe
#print(df1)



In [21]:
# Define the number of time steps (look-back window)
n_steps = 5 # You can choose the look-back window based on your problem (e.g., 10 previous time steps)

In [22]:
# Convert the training data into sequences for LSTM
def create_sequences(X, y, n_steps):
    Xs, ys = [], []
    for i in range(len(X) - n_steps):
        Xs.append(X[i:i + n_steps].values)  # Get the previous 'n_steps' time steps
        ys.append(y.iloc[i + n_steps])  # The corresponding target value
    return np.array(Xs), np.array(ys)




In [23]:
X_train_seq, y_train_seq = create_sequences(X_train, y_train, n_steps)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, n_steps)


In [None]:
# The shape of the input data should now be (samples, timesteps, features)
print(f"X_train_seq shape: {X_train_seq.shape}")
print(f"y_train_seq shape: {y_train_seq.shape}")
print(f"Shape of X_test_seq: {X_test_seq.shape}")  # Should be (samples,)
print(f"Shape of y_test_seq: {y_test_seq.shape}")  # Should be (samples,)


In [None]:
print(f"Length of X_test: {len(X_test)}")


In [None]:
# Ensure test set has enough data for sequence creation
if len(X_test) > n_steps:
    X_test_seq, y_test_seq = create_sequences(X_test, y_test, n_steps)
    print(f"Shape of X_test_seq: {X_test_seq.shape}")
    print(f"Shape of y_test_seq: {y_test_seq.shape}")
else:
    print("Not enough test data for sequence creation.")


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential()

# Add the LSTM layer with 50 units (neurons) and input shape corresponding to (timesteps, features)
model.add(LSTM(units=50, return_sequences=False, input_shape=(n_steps, X_train_seq.shape[2])))

# Optionally, add a Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Add the output layer (1 unit if predicting one value like future latitude, or multiple units if multivariate output)
model.add(Dense(3))  # Replace '1' with the number of outputs if you have more (e.g., 3 for lat, lon, alt)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model
model.summary()


In [None]:
print(f"Shape of X_train_seq: {X_train_seq.shape}")  # Should be (samples, timesteps, features)
print(f"Shape of X_test_seq: {X_test_seq.shape}")
print(f"Shape of y_test_seq: {y_test_seq.shape}")

In [None]:
print(X_train.head())  # To verify the structure
print(y_train.head())  # To verify the structure


In [None]:
print(len(X_train))  # Ensure this is greater than n_steps (e.g., 10)


In [None]:
# LSTM model definition
model = Sequential()
model.add(LSTM(units=50, return_sequences=False, input_shape=(n_steps, X_train_seq.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(3))  # Output layer for single output (e.g., predicting lat, lon, or alt)

model.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
history = model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, validation_data=(X_test_seq, y_test_seq))




In [None]:
print(f"Shape of X_test_seq: {X_test_seq.shape}")


In [None]:
if len(X_test_seq) == 0:
    print("Test set is empty, cannot make predictions.")
else:
    y_pred = model.predict(X_test_seq)


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_seq)



In [None]:
# Ensure test set has enough data for sequence creation
if len(X_test) > n_steps:
    X_test_seq, y_test_seq = create_sequences(X_test, y_test, n_steps)
    print(f"Shape of X_test_seq: {X_test_seq.shape}")
    print(f"Shape of y_test_seq: {y_test_seq.shape}")
else:
    print("Not enough test data for sequence creation.")


In [None]:
# Evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_seq, y_pred)
mae = mean_absolute_error(y_test_seq, y_pred)
r2 = r2_score(y_test_seq, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")


In [None]:
import matplotlib.pyplot as plt

# Plot the training loss and validation loss over epochs
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()


In [None]:
from kerastuner.tuners import RandomSearch

def build_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=128, step=32), 
                   return_sequences=False, 
                   input_shape=(n_steps, X_train_seq.shape[2])))
    model.add(Dropout(hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(Dense(1))  # Output layer
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,  # Increase to explore more hyperparameter combinations
    executions_per_trial=2,  # Average results across multiple runs
    directory='tuning',
    project_name='lstm_trajectory_v2'
)

'''tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5,
    executions_per_trial=1,
    directory='tuning',
    project_name='lstm_trajectory'
)'''

# Start the hyperparameter search
tuner.search(X_train_seq, y_train_seq, epochs=20, validation_data=(X_test_seq, y_test_seq))


Step 1: Evaluate the Model on Test Data
After tuning, you can retrieve the best model and evaluate it on your test set (X_test_seq, y_test_seq). This will give you a better idea of how well it generalizes to unseen data.

In [None]:
# Retrieve the best model from the tuning process
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the model on test data
test_loss = best_model.evaluate(X_test_seq, y_test_seq)
print(f"Test Loss: {test_loss}")

Step 2: Visualize Predictions
Visualizing the predicted trajectories vs. the actual trajectories will give you a clear idea of how well the model is performing. You can plot some of the predictions against the actual data.

The error message IndexError: index 3 is out of bounds for axis 0 with size 3 indicates that your predictions array or y_test_seq contains fewer than 5 samples. Specifically, it looks like there are only 3 samples in predictions (or y_test_seq), so when you're trying to access predictions[3], it's out of bounds.

How to Fix It
You need to make sure that you're not trying to plot more samples than are available in predictions or y_test_seq. You can modify your code to check the length of predictions or y_test_seq before plotting.

Here’s an updated version of your code that will adjust the range of the plot loop dynamically based on the actual size of the predictions array:
Explanation:
num_samples is calculated as the minimum of the lengths of predictions and y_test_seq, ensuring that you only try to plot as many samples as are available in both arrays.
The loop iterates only up to num_samples, which prevents the IndexError from occurring.
This should solve the error and plot the available samples without exceeding the bounds of your data.

In [None]:
import matplotlib.pyplot as plt

# Make predictions using the best model
predictions = best_model.predict(X_test_seq)

# Determine how many samples are available
num_samples = min(len(predictions), len(y_test_seq))

# Plot the predicted and actual values for available samples
for i in range(num_samples):  # Plot available trajectories
    plt.figure(figsize=(8, 4))
    plt.plot(predictions[i], label='Predicted')
    #plt.plot(y_test_seq[i], label='Actual')
    plt.legend()
    plt.show()


Step 3: Analyze and Adjust
Inspect learning curve: Check the training and validation losses across epochs to see if your model is overfitting (too low training loss but high validation loss) or underfitting (both losses are high).

In [None]:
# Retrieve the best model from the tuning process
best_model = tuner.get_best_models(num_models=1)[0]

# Since the tuner itself doesn't store the training history, 
# you can retrain the best model and capture its history.
history = best_model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_test_seq, y_test_seq),
    epochs=20,
    batch_size=32,
    verbose=1
)

# Plot the learning curve: Training Loss vs Validation Loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


Step 4: Fine-tune Hyperparameters Further
The current tuning process explored a limited range of hyperparameters (e.g., units and dropout). You can further tune the model by:

Adding more trials in the RandomSearch process (max_trials=10, for example).
Exploring other hyperparameters, like the learning rate of the Adam optimizer, or trying different optimizers.

In [49]:
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,  # Increase to explore more hyperparameter combinations
    executions_per_trial=2,  # Average results across multiple runs
    directory='tuning',
    project_name='lstm_trajectory_v2'
)


Step 1: Make Predictions Using the Best Model
Once the hyperparameter tuning is complete, you can retrieve the best model and make predictions on your test data (X_test_seq).

In [None]:
# Retrieve the best model from the tuning process
best_model = tuner.get_best_models(num_models=1)[0]

# Make predictions on the test set
predictions = best_model.predict(X_test_seq)


Step 2: Calculate Accuracy or Performance Metric
For trajectory prediction, typical accuracy metrics include Mean Squared Error (MSE) or Mean Absolute Error (MAE) because this is a regression problem, not a classification problem. You can calculate the error between the predicted and actual values.

Here’s how to calculate MSE and MAE on your predictions:

In [None]:
print(f"y_test_seq shape: {y_test_seq.shape}")
print(f"predictions shape: {predictions.shape}")

In [59]:
# Reshape predictions to match y_test_seq if necessary
predictions = np.repeat(predictions, y_test_seq.shape[1], axis=1)  # Assuming repeating is valid


In [None]:
print(f"y_test_seq shape: {y_test_seq.shape}")
print(f"predictions shape: {predictions.shape}")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate MSE and MAE
mse = mean_squared_error(y_test_seq, predictions)
mae = mean_absolute_error(y_test_seq, predictions)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')


Step 3: Visualize Path Prediction
You can visualize the predicted paths against the actual paths to better understand the performance of your model.
predictions[i]: This represents the predicted path for the ith sample from X_test_seq.
y_test_seq[i]: This is the actual path corresponding to the ith sample.
The visual comparison between the predicted and actual trajectories will give you an idea of how well your LSTM model captures the path dynamics.

In [None]:
import matplotlib.pyplot as plt

# Determine the number of samples available (use the minimum of predictions and y_test_seq)
num_samples = min(len(predictions), len(y_test_seq))

# Plot for the available number of samples (max 5 or num_samples)
for i in range(min(5, num_samples)):  # Plot for up to 5 trajectories, or fewer if num_samples < 5
    plt.figure(figsize=(8, 4))
    plt.plot(predictions[i], label='Predicted Path', marker='o')
    plt.plot(y_test_seq[i], label='Actual Path', marker='x')
    plt.title(f'Trajectory {i+1}')
    plt.xlabel('Time Step')
    plt.ylabel('Coordinate Value')
    plt.legend()
    plt.show()


Step 5: (Optional) Save the Model
If the results look good and you are satisfied with the performance, you can save the model for future use.

In [66]:
# Save the best model

model.save('my_model.keras')

You can later load the saved model using:

In [None]:
from keras.models import load_model
loaded_model = load_model('best_lstm_trajectory_model.h5')


Summary of Steps:
Make predictions on the test data.
Calculate MSE and MAE to quantify the prediction error.
Visualize the predicted paths against the actual paths.
Optionally, save the best model for future use.