### Loading Data

In [None]:
# Loading Data

import os
import pickle

import numpy as np
import pandas as pd
import torch

# NOTE(review): this looks like a placeholder path -- point it at the directory
# that holds the ``.p`` pickle files before running.
folder_path = "path_to_my_folder"

# os.listdir() returns entries in arbitrary order. Sorting keeps everything
# that depends on file order (the "first file" inspected below, the seeded
# train/val/test split) reproducible across runs and machines.
pickle_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.p'))

print("files found in the folder")
print(pickle_files)


def load_pickle(file_path):
    """Deserialize and return the object stored in the pickle file at *file_path*.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        return pickle.load(handle)


# Map every discovered pickle file name to its deserialized content.
all_data = {
    name: load_pickle(os.path.join(folder_path, name))
    for name in pickle_files
}

print("\nLoaded pickle files:")
# Show the top-level keys of each loaded object as a quick sanity check.
for file_name, contents in all_data.items():
    print(f"File: {file_name}, Keys: {list(contents.keys())}")

### Drop Unnecessary columns

In [None]:
# Columns kept as model input features.
feature_columns = [
    "rcs",
    "distance",
    "angleAzimuth",
    "angleElevation",
    "radialVelocity",
    "radialVelocityDomainMax",
    "SNR",
    "yawrate",
    "egospeed",
]

# Columns kept as labels.
# NOTE(review): this list previously contained " width_edge_mean" with a
# leading space, unlike every other column name here. That would raise a
# KeyError on selection unless the data really uses that key -- confirm
# against the keys printed when the pickle files are loaded.
label_column = ["orientation", "x", "y", "width_edge_mean", "length_edge_mean"]


def clean_files(data):
    """Split *data* into its feature columns and its label columns.

    The columns are chosen by the module-level ``feature_columns`` and
    ``label_column`` lists; every other column is dropped.

    Returns:
        Tuple ``(features, labels)`` of DataFrames selected from *data*.
    """
    keep_features, keep_labels = feature_columns, label_column
    return data[keep_features], data[keep_labels]

# Use the first loaded file as a sample to check the column selection.
first_file_name = list(all_data)[0]
print(f"processing file: {first_file_name}")

raw_data = all_data[first_file_name]
x_raw, y_raw = clean_files(pd.DataFrame(raw_data))

# Print a two-row preview of each selected frame.
for title, frame in (
    ("cleaned features (first 2 rows:)", x_raw),
    ("\ncleaned labels (first 2 rows):", y_raw),
):
    print(title)
    print(frame.head(2))

### Handle Data Types and Convert to numeric

In [None]:
def convert_to_numeric(x, y):
    """Convert list-valued cells of the feature and label frames to float32 arrays.

    Every cell that holds a Python ``list`` becomes an ``np.ndarray`` of dtype
    ``float32``; cells of any other type are left as they are. The input
    frames are not modified.

    Args:
        x: DataFrame of feature columns.
        y: DataFrame of label columns.

    Returns:
        Tuple ``(x_converted, y_converted)`` of new DataFrames.
    """

    def _to_array(value):
        return np.array(value, dtype=np.float32) if isinstance(value, list) else value

    # Work on copies: the inputs are usually column selections of a larger
    # frame, and assigning into those triggers pandas' SettingWithCopyWarning
    # and changes the caller's data.
    x = x.copy()
    y = y.copy()

    # Series.apply visits one cell at a time. DataFrame.apply would pass whole
    # columns to the callback, so the isinstance(list) test would never match.
    for col in x.columns:
        x[col] = x[col].apply(_to_array)
    for col in y.columns:
        y[col] = y[col].apply(_to_array)

    return x, y

# Convert the sample file and preview the first two rows of each frame.
x_numeric, y_numeric = convert_to_numeric(x_raw, y_raw)

print("\nFeatures after conversion to numeric (first 2 rows):", x_numeric.head(2), sep="\n")
print("\nlables after conversion to numeric (first 2 rows):", y_numeric.head(2), sep="\n")

### Normalize Data




In [None]:
from torch.nn.utils.rnn import pad_sequence

def normalize_features(x, feature_columns):
    """Standardize the given columns to zero mean and unit standard deviation.

    For each column, the mean and standard deviation are computed over all
    values of all rows together, and every cell is then rescaled with those
    two numbers.

    Args:
        x: DataFrame whose cells in ``feature_columns`` are numeric scalars or
            NumPy arrays.
        feature_columns: Names of the columns to normalize.

    Returns:
        A new DataFrame with the listed columns standardized. Other columns
        are copied unchanged and the input frame is not modified.
    """
    x = x.copy()

    for col in feature_columns:
        cells = x[col]
        if len(cells) == 0:
            # No rows: nothing to normalize, and np.concatenate rejects an
            # empty list.
            continue

        # Flatten once and reuse the result for both statistics. np.ravel also
        # turns scalar cells into 1-element arrays, which a bare
        # np.concatenate would reject as zero-dimensional.
        flat = np.concatenate([np.ravel(cell) for cell in cells])
        mean = flat.mean()
        std = flat.std()

        # The small epsilon avoids division by zero for constant columns.
        x[col] = cells.apply(lambda v: (v - mean) / (std + 1e-8))

    return x


### Pad the data

In [None]:
def pad_sequences(data, fixed_length=None):
    """Stack variable-length sequences into one zero-padded float32 tensor.

    Args:
        data: Iterable of array-likes. Each item is one sequence whose first
            axis is the sequence length; trailing dimensions must match
            across items.
        fixed_length: Target size of the first axis. Longer sequences are
            truncated and shorter ones are zero-padded at the end. ``None``
            (or ``0``) pads every sequence to the longest one in ``data``.

    Returns:
        A float32 tensor of shape ``(len(data), length, *trailing_dims)``.

    Raises:
        ValueError: If ``fixed_length`` is negative. Without this check a
            negative value would silently drop elements from the end of each
            sequence through negative slicing.
    """
    if fixed_length is not None and fixed_length < 0:
        raise ValueError(f"fixed_length must be non-negative, got {fixed_length}")

    # as_tensor reuses an existing float32 buffer instead of copying it and
    # accepts tensors without the warning torch.tensor emits for them. The
    # stack/cat/pad_sequence calls below always produce new tensors, so the
    # result never aliases the inputs.
    tensor_data = [torch.as_tensor(seq, dtype=torch.float32) for seq in data]

    if not fixed_length:
        return pad_sequence(tensor_data, batch_first=True)

    def _fit(seq):
        """Truncate or zero-pad *seq* along its first axis to ``fixed_length``."""
        missing = fixed_length - seq.shape[0]
        if missing <= 0:
            return seq[:fixed_length]
        return torch.cat([seq, seq.new_zeros((missing, *seq.shape[1:]))])

    return torch.stack([_fit(seq) for seq in tensor_data])

x_normalized = normalize_features(x_numeric, feature_columns)

# Build one 2-D array per row by placing the selected feature columns side by
# side.
x_combined = [
    np.column_stack([
        row["rcs"], row["distance"], row["angleAzimuth"], row["angleElevation"], row["radialVelocity"]
    ])
    for _, row in x_normalized.iterrows()
]

# Iterating a DataFrame yields its column *names*, so passing the label frame
# directly to pad_sequences would try to build tensors from strings. Build one
# array per row instead, the same way as x_combined.
# NOTE(review): assumes that within a row every label cell is either a scalar
# or an array of the same length -- confirm against the data.
y_combined = [
    np.column_stack([row[col] for col in label_column])
    for _, row in y_numeric[label_column].iterrows()
]

x_padded = pad_sequences(x_combined)
y_padded = pad_sequences(y_combined)

print("\nShape of x_padded:", x_padded.shape)
print("\nShape of y_padded:", y_padded.shape)

### Divide data into train, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Sort before splitting. all_data keeps the order in which files were listed
# by os.listdir(), which is arbitrary, so without sorting the same
# random_state could still give different splits on different machines.
file_names = sorted(all_data)

# 80% of the files for training; the remainder is halved into validation and
# test.
train_files, temp_files = train_test_split(file_names, train_size=0.8, random_state=42)
val_files, test_files = train_test_split(temp_files, train_size=0.5, random_state=42)

print(f"training files: {train_files}")
print(f"validation files: {val_files}")
print(f"testing files: {test_files}")

### Preprocess each file based on its split

In [None]:
def preprocess_file(data, feature_columns, label_column, fixed_length=50):
    """Turn one file's DataFrame into padded feature and label tensors.

    Steps: select the feature and label columns, convert list cells to float32
    arrays, standardize the feature columns, stack the per-row columns into
    2-D arrays, then pad or truncate every row to ``fixed_length``.

    Args:
        data: DataFrame holding at least ``feature_columns`` and
            ``label_column``.
        feature_columns: Names of the feature columns to select and normalize.
        label_column: List of label column names.
        fixed_length: Size of the first axis every row is padded or truncated
            to.

    Returns:
        Tuple ``(x_padded, y_padded)`` of float32 tensors.
    """
    # Copy the selections so the conversion step never assigns into a view of
    # the caller's frame.
    x_raw = data[feature_columns].copy()
    y_raw = data[label_column].copy()

    x_numeric, y_numeric = convert_to_numeric(x_raw, y_raw)
    x_normalized = normalize_features(x_numeric, feature_columns)

    # Only these columns are stacked into the model input.
    stacked_features = ["rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity"]
    x_combined = [
        np.column_stack([row[col] for col in stacked_features])
        for _, row in x_normalized.iterrows()
    ]

    # Iterating a DataFrame yields column names, not rows, so the label frame
    # cannot be passed to pad_sequences directly. Build one array per row.
    # NOTE(review): assumes that within a row every label cell is either a
    # scalar or an array of the same length -- confirm against the data.
    y_combined = [
        np.column_stack([row[col] for col in label_column])
        for _, row in y_numeric[label_column].iterrows()
    ]

    x_padded = pad_sequences(x_combined, fixed_length=fixed_length)
    y_padded = pad_sequences(y_combined, fixed_length=fixed_length)

    return x_padded, y_padded


# Per-split lists of (x_padded, y_padded) tensor pairs, one pair per file.
processed_data_splits = {"train": [], "val": [], "test": []}

# Resolve each file's split once. setdefault keeps the first match, which
# preserves the train > val > test precedence of an if/elif chain.
split_of_file = {}
for split_name, split_files in (("train", train_files), ("val", val_files), ("test", test_files)):
    for name in split_files:
        split_of_file.setdefault(name, split_name)

for file_name in file_names:
    raw_data = pd.DataFrame(all_data[file_name])
    x_padded, y_padded = preprocess_file(raw_data, feature_columns, label_column, fixed_length=50)

    split_name = split_of_file.get(file_name)
    if split_name is not None:
        processed_data_splits[split_name].append((x_padded, y_padded))
print("files divided into train, val, and test splits")



### Combine data within each split

In [None]:
def combine_splits(split_data):
    """Concatenate the per-file tensors of one split along the first dimension.

    Args:
        split_data: Sequence of ``(x, y)`` tensor pairs, one pair per file.

    Returns:
        Tuple ``(x_combined, y_combined)``: all ``x`` tensors and all ``y``
        tensors, each concatenated along dim 0.

    Raises:
        ValueError: If ``split_data`` is empty, i.e. no file was assigned to
            this split.
    """
    if not split_data:
        # torch.cat rejects an empty list with a message that does not point
        # at the cause, so fail early with a clearer one.
        raise ValueError("cannot combine an empty split: no files were assigned to it")

    xs, ys = zip(*split_data)
    return torch.cat(xs, dim=0), torch.cat(ys, dim=0)

# Merge the per-file tensors of every split into one (x, y) pair per split.
x_train_combined, y_train_combined = combine_splits(processed_data_splits["train"])
x_val_combined, y_val_combined = combine_splits(processed_data_splits["val"])
x_test_combined, y_test_combined = combine_splits(processed_data_splits["test"])

print("combined splits ready for training:")
print(f"x_train_shape: {x_train_combined.shape},y_train shape: {y_train_combined.shape}")
print(f"x_val shape: {x_val_combined.shape}, y_val shape: {y_val_combined.shape}")
print(f"x_test shape: {x_test_combined.shape}, y_test shape: {y_test_combined.shape}")