**About**

This notebook uses a time-series random forest to identify the type of maneuver recorded in a given dataset. The model is trained mainly with the `sktime` package.

In [1]:
!pip install sktime

Collecting sktime
  Downloading sktime-0.11.2-py3-none-any.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 5.0 MB/s 
[?25hCollecting deprecated>=1.2.13
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting numba>=0.53
  Downloading numba-0.55.1-1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.5 MB/s 
Collecting statsmodels>=0.12.1
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 27.1 MB/s 
Collecting llvmlite<0.39,>=0.38.0rc1
  Downloading llvmlite-0.38.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 9.5 kB/s 
Installing collected packages: llvmlite, statsmodels, numba, deprecated, sktime
  Attempting uninstall: llvmlite
    Found existing installation: llvmlite 0.34.0
    Uninstalling llvmlite-0.34.0:
      

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sktime
import math
# from sktime.classification.kernel_based import TimeSeriesForestClassifier

In [4]:
# Remove the row limit so displayed DataFrames/Series are never truncated.
pd.options.display.max_rows = None

**Data Engineering**

Each dataset is resampled to 800 entries (the average length of the training datasets), and NaN values are filled in with the average of the neighboring values in the same column.

In [7]:
def compute_lcm(x, y):
    """
    A helper function to compute the least common multiple of two integers.

    Input:
    x: a non-zero integer
    y: a non-zero integer

    Output:
    The (positive) least common multiple of the two input integers.

    Raises:
    ValueError: if either input is zero, since no positive common multiple
    exists in that case.
    """
    if x == 0 or y == 0:
        raise ValueError("compute_lcm requires two non-zero integers")
    # lcm(x, y) * gcd(x, y) == |x * y|. Using the gcd avoids counting upwards
    # one candidate at a time, which takes lcm(x, y) iterations in the worst case.
    return abs(x * y) // math.gcd(x, y)

def downsample(x,q):
    """
    Thin out a time series by keeping only every q-th sample.

    Input:
    x: Time-series data, indexable by position and with a length.
    q: The integer ratio by which the number of samples is reduced; the
       samples at positions 0, q, 2q, ... are kept.

    Output:
    A list with the retained samples. An empty list is returned when q is a
    float or is not strictly positive.
    """
    # Guard clause: float or non-positive ratios are rejected with an empty result.
    if type(q) == float or q <= 0:
        return []
    return [x[pos] for pos in range(len(x)) if pos % q == 0]

def upsample(x,p):
    """
    Stretch a time series to p times its length by linear interpolation.

    Input:
    x: Time-series data, indexable by position and with a length.
    p: The integer ratio by which the number of samples is increased.

    Output:
    A list of len(x) * p values. Each original sample is followed by p - 1
    points interpolated linearly towards the next sample; after the last
    sample the points are extrapolated using the slope of the final segment.
    An empty list is returned when p is a float or is not strictly positive.
    """
    if type(p) == float or p <= 0:
        return []
    final_pos = len(x) - 1
    stretched = []
    for pos in range(len(x)):
        current = x[pos]
        stretched.append(current)
        if pos != final_pos:
            # Interior sample: interpolate towards the following sample.
            stretched.extend(
                current + (x[pos+1] - current) * k / p for k in range(1, p)
            )
        else:
            # Last sample: no successor exists, so continue the previous slope.
            stretched.extend(
                current + k/p * (current - x[pos-1]) for k in range(1, p)
            )
    return stretched

def resample(inp,desired_length):
    """
    A helper function to resample a given time-series dataset.

    The series is first up-sampled to the least common multiple of its own
    length and the desired length, then down-sampled to the desired length.
    Note that the intermediate series therefore has lcm(len(inp),
    desired_length) entries.

    Input:
    inp: Time-series data (non-empty).
    desired_length: The number of entries that the sample needs to be resized to
                    (a positive integer).

    Output:
    The resampled time-series data, as a list of desired_length values.

    Raises:
    ValueError: if inp is empty or desired_length is not positive.
    """
    if len(inp) == 0:
        raise ValueError("cannot resample an empty time series")
    if desired_length <= 0:
        raise ValueError("desired_length must be a positive integer")
    l = compute_lcm(len(inp), desired_length)
    # l is a multiple of both lengths, so floor division is exact; it also
    # avoids the round-trip through a float that int(l / n) performs.
    up_ratio = l // len(inp)
    down_ratio = l // desired_length
    return downsample(upsample(inp, up_ratio), down_ratio)

In [8]:
def fill_in_nan(array, i, j, row, col):
    """
    Compute a replacement for the missing value at array[i][j] from the
    values around it in the same column.

    Input:
    array: 2-D numeric array containing the NaN to be replaced.
    i: row index of the missing value.
    j: column index of the missing value.
    row: number of rows in array.
    col: number of columns in array (unused; kept for interface compatibility).

    Output:
    The average of the valid values directly above and below the missing
    entry. If neither adjacent value is valid, the average of the nearest
    valid values further above and below is used instead. NaN is returned
    when the column contains no valid value at all.
    """
    neighbours = []
    if i != 0 and not np.isnan(array[i-1][j]):
        neighbours.append(array[i-1][j])
    if i != row-1 and not np.isnan(array[i+1][j]):
        neighbours.append(array[i+1][j])

    if not neighbours:
        # Both adjacent rows are missing (or out of range). The original code
        # divided by a zero count here; instead look further away for the
        # closest valid value in each direction.
        for k in range(i-2, -1, -1):
            if not np.isnan(array[k][j]):
                neighbours.append(array[k][j])
                break
        for k in range(i+2, row):
            if not np.isnan(array[k][j]):
                neighbours.append(array[k][j])
                break

    if not neighbours:
        # Nothing in this column to interpolate from.
        return np.nan
    return sum(neighbours)/len(neighbours)

**Read In Dataset**

Each dataset is cleaned as follows: only the relevant columns are kept, NaN values are filled in, and the rows are resampled to a common length. The result is returned in the format required for the model training below.

In [10]:
def read_data(file_locaton, all_files, train=False, desired_length=800):
    """
    Read in each dataset and convert them to a pandas dataframe.

    Input:
      file_locaton: the file directory for all the sample maneuvers.
                    (The misspelled parameter name is kept so existing
                    callers are unaffected.)
      all_files: the file names within the file directory.
      train: if True, the column headers without a leading space are used and
             a label is extracted from each file name; if False, the column
             headers with a leading space are used and no labels are returned.
      desired_length: number of entries every series is resampled to.
                      Defaults to 800, the average length of the training datasets.

    Output:
      df: a pandas dataframe with one row per file and one column per factor;
          each cell holds a pandas Series of desired_length values.
      labels (only when train is True): a pandas Series with one label per file.
    """
    # Use the directory that was passed in. The original body read a global
    # named `file_location`, silently ignoring this argument.
    file_location = file_locaton

    if train:
        factors = ['vx (m/s)', 'vy (m/s)', 'vz (m/s)', 'head (deg)', 'roll (deg)', 'pitch (deg)']
    else:
        factors = [' vx (m/s)', ' vy (m/s)', ' vz (m/s)', ' head (deg)', ' roll (deg)', ' pitch (deg)']

    array = []
    labels = []
    for file in all_files:
        df = pd.read_csv(os.path.join(file_location, file), sep='\t')
        df_array = df[factors].to_numpy()
        n_rows, n_cols = df_array.shape

        # Collect each factor as its own list, replacing NaNs on the way.
        # Replacements are computed from the raw array, not from values that
        # were already filled in.
        columns = [[] for _ in range(n_cols)]
        for i in range(n_rows):
            for j in range(n_cols):
                value = df_array[i][j]
                if np.isnan(value):
                    value = fill_in_nan(df_array, i, j, n_rows, n_cols)
                columns[j].append(value)

        # Bring every series to the same length so the files can be stacked.
        array.append([pd.Series(resample(column, desired_length)) for column in columns])

        if train:
            # The label is the part of the file name between its first 11 and
            # last 8 characters.
            labels.append(file[11:-8])

    if train:
        return pd.DataFrame(array, dtype=object), pd.Series(labels)
    return pd.DataFrame(array, dtype=object)


**Model Training**

Train the multivariate model by concatenating the variables (columns) of each sample into a single series.
Then use the trained model to predict the maneuver label of a new sample.

In [11]:
from sklearn.pipeline import Pipeline
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.dictionary_based import BOSSEnsemble

# Directory holding the labelled demonstration files used for training.
file_location = "/content/drive/MyDrive/ExpertDemo_tsv"
# Reuse the path instead of repeating the literal, and sort the listing:
# os.listdir returns entries in arbitrary order, so sorting keeps the row
# order of the training set stable between runs.
X_train, y_train = read_data(file_location, sorted(os.listdir(file_location)), True)

In [13]:
# Concatenate the factor columns of each sample into one long series, then
# classify it with a time-series forest. The forest is randomised, so a fixed
# random_state is set to make the fitted model and its prediction reproducible.
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100, random_state=42)),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)

# Predict the maneuver label for a single unlabelled recording.
file_name = ['12000001002.min.tsv']
file_location = "/content/drive/MyDrive/12000000000_tsv_good"
X_test = read_data(file_location, file_name)
y_pred = clf.predict(X_test)
y_pred


array(['StraightIn'], dtype='<U10')