In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Mount Google Drive at /content/drive; FILE_PATH in the configuration cell points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

## Configuration

`FILE_PATH`: path of the file containing the data. \
`FS`: sampling frequency of the recordings in hertz. \
`ADL_TITLE`: the activities that were performed. The list index is the activity ID.


In [None]:
# Path of the CSV file that holds the recordings (below the Google Drive mount point).
FILE_PATH = '/content/drive/My Drive/MASTER/RAWFILES/UiS4ADL-50hz.csv'
# Sampling frequency in hertz; used to convert sample counts into seconds.
FS = 50
# Human-readable activity names; the list index is the ADL id compared against the `adl` column.
ADL_TITLE = ["No activity","Drink water", "Eat meal", "Open a bottle", "Open a box", "Brush teeth", "Brush hair",
             "Take off a jacket", "Put on a jacket", "Put on a shoe", "Take off a shoe", "Put on glasses", "Take off glasses",
             "Sit down", "Stand up", "Writing", "Phone call", "Type on a keyboard", "Salute (wave hand)",
             "Sneeze cough", "Blow nose", "Washing hands", "Dusting", "Ironing", "Washing dishes"]

## Load the Data

In [None]:
# Load the complete recording file; every later cell works on (and partly mutates) this frame.
data = pd.read_csv(FILE_PATH)

### Data to drop
Drop the data of subjects whose recordings are incorrect. \
**Change the variable `subjects_to_drop` according to your goal.**

In [None]:
# Subjects excluded from the analysis; all of their rows are removed from `data` in place.
subjects_to_drop = [1727,1826,2097]
excluded_rows = data['subject'].isin(subjects_to_drop)
to_drop = data.loc[excluded_rows]
data.drop(index=to_drop.index, inplace=True)
data

### Check for missing data and drop it if present

In [None]:
# Rows with at least one missing value, and the subjects / sessions / ADLs they belong to.
has_missing_value = data.isna().any(axis=1)
to_drop = data.loc[has_missing_value]
print(to_drop['subject'].unique(), to_drop['session'].unique(), to_drop['adl'].unique())

In [None]:
# Remove the incomplete rows collected in `to_drop`, then show the cleaned frame.
data.drop(index=to_drop.index, inplace=True)
data

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) after cleaning.
data.info()

In [None]:
# Visual check for remaining missing values: one heatmap row per column of `data`.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(16,5))
# Draw on the axes created above instead of relying on the implicit "current axes".
sns.heatmap(data.T.isna(), cmap='Blues', ax=ax)
ax.set_title('Fields with Missing Values', fontsize=16)
# `Tick.label` no longer exists in current Matplotlib releases (AttributeError);
# tick_params sets the same label size on every version.
ax.tick_params(axis='y', labelsize=14)
plt.show()

## Overview of the datasets

In [None]:
# Textual summary: first rows, per-column missing-value counts, and descriptive statistics
# (the first and the last four described columns are left out of the statistics table).
print("Dataset Overview:")
display(data.head())
print("")

print("Missing Values:")
missing_per_column = data.isna().sum()
print(missing_per_column)
print("")

print("Basic Statistics:")
display(data.describe().iloc[:, 1:-4])
print("")

### Distribution information

In [None]:
# Number of samples (rows) per ADL id. Rows are counted directly: the previous
# `count().unique()[0]` returned the non-null count of whichever column came first,
# which only equals the row count when no NaN is left in the frame.
adl_id = list(range(len(ADL_TITLE)))
samples_per_adl = data['adl'].value_counts()
# ADL ids that never occur in the data are reported as 0 samples.
number_samples = [int(samples_per_adl.get(i, 0)) for i in adl_id]

fig, ax = plt.subplots(1, 1, figsize=(30, 20))
ax.bar(adl_id, number_samples)
ax.set_title("Total ADLs samples", fontdict={'size': 25})
ax.set_xlabel("ADL", fontdict={'size': 25})
ax.set_ylabel("Samples", fontdict={'size': 25})
ax.tick_params(axis='both', labelsize=25)
ax.set_xticks(adl_id)

# Write the exact sample count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, height, f' {height:.0f}', fontsize=20,
            ha='center', va='bottom', rotation=15)

In [None]:
# Total recorded time in seconds: each ADL's sample count divided by FS, then summed.
seconds_per_adl = np.asarray(number_samples) / FS
distribution_seconds = seconds_per_adl.sum()
print("Total ADLs samples in seconds: " + str(distribution_seconds) + " seconds")

In [None]:
# Per-ADL session multiplier (index = ADL id) used for the red reference lines of the
# second chart -- presumably the planned number of sessions per subject; confirm.
session_for_adl = [5,5,3,5,5,5,5,3,3,3,3,5,5,5,5,5,5,5,5,5,5,5,5,5,5]

# Distinct subjects and distinct recording files (fileID) per ADL id.
number_subjects = []
session_count = []
for adl in range(len(ADL_TITLE)):
    adl_rows = data.loc[data['adl'] == adl]
    number_subjects.append(len(adl_rows['subject'].unique()))
    session_count.append(len(adl_rows['fileID'].unique()))

fig, ax = plt.subplots(2, 1, figsize=(30, 20))
subjects_ax = ax[0]
sessions_ax = ax[1]

# Top chart: subjects per ADL, with the total number of subjects as a red reference.
subjects_ax.bar(adl_id, number_subjects)
subjects_ax.set_title("Unique subjects performing ADLs", fontdict={'size': 25})
subjects_ax.set_xlabel("ADL", fontdict={'size': 25})
subjects_ax.set_ylabel("Unique subjects count", fontdict={'size': 25})
subjects_ax.tick_params(axis='both', labelsize=25)
subjects_ax.set_xticks(range(0,len(ADL_TITLE)))
total_subjects = len(data['subject'].unique())
for adl in range(len(number_subjects)):
    subjects_ax.hlines(y=total_subjects, xmin=adl-0.4, xmax=adl+0.4, linewidth=2, color='r')

for bar in subjects_ax.patches:
    bar_height = bar.get_height()
    subjects_ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
                     ha='center', va='bottom')

# Bottom chart: files per ADL, with subjects * multiplier as a red reference (ADL 0 skipped).
sessions_ax.bar(adl_id, session_count)
sessions_ax.set_title("Total ADLs sessions", fontdict={'size': 25})
sessions_ax.set_xlabel("ADL", fontdict={'size': 25})
sessions_ax.set_ylabel("Sessions count", fontdict={'size': 25})
sessions_ax.tick_params(axis='both', labelsize=25)
sessions_ax.set_xticks(range(0,len(ADL_TITLE)))
for adl in range(1, len(number_subjects)):
    sessions_ax.hlines(y=number_subjects[adl]*session_for_adl[adl], xmin=adl-0.4, xmax=adl+0.4, linewidth=2, color='r')

for bar in sessions_ax.patches:
    bar_height = bar.get_height()
    sessions_ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
                     ha='center', va='bottom')


In [None]:
# Number of samples per session id (ids in order of first appearance in the data).
session_id = list(data['session'].unique())
session_samples = [int((data['session'] == s).sum()) for s in session_id]

fig, ax = plt.subplots(1, 1, figsize=(30, 20))
ax.bar(session_id, session_samples)
ax.set_title("Total sessions samples", fontdict={'size': 25})
ax.set_xlabel("Session", fontdict={'size': 25})
ax.set_ylabel("Samples", fontdict={'size': 25})
ax.tick_params(axis='both', labelsize=25)

# Write the exact sample count above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
            ha='center', va='bottom')

In [None]:
# Distinct subjects and distinct recording files (fileID) per session id.
session_id = list(data['session'].unique())
number_subjects = []
total_session_count = []
for s in session_id:
    session_rows = data.loc[data['session'] == s]
    total_session_count.append(len(session_rows['fileID'].unique()))
    number_subjects.append(len(session_rows['subject'].unique()))

fig, ax = plt.subplots(2, 1, figsize=(30, 20))
subjects_ax = ax[0]
files_ax = ax[1]

subjects_ax.bar(session_id, number_subjects)
subjects_ax.set_title("Unique subjects performing sessions", fontdict={'size': 25})
subjects_ax.set_xlabel("Session", fontdict={'size': 25})
subjects_ax.set_ylabel("Unique subjects count", fontdict={'size': 25})
subjects_ax.tick_params(axis='both', labelsize=25)

for bar in subjects_ax.patches:
    bar_height = bar.get_height()
    subjects_ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
                     ha='center', va='bottom')

files_ax.bar(session_id, total_session_count)
files_ax.set_title("Total sessions", fontdict={'size': 25})
files_ax.set_xlabel("Session", fontdict={'size': 25})
files_ax.set_ylabel("Sessions count", fontdict={'size': 25})
files_ax.tick_params(axis='both', labelsize=25)

for bar in files_ax.patches:
    bar_height = bar.get_height()
    files_ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
                  ha='center', va='bottom')

In [None]:
# Number of samples per subject; ids are converted to strings so the bars are evenly
# spaced categories instead of positions on a numeric axis.
unique_subjects = data['subject'].unique()
subject_id = [str(s) for s in unique_subjects]
subject_samples = [int((data['subject'] == s).sum()) for s in unique_subjects]

fig, ax = plt.subplots(1, 1, figsize=(30, 20))
ax.bar(subject_id, subject_samples)
ax.set_title("Total subject samples", fontdict={'size': 25})
ax.set_xlabel("Subject", fontdict={'size': 25})
ax.set_ylabel("Samples", fontdict={'size': 25})
ax.xaxis.set_tick_params(labelsize=25, rotation=270)
ax.yaxis.set_tick_params(labelsize=25)

# Write the exact sample count above each bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height, f' {bar_height:.0f}', fontsize=20,
            ha='center', va='bottom')

In [None]:
# Distinct ADLs and distinct recording files (fileID) per subject.
subject_ADL = []
subject_sessions = []
subject_id = []

for s in data.subject.unique():
    subject_selector = data[data['subject'] == s]
    subject_ADL.append(len(subject_selector.adl.unique()))
    subject_sessions.append(len(subject_selector.fileID.unique()))
    # String ids give evenly spaced categorical bars.
    subject_id.append(str(s))


fig, ax = plt.subplots(2, 1, figsize=(30, 30))
ax[0].bar(subject_id, subject_ADL)
ax[0].set_title("Total ADLs performed by subjects", {'size': 25})
ax[0].set_xlabel("Subject", {'size': 25})
ax[0].set_ylabel("ADLs count", {'size': 25})
ax[0].xaxis.set_tick_params(labelsize=25, rotation=270)
ax[0].yaxis.set_tick_params(labelsize=25)

for bar in ax[0].patches:
    height = bar.get_height()
    ax[0].text(bar.get_x() + bar.get_width() / 2.0, height, f' {height:.0f}', fontsize=20,
            ha='center', va='bottom')

ax[1].bar(subject_id, subject_sessions)
ax[1].set_title("Total sessions performed by subjects", {'size': 25})
# The bars are one per subject, so the x axis is labelled "Subject" (it used to say "Session").
ax[1].set_xlabel("Subject", {'size': 25})
ax[1].set_ylabel("Sessions count", {'size': 25})
ax[1].xaxis.set_tick_params(labelsize=25, rotation=270)
ax[1].yaxis.set_tick_params(labelsize=25)

for bar in ax[1].patches:
    height = bar.get_height()
    ax[1].text(bar.get_x() + bar.get_width() / 2.0, height, f' {height:.0f}', fontsize=20,
            ha='center', va='bottom')

### ADL visualization plots
**Change the variables `columns` and `rows` to match the desired plot layout.**

In [None]:
# One column of mean feature values per ADL id; `correlation_dataframe.corr()` is plotted
# as a heatmap further down.
correlation_dataframe = pd.DataFrame()

rows = 5
columns = 5
# squeeze=False keeps `ax` two-dimensional even when rows or columns is set to 1.
# rows * columns must be at least len(ADL_TITLE).
fig, ax = plt.subplots(rows, columns, figsize=(30, 20), squeeze=False)
for i in range(len(ADL_TITLE)):
    row, col = divmod(i, columns)

    # Slice the feature columns (everything except the first and the last four columns)
    # BEFORE averaging: mean() over the whole frame raises on non-numeric columns in
    # current pandas and silently dropped them in older versions, which shifted the
    # positional slice that used to be applied to the result.
    top_features_mean = data.loc[data['adl'] == i].iloc[:, 1:-4].mean()

    sns.lineplot(y=top_features_mean, x=top_features_mean.index, ax=ax[row][col])
    correlation_dataframe[i] = top_features_mean
    ax[row][col].set_title(ADL_TITLE[i])
    ax[row][col].set_xticklabels([])

In [None]:
# Pairwise correlation between the per-ADL mean feature profiles.
fig, ax = plt.subplots(figsize=(26, 22))
adl_correlation = correlation_dataframe.corr()
sns.heatmap(adl_correlation, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('ADL Correlation Heatmap')
ax.grid(False)
plt.show()

### Dataset Distribution Study x Session (in seconds)

In [None]:
# Seconds of data per (ADL, session): sample counts divided by FS, plus a 'Total'
# column (per ADL) and a 'Total' row (per session).
table_1 = pd.crosstab(index=data["adl"], columns=data["session"]).div(FS)
table_1 = table_1.assign(Total=table_1.sum(axis=1))
table_1.loc['Total'] = table_1.sum(axis=0).to_numpy()
table_1 = table_1.astype(float).round(2)

table_1.T

In [None]:
# Summary statistics per ADL across sessions (the 'Total' row/column and the 'count' row are left out).
table_1.T.iloc[:-1,:-1].describe().iloc[1:,0:]

In [None]:
# Summary statistics per session across ADLs (the 'Total' row/column and the 'count' row are left out).
table_1.iloc[:-1,:-1].describe().iloc[1:,0:]

### Dataset Distribution Study x Subject (in seconds)

In [None]:
# Seconds of data per (ADL, subject): sample counts divided by FS, plus a 'Total'
# column (per ADL) and a 'Total' row (per subject).
table_1 = pd.crosstab(index=data["adl"], columns=data["subject"]).div(FS)
table_1 = table_1.assign(Total=table_1.sum(axis=1))
table_1.loc['Total'] = table_1.sum(axis=0).to_numpy()
table_1 = table_1.astype(float).round(2)
table_1.T


In [None]:
# Summary statistics per ADL across subjects (the 'Total' row/column and the 'count' row are left out).
table_1.T.iloc[:-1,:-1].describe().iloc[1:,0:]

In [None]:
# Summary statistics per subject across ADLs (the 'Total' row/column and the 'count' row are left out).
table_1.iloc[:-1,:-1].describe().iloc[1:,0:]

### Dataset Distribution Study x Subject and Session (in seconds)

In [None]:
# Seconds of data per (subject, session): sample counts divided by FS, plus a 'Total'
# column (per subject) and a 'Total' row (per session).
table_1 = pd.crosstab(index=data["subject"], columns=data["session"]).div(FS)
table_1 = table_1.assign(Total=table_1.sum(axis=1))
table_1.loc['Total'] = table_1.sum(axis=0).to_numpy()
table_1 = table_1.astype(float).round(2)
table_1

In [None]:
# Summary statistics per subject across sessions (the 'Total' row/column and the 'count' row are left out).
table_1.T.iloc[:-1,:-1].describe().iloc[1:,0:]

In [None]:
# Summary statistics per session across subjects (the 'Total' row/column and the 'count' row are left out).
table_1.iloc[:-1,:-1].describe().iloc[1:,0:]

### Specific ADL and session analytics

In [None]:
# Keep only the rows where at least one of the last four columns differs from the row
# before it, i.e. the first row of every run of identical values in those columns.
segment_keys = data.iloc[:, -4:]
same_as_previous = (segment_keys.shift() == segment_keys).all(axis=1)
data_series = segment_keys[~same_as_previous]
display(data_series)

In [None]:
# Adding samples and duration to the dataframe.
#
# The length of a segment is the number of rows of `data` between its first row and the
# first row of the next segment. Row POSITIONS are used rather than index labels: rows
# were dropped from `data` earlier without resetting the index, so label differences
# would count the dropped rows as samples of the preceding segment.
# NOTE(review): assumes the index of `data` is unique (default read_csv index) -- confirm.
segment_starts = data.index.get_indexer(data_series.index)
assert (segment_starts >= 0).all(), "data_series contains rows that are no longer in data"

# The last segment runs to the end of the data (it used to be recorded as 0 samples).
segment_ends = np.append(segment_starts[1:], len(data))
stats_samples = (segment_ends - segment_starts).astype(int).tolist()

# Copy first so the new columns are written to an independent frame, not to a slice of `data`.
data_series = data_series.copy()
data_series['Samples'] = stats_samples
data_series['Duration (sec)'] = data_series['Samples']/FS

display(data_series)

### Outliers using IQR
Outliers in percentage

In [None]:
# Share of segments per ADL whose duration lies outside the 1.5 * IQR fences.
# The printed value is a fraction between 0 and 1, rounded to four decimals.
threshold = 1.5
for i in range(len(ADL_TITLE)):
    adl_selector = data_series[data_series['adl'] == i]
    if adl_selector.empty:
        # An ADL without any segment would otherwise raise ZeroDivisionError below.
        print('ADL ' + str(i) + ": no segments")
        continue

    durations = adl_selector['Duration (sec)']
    Q1 = durations.quantile(0.25)
    Q3 = durations.quantile(0.75)
    IQR = Q3 - Q1

    lower_fence = Q1 - threshold * IQR
    upper_fence = Q3 + threshold * IQR
    outliers = adl_selector[(durations < lower_fence) | (durations > upper_fence)]

    print('ADL ' + str(i) + ": " + str(round(len(outliers)/len(adl_selector),4)))

ADL specific outliers

In [None]:
# Segments of ADL 12 whose duration lies outside the 1.5 * IQR fences.
threshold = 1.5
adl_selector = data_series.loc[data_series['adl'] == 12]
durations = adl_selector['Duration (sec)']
Q1, Q3 = durations.quantile(0.25), durations.quantile(0.75)
IQR = Q3 - Q1

too_short = durations < (Q1 - threshold * IQR)
too_long = durations > (Q3 + threshold * IQR)
outliers = adl_selector[too_short | too_long]

display(outliers)

## Find window size and group

In [None]:
# Duration statistics per ADL: mean, std, min, quartiles, max and skewness of the
# segment durations. The result has one column per ADL ('A<id>') and one row per statistic.
adls_stats = {}
# The loop variable is deliberately not called `adl_id`: that name holds the list of
# ADL ids used by the bar charts in the distribution section.
for adl_key in data_series['adl'].unique():
    # Select the duration column by name rather than by position, so the result does not
    # depend on how many (numeric) columns data_series happens to have.
    durations = data_series.loc[data_series['adl'] == adl_key, 'Duration (sec)']
    summary = durations.describe().iloc[1:]  # drop the 'count' row
    summary['skew'] = durations.skew()
    adls_stats[adl_key] = summary

sorted_adls_stats = dict(sorted(adls_stats.items()))
# Label the columns with the ADL ids that are actually present; a fixed A0..A<n> list
# fails with a length mismatch as soon as one ADL has no segment.
adls = ['A' + str(int(key)) for key in sorted_adls_stats]
adls_stats = pd.concat(
    [sorted_adls_stats[key].rename(label) for key, label in zip(sorted_adls_stats, adls)],
    axis=1,
)
adls_stats = adls_stats.round(2)
adls_stats

**Change the variables `columns` and `rows` to match the desired plot layout.**

In [None]:
# One small line plot per ADL showing its duration statistics.
# Expects `adls_stats` with one column per ADL and the statistics as rows.
rows = 5
columns = 5

stat_labels = ['mean', 'std', 'min', '25%', '50%', '75%', 'max', 'skew']

# squeeze=False keeps `ax` two-dimensional even when rows or columns is set to 1.
fig, ax = plt.subplots(rows, columns, figsize=(20, 20), squeeze=False)
fig.subplots_adjust(hspace=0.5)
for adl_number in range(0, len(ADL_TITLE)):
    row, col = divmod(adl_number, columns)
    stat_values = list(adls_stats.iloc[:, adl_number])

    sns.lineplot(y=stat_values, x=stat_labels, ax=ax[row][col], marker='o')
    ax[row][col].set_title("adl " + str(adl_number))

    # std/skew are NaN for ADLs with very few segments; the builtin max() may then return
    # NaN, which set_ylim rejects. Ignore NaN and skip the limit if nothing finite is left.
    upper_limit = np.nanmax(stat_values)
    if np.isfinite(upper_limit):
        ax[row][col].set_ylim(0, upper_limit)

**Change the variables `columns` and `rows` to match the desired plot layout.**

In [None]:
# Histogram of segment durations per ADL with the quartiles drawn as dashed lines.
# Expects `adls_stats` with one column per ADL and the statistics as rows.
rows = 5
columns = 5

# (statistic label, line colour, legend patch colour)
quartile_styles = [('25%', 'r', 'red'), ('50%', 'b', 'blue'), ('75%', 'g', 'green')]

# squeeze=False keeps `ax` two-dimensional even when rows or columns is set to 1.
fig, ax = plt.subplots(rows, columns, figsize=(20, 20), squeeze=False)
fig.subplots_adjust(hspace=0.5)
for adl_number in range(0, len(ADL_TITLE)):
    row, col = divmod(adl_number, columns)
    panel = ax[row][col]
    statistics = adls_stats.iloc[:, adl_number]

    sns.histplot(data = data_series[data_series['adl'] == adl_number], x = "Duration (sec)", ax=panel)

    # Look the quartiles up by label: integer keys on a label-indexed Series are treated
    # as positions only by a deprecated pandas fallback.
    legend_handles = []
    for label, line_color, patch_color in quartile_styles:
        panel.axvline(x=statistics[label], color=line_color, linestyle='--')
        legend_handles.append(mpatches.Patch(color=patch_color, label=label))

    panel.legend(handles=legend_handles)
    panel.set_title("ADL " + str(adl_number))

In [None]:
# Put the ADLs on the rows and the statistics on the columns. The guard makes the cell
# safe to re-run: an unconditional transpose would flip the table back on the second run.
if '25%' not in adls_stats.columns:
    adls_stats = adls_stats.T
adls_stats['25%']

In [None]:
# Summary statistics of the '25%' column of adls_stats, rounded to two decimals.
display(round(adls_stats['25%'].describe(),2))

**Change the variables `thresholds` and `stat_type` according to your goal.**

Threshold examples:
- ["5", "5-10", "10"]
    - threshold <=5 , threshold > 5 < 10, threshold >= 10
- ["5", "10"]
    - threshold <=5, threshold >= 10
- ["5", "5-7", "7-9", "10"]
    - threshold <=5, threshold > 5 < 7, threshold > 7 < 9, threshold >= 10

ADL 0 is removed because it is not used in the following analysis.

In [None]:
# Remove the row of ADL 0. errors='ignore' keeps the cell re-runnable once 'A0' is gone.
adls_stats.drop('A0', inplace=True, errors='ignore')

In [None]:
# Split the ADLs into groups according to one duration statistic (`stat_type`).
# `thresholds` is read as: first entry "<= a", every "a-b" entry "> a and < b",
# any other later entry ">= a". Each group is stored in `threshold_results` as a Series
# holding the statistic for the ADLs inside the group and 0 for all others.
thresholds = ["5", "5-10", "10"]
stat_type = '25%'
threshold_results = []

# plt.subplots returns a single Axes (not an array) for one column, hence the guard.
if len(thresholds) > 1:

    X = adls_stats.index
    X_axis = np.arange(1,len(X)+1)
    stat_values = adls_stats[stat_type]

    fig, ax = plt.subplots(1, len(thresholds), figsize=(20, 20))

    for i, threshold_spec in enumerate(thresholds):

        threshold_split = threshold_spec.split("-")

        if i == 0:
            in_group = stat_values <= int(threshold_spec)
            title = "threshold <= " + threshold_spec
        elif len(threshold_split) == 2:
            first_threshold = int(threshold_split[0])
            second_threshold = int(threshold_split[1])
            # Strict bounds, as the title says. The previous inclusive `between` put a
            # value equal to a bound into two groups at once (e.g. 5 in "<= 5" and "5-10").
            # A value equal to the shared bound of two adjacent ranges is in neither.
            in_group = (stat_values > first_threshold) & (stat_values < second_threshold)
            title = "threshold > " + threshold_split[0] + " < " + threshold_split[1]
        else:
            in_group = stat_values >= int(threshold_spec)
            title = "threshold >= " + threshold_spec

        # Keep the statistic for group members, 0 for everything else.
        result_array = stat_values * in_group.astype(int)
        threshold_results.append(result_array)

        ax[i].set_title(title)
        ax[i].set_xlabel("ADLs")
        ax[i].set_ylabel("Duration")
        ax[i].bar(X_axis, result_array, 0.4, label = stat_type)

In [None]:
# For every threshold group: show the statistics of its ADLs, the group size and its labels.
for result in threshold_results:
    plotted_result = result.loc[result > 0]
    belongs_to_group = adls_stats.index.isin(plotted_result.index)
    display(adls_stats.loc[belongs_to_group].round(2))
    print("Size: " + str(len(plotted_result)))
    print(plotted_result.index)
    print("")

**Change the variable `segment_thresholds` according to your goal.**

In [None]:
# For each threshold group, report the percentage of segments that are shorter than the
# candidate window size and would therefore be lost. segment_thresholds[k] is applied
# to threshold_results[k].
segment_thresholds = [2, 5, 10]

if len(segment_thresholds) != len(threshold_results):
    print("Warning: segment_thresholds and threshold_results differ in length; "
          "only the common leading entries are evaluated.")

# zip stops at the shorter list, so a length mismatch cannot raise IndexError.
for window_size, group_result in zip(segment_thresholds, threshold_results):

    segment_dataframe = group_result[group_result > 0]

    print("threshold < " + str(window_size))
    # Index labels look like 'A7'; strip the leading letter to recover the ADL id.
    adls_in_threshold = [int(label[1:]) for label in segment_dataframe.index]
    adls_dataframe = data_series[data_series['adl'].isin(adls_in_threshold)]

    if adls_dataframe.empty:
        # Nothing to divide by for an empty group.
        print("no segments in this group")
        print()
        continue

    too_short = adls_dataframe['Duration (sec)'] < window_size

    # Round after scaling to percent, so the output is free of float noise such as 12.340000000000002.
    val = round(float(too_short.mean()) * 100, 2)
    print(val, " % of lost segments for all ADL")

    for adl in adls_in_threshold:
        of_this_adl = adls_dataframe['adl'] == adl
        if not of_this_adl.any():
            print("n/a", " % of lost segments for ADL ", adl)
            continue
        val = round(float(too_short[of_this_adl].mean()) * 100, 2)
        print(val, " % of lost segments for ADL ", adl)
    print()


## Feature selection - correlation and variance

**Change the variable `groups` according to the observations from the previous section.**

In [None]:
# One row per group: the window size and the list of ADL ids assigned to it.
# The feature-selection loop at the end of the notebook runs once per row.
groups = pd.DataFrame({
    'windowSize': [2,5,10],
    'adl': [[1, 3, 4, 7, 10, 11, 12, 13, 14, 16, 19, 20], [6, 8, 9, 18, 21, 24], [2, 5, 15, 17, 22, 23]]
})
groups

### Feature variance for all ADLs

In [None]:
# Variance of every feature column (all columns except the first and the last four)
# over the whole data set, listed from lowest to highest.
variance_features = data.iloc[:, 1:-4].var()
variance_features_df = pd.DataFrame({"Feature": data.columns.to_list()[1:-4]})
variance_features_df['Variance'] = variance_features.values
sorted_variances = variance_features_df.sort_values(by='Variance')

display(sorted_variances)

### Feature variance based on ADL ID
List the features to remove for each ADL ID based on variance. \
1 means remove, 0 means keep. \
**Change the variable `variance_threshold` according to your goal.**

In [None]:
# Build a 0/1 table: one row per feature, one column per ADL id (1..n). A cell is 1 when
# the feature's variance within that ADL is not above `variance_threshold` (remove), else 0.
variance_threshold = 0.00
feature_remove = pd.DataFrame({"Feature_remove": data.columns.to_list()[1:-4]})

for i in range(1,len(ADL_TITLE)):
    variance_features = data.loc[data['adl'] == i].iloc[:, 1:-4].var()
    above_threshold = (variance_features > variance_threshold).to_numpy()
    feature_keep_index = variance_features.index[above_threshold].tolist()

    feature_remove[i] = [0 if feature in feature_keep_index else 1
                         for feature in feature_remove['Feature_remove']]

# 'Total' column: number of ADLs that flag the feature; last row: column sums.
feature_remove['Total'] = feature_remove.iloc[:, 1:].sum(axis=1)
feature_column = feature_remove.sum(axis=0).values
feature_column[0] = 'Total'
feature_remove.loc[len(feature_remove)] = feature_column
display(feature_remove)

Remove the features that were found to have low variance in at least one of the ADLs.

In [None]:
# Every feature flagged for at least one ADL (Total > 0) is removed from the bookkeeping
# table and from `data`. The last row of `feature_remove` holds the column totals and is
# stripped before the check.
new_feature_remove = feature_remove.iloc[:-1, :]
flagged_rows = new_feature_remove['Total'] > 0
drop_features = new_feature_remove.loc[flagged_rows, 'Feature_remove'].tolist()

rows_to_keep = ~new_feature_remove['Feature_remove'].isin(drop_features)
feature_remove = new_feature_remove[rows_to_keep].reset_index(drop=True)

# Re-append the totals row for the remaining features.
feature_column = feature_remove.sum(axis=0).values
feature_column[0] = 'Total'
feature_remove.loc[len(feature_remove)] = feature_column

data = data.drop(columns=drop_features)

### Feature correlation based on ADL ID

The features with the highest variance are checked first for correlation.

**Change the variable `correlation_threshold` according to your goal.**

In [None]:
# For every ADL, flag features that are highly correlated with a feature of higher
# variance. A flag is a 1 in column <ADL id> of the bookkeeping table.
correlation_threshold = 0.95

# Strip the totals row and column and take an explicit copy: the flags below are written
# with .loc on this frame. The previous chained assignment (`frame[i][j] = 1`) on a slice
# is not guaranteed to reach the frame and does nothing under pandas Copy-on-Write.
new_feature_remove = feature_remove.iloc[:-1, :-1].copy()

# `data` is not modified inside the loop, so the feature list is computed once.
feature_columns = data.columns.tolist()[1:-4]

for i in range(1,len(ADL_TITLE)):
    adl_features = data.loc[data['adl'] == i, feature_columns]

    # Order the features by descending variance within this ADL.
    variance_features = adl_features.var()
    highest_variance_features = variance_features.sort_values(ascending=False).index.tolist()

    # Absolute correlation matrix with rows/columns in that order.
    correlation_matrix = adl_features[highest_variance_features].corr().abs()

    # Walk the features from highest to lowest variance. A feature that is still kept
    # flags every other feature correlated with it above the threshold, so from each
    # correlated group only the highest-variance member survives.
    correlation_features_remove = []
    for column in correlation_matrix.columns:
        if column in correlation_features_remove:
            continue

        is_correlated = (correlation_matrix[column] > correlation_threshold).to_numpy()
        for feature in correlation_matrix.index[is_correlated]:
            if feature != column and feature not in correlation_features_remove:
                correlation_features_remove.append(feature)

    flagged_rows = new_feature_remove['Feature_remove'].isin(correlation_features_remove)
    new_feature_remove.loc[flagged_rows, i] = 1

# 'Total' column: number of ADLs that flag the feature; last row: column sums.
new_feature_remove['Total'] = new_feature_remove.iloc[:, 1:].sum(axis=1)
feature_column = new_feature_remove.sum(axis=0).values
feature_column[0] = 'Total'
new_feature_remove.loc[len(new_feature_remove)] = feature_column
feature_remove = new_feature_remove
display(feature_remove)


Remove the features that were found to be highly correlated with another feature in at least one of the ADLs.

In [None]:
# Every feature flagged for at least one ADL (Total > 0) is removed from the bookkeeping
# table and from `data`. The last row of `feature_remove` holds the column totals and is
# stripped before the check.
new_feature_remove = feature_remove.iloc[:-1, :]
flagged_rows = new_feature_remove['Total'] > 0
drop_features = new_feature_remove.loc[flagged_rows, 'Feature_remove'].tolist()

rows_to_keep = ~new_feature_remove['Feature_remove'].isin(drop_features)
feature_remove = new_feature_remove[rows_to_keep].reset_index(drop=True)

# Re-append the totals row for the remaining features.
feature_column = feature_remove.sum(axis=0).values
feature_column[0] = 'Total'
feature_remove.loc[len(feature_remove)] = feature_column

data = data.drop(columns=drop_features)

### Feature selection methods

In [None]:
def get_important_features_GRF(data):
    """Select and rank features with a grid-searched random forest.

    A RandomForestClassifier is tuned with a 5-fold GridSearchCV to predict the `adl`
    column from all other columns. SelectFromModel (with its default threshold) is then
    applied to the best estimator, and the impurity-based importances of the selected
    features are drawn as a bar chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column `adl`; every other column is used as a feature.

    Returns
    -------
    list
        Names of the selected features, sorted by descending importance. When the number
        of selected features is odd, the least important one is left out.
    """
    # Split data into features and target
    X = data.drop(columns=['adl'])
    y = data['adl']

    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],  # Number of trees in the forest
        'max_depth': [None, 10, 20],  # Maximum depth of the tree
        'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
    }

    # Fixed random_state keeps the forest reproducible between runs.
    clf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X, y)

    # Best estimator found by the grid search and its per-feature importances.
    best_clf = grid_search.best_estimator_
    feature_importances = best_clf.feature_importances_

    # Positions of the features kept by SelectFromModel. Only the selection is needed,
    # so the data itself is not transformed.
    model = SelectFromModel(best_clf, prefit=True)
    cols_idxs = model.get_support(indices=True)

    # Importance of every selected feature, most important first.
    feature_importance_dict = {X.columns[col_idx]: feature_importances[col_idx] for col_idx in cols_idxs}
    feature_importance_dict = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

    # NOTE(review): an odd number of selected features is made even by dropping the least
    # important one (the last item after sorting) -- the reason is not visible here; confirm.
    if len(feature_importance_dict) % 2 != 0:
        feature_importance_dict.popitem()

    x_labels = list(feature_importance_dict.keys())
    y_values = list(feature_importance_dict.values())

    # Plot the data using a bar plot
    plt.figure(figsize=(8, 6))  # Set figure size
    plt.bar(x_labels, y_values, color='skyblue')  # Create a bar plot

    # Customize the plot
    plt.title('Feature importance MDI')
    plt.xlabel('')
    plt.ylabel('Mean decrease impurity')

    return x_labels

Finding top features to use for each group based on the remaining features.

In [None]:
# Run the feature ranking once per group, using only the rows of that group's ADLs and
# without the bookkeeping columns.
adl_list = groups['adl'].values
window_sizes = groups['windowSize'].values
drop_features = ["timestamp", "session", "subject", "fileID"]

for window_size, group_adls in zip(window_sizes, adl_list):
    data_copy = data.loc[data['adl'].isin(group_adls)].drop(columns=drop_features)

    important_features = get_important_features_GRF(data_copy)

    print("Important features (" + str(window_size) + '): ' + str(important_features))
