In [1]:
# All useful libraries
import os
import librosa
import numpy as np
import sklearn
import pandas as pd

from models.linear_regression import LinearRegression
from sklearn.model_selection import train_test_split

## Load the dataset

We load the dataset from the `Human1` folder. The deconvolved file (`deconvoled_trim.npy`) contains only the audio signal we need to work on: the first clap and its reverb, with no background noise.

In [2]:
DATASET_PATH = "LivingRoom_preprocessed_hack/Human1"

# Human positions, one row per recording (the shape is printed below).
centroid_path = os.path.join(DATASET_PATH, "centroid.npy")
centroid = np.load(centroid_path)
print("Shape of Centroid:")
print(centroid.shape)

# Room impulse responses, laid out as
# (human locations, microphones, time samples).
# mmap_mode='r' opens the file as a read-only memory map instead of reading
# the whole array into memory at once.
rir_path = os.path.join(DATASET_PATH, "deconvoled_trim.npy")
RIRs = np.load(rir_path, mmap_mode='r')
print("Shape of RIRs:")
print(RIRs.shape)

Shape of Centroid:
(1000, 2)
Shape of RIRs:
(1000, 4, 667200)


As an example, compute the RMS of the first recording on the first channel (microphone):

In [3]:
# RMS over the last (time) axis of recording 0, microphone 0.
first_response = RIRs[0, 0]
rms_values = np.sqrt(np.mean(first_response**2, axis=-1))
rms_values

0.0005250508

Define an RMS function that we can apply to every recording and channel:

In [4]:
def rms(x):
    """Return the root-mean-square of ``x`` computed along its last axis.

    For a 1-D array this yields a single value; for an N-D array the last
    axis is reduced and the leading axes are kept.
    """
    mean_of_squares = np.mean(x**2, axis=-1)
    return np.sqrt(mean_of_squares)

In [5]:
# One tuple per row of `centroid`.
targets = [tuple(position) for position in centroid]

In [None]:
# One list of RMS values per microphone (channels 0..3).
c1, c2, c3, c4 = [], [], [], []
channel_buckets = (c1, c2, c3, c4)

# Process one (recording, microphone) trace at a time so that only a single
# trace of the memory-mapped RIRs array is handled per rms() call.
for recording_idx in range(RIRs.shape[0]):
    for mic_idx, bucket in enumerate(channel_buckets):
        bucket.append(rms(RIRs[recording_idx, mic_idx]))

We now have the RMS values for each microphone:

In [None]:
# Stack the four per-microphone lists as rows: the result has one row per
# microphone and one column per recording.
X_all = np.asarray((c1, c2, c3, c4))
X_all

The targets `Y` are the human positions stored in the `centroid` array:

In [None]:
# Work on an independent floating-point copy of the positions.
# A plain `Y_all = centroid` only creates a second name for the same array,
# so any in-place modification of Y_all would also overwrite `centroid`
# (and fractional values written into an integer array would be truncated).
Y_all = np.array(centroid, copy=True)
if not np.issubdtype(Y_all.dtype, np.floating):
    Y_all = Y_all.astype(float)
Y_all

We do not want to scale the positions relative to each other; instead, we scale them by the room dimensions provided in the research paper. To do so, we define the following functions:

In [None]:
def min_max_scale(x, min, max):
    """Linearly map ``x`` from the interval [min, max] to [0, 1].

    ``min`` maps to 0 and ``max`` maps to 1; values outside the interval map
    outside [0, 1]. Works on scalars and element-wise on NumPy arrays.

    Note: the parameter names shadow the built-in ``min``/``max`` inside this
    function; they are kept as-is for compatibility with existing callers.
    """
    span = max - min
    offset = x - min
    return offset / span


def min_max_unscale(x, min, max):
    """Linearly map ``x`` from [0, 1] back to the interval [min, max].

    This is the inverse of ``min_max_scale`` for the same bounds: 0 maps to
    ``min`` and 1 maps to ``max``. Works on scalars and element-wise on
    NumPy arrays.

    Note: the parameter names shadow the built-in ``min``/``max`` inside this
    function; they are kept as-is for compatibility with existing callers.
    """
    span = max - min
    stretched = x * span
    return stretched + min

In [None]:
# Per-axis bounds used for scaling: column 0 in [-4000, 500] and
# column 1 in [-4000, 2000]. The same bounds must be used when converting
# predictions back later on.
#
# The scaled targets are built from `centroid` rather than assigned into
# `Y_all` in place, so the raw positions are never overwritten and re-running
# this cell always gives the same result (in-place scaling would scale the
# already-scaled values again on a second run).
Y_all = np.column_stack((
    min_max_scale(centroid[:, 0], -4000, 500),
    min_max_scale(centroid[:, 1], -4000, 2000),
))
Y_all

We normalize the channel data, whose values are mostly on the order of 10e-5: if we kept them at that scale we would lose precision, since the model works with values between 0 and 1:

In [None]:
# axis=0 rescales each column independently to unit L2 norm. With X_all laid
# out as (microphones, recordings), a column holds the RMS values of one
# recording across the microphones.
X_all = sklearn.preprocessing.normalize(X_all, norm="l2", axis=0)
X_all

Split the dataset into 80% training and 20% testing:

In [None]:
# Turn the (microphones, recordings) matrix into (recordings, microphones) so
# that row i holds the features of recording i and lines up with Y_all[i].
# This must be a transpose: reshape() keeps the flat element order, so
# reshaping (4, n) to (n, 4) would put values of several different recordings
# from the same microphone into one row and break the pairing with Y_all.
# The shape guard keeps the cell safe to re-run.
if X_all.shape[0] != Y_all.shape[0]:
    X_all = X_all.T

# Fixed seed so the shuffled split (and therefore the reported error) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_all, Y_all, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)

In [None]:
# Shape of the training feature matrix returned by train_test_split.
X_train.shape

In [None]:
# Shape of the training targets; the first dimension should match X_train.
y_train.shape

Define the model:

In [None]:
# LinearRegression is the project's own class imported from
# models.linear_regression (not scikit-learn's estimator).
model = LinearRegression()

In [None]:
# Fit on the training split; y_train holds the min-max-scaled positions.
model.fit(X_train,y_train)

In [None]:
# Predictions for the held-out split. They are in the same scaled space as
# the training targets and are converted back to room coordinates later.
res = model.predict(X_test)

In [None]:
# Predictions before they are converted back to room coordinates.
print(res)

We have to convert the positions back to room coordinates:

In [None]:
# Convert the test targets back to room coordinates, one column at a time,
# with the same per-axis bounds that were used for scaling.
# NOTE: the values are written into y_test in place, so running this cell a
# second time without re-creating y_test would un-scale them twice.
for column, (lower, upper) in enumerate(((-4000, 500), (-4000, 2000))):
    y_test[:, column] = min_max_unscale(y_test[:, column], lower, upper)
y_test

In [None]:
# Convert the predictions back to room coordinates, one column at a time,
# with the same per-axis bounds that were used for scaling.
# NOTE: the values are written into res in place, so running this cell a
# second time without re-running the prediction would un-scale them twice.
for column, (lower, upper) in enumerate(((-4000, 500), (-4000, 2000))):
    res[:, column] = min_max_unscale(res[:, column], lower, upper)
print(res)

Compute the Euclidean distance between each predicted position and the corresponding true position:

In [None]:
# Per-sample error vector (prediction minus ground truth); its length along
# axis 1 is the Euclidean distance between predicted and true position.
position_error = res - y_test
dist = np.linalg.norm(position_error, axis=1)

Get the average to compute the error:

In [None]:
# Mean and standard deviation of the per-sample distance errors.
avg = dist.mean()
std = dist.std()
# Dividing by 10 reports the mean in the centimetres named in the message
# (presumably the positions are in millimetres; confirm against the dataset).
print("Distance difference from real position in centimeters:", avg / 10)