## Libraries

In [1]:
# For handling data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For Outlier Detection
from sklearn.ensemble import IsolationForest

# For Dimensionality Reduction
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# For evaluating the performance of the preprocessing
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

def mean_euclidean_distance_error(estimator, X_test, y_test):
    """Summarise how far an estimator's predictions land from the true targets.

    The error of a single sample is the Euclidean (L2) distance between its
    true target vector and the vector predicted by ``estimator``.

    Parameters
    ----------
    estimator : object
        Fitted model exposing a ``predict(X)`` method.
    X_test : array-like
        Samples to predict on; passed unchanged to ``estimator.predict``.
    y_test : array-like of shape (n_samples,) or (n_samples, n_outputs)
        True targets. NumPy arrays, nested lists and pandas DataFrames are
        all accepted (they are converted with ``np.asarray``).

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation (NumPy default, ``ddof=0``) of the
        per-sample Euclidean distances.

    Raises
    ------
    ValueError
        If the predictions and the true targets do not have the same shape.
    """
    # np.asarray makes DataFrames iterate row-wise; zipping a DataFrame
    # directly would walk over its column labels instead of its rows.
    y_true = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(estimator.predict(X_test), dtype=float)

    # Treat single-output targets as one-column matrices.
    if y_true.ndim == 1:
        y_true = y_true[:, np.newaxis]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, np.newaxis]

    # Fail loudly instead of silently truncating or broadcasting.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test has shape {y_true.shape} but predictions have shape {y_pred.shape}"
        )

    distances = np.linalg.norm(y_true - y_pred, axis=1)
    return np.mean(distances), np.std(distances)



# Understanding the dataset `development.csv`

In [2]:
# Folder holding the project's input files, and the development set read from it
path = '/kaggle/input/winterproject/'
df = pd.read_csv(f'{path}development.csv')

In [3]:
# Preview the first five rows to get a feel for the columns
df.head(n=5)

Unnamed: 0,x,y,pmax[0],negpmax[0],area[0],tmax[0],rms[0],pmax[1],negpmax[1],area[1],...,pmax[16],negpmax[16],area[16],tmax[16],rms[16],pmax[17],negpmax[17],area[17],tmax[17],rms[17]
0,200.0,200.0,4.066907,-17.690173,2.847932,0.6,2.007042,5.409161,-17.72121,4.538778,...,607.109118,-36.282996,583.899899,72.373094,0.374498,614.916861,-39.848523,591.852768,72.331028,0.405595
1,200.0,200.0,5.916801,-4.717111,5.792778,79.765174,1.564535,4.414289,-4.736827,3.720435,...,630.348007,-39.715988,580.042799,71.029155,0.403258,624.950701,-41.266681,586.569646,71.089058,0.40589
2,200.0,200.0,4.286652,-5.316132,2.35639,74.6,1.404622,4.567191,-5.985437,3.49049,...,613.880342,-40.679678,580.407491,71.892264,0.568777,596.437125,-42.712286,574.091695,71.943934,0.498019
3,200.0,200.0,4.003635,-4.617459,2.189005,43.0,1.512162,5.019058,-4.229949,6.7862,...,600.714957,-43.206601,579.882635,72.357388,0.255483,591.763739,-50.68194,584.099483,72.333282,0.336454
4,200.0,200.0,4.448146,-4.848743,3.997002,196.667482,1.101113,3.250262,-5.783587,2.449456,...,609.723785,-43.570892,590.156125,71.24913,0.413855,606.917023,-49.923819,584.316142,71.242904,0.293824


In [4]:
# Targets: the two coordinates we want to predict
y = df.loc[:, ['x', 'y']]

# Predictors: every column from 'pmax[0]' through the last one
X = df.iloc[:, df.columns.get_loc('pmax[0]'):]

In [5]:
# Count the missing values across all predictors (there are none)
X.isna().to_numpy().sum()

0

In [9]:
# First hold out 30% of the samples as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.3,
    random_state=8,
)

# ... then set aside 20% of the remaining samples as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=8,
)

## [Feature reduction] Applying SFS on each feature extracted
We have seen that dropping the correlated features does not improve the performance of the regressor. Let's try another approach:

Apply Sequential Feature Selection (SFS) separately to each group of extracted features (```pmax```, ```negpmax```, ...). This could reduce the noise in our dataset: the project description says that there are only 12 pads, yet 18 readings of each feature are provided, so a subset of the 18 readings does not contain actual signal but rather noise.

In [22]:
# Columns are named '<feature>[<reading>]', with readings numbered 0..17
features_extracted = ['pmax', 'negpmax', 'tmax', 'area', 'rms']
N_READINGS = 18

# Label the training matrix once so each feature group can be picked by
# column name without rebuilding the whole frame for every feature.
X_train_df = pd.DataFrame(X_train, columns=X.columns)

# For each extracted feature: the regressor used to score candidate subsets
# and the training columns holding that feature's readings.
SFS_params = {
    feature: {
        'estimator': MultiOutputRegressor(KNeighborsRegressor(n_jobs=-1), n_jobs=-1),
        'X': X_train_df[[f'{feature}[{i}]' for i in range(N_READINGS)]].values,
    }
    for feature in features_extracted
}

In [23]:
# One selector per extracted feature, each configured to keep 12 of its readings
SFSs = {
    feature: SequentialFeatureSelector(
        estimator=SFS_params[feature]['estimator'],
        n_features_to_select=12,
        n_jobs=-1,
    )
    for feature in features_extracted
}

In [21]:
# Fit every selector on its own feature group against the (x, y) training targets
for feature in features_extracted:
    selector = SFSs[feature]
    feature_matrix = SFS_params[feature]['X']
    selector.fit(feature_matrix, y_train)

In [48]:
# /kaggle/input is mounted read-only on Kaggle, so the selected columns are
# written to the notebook's writable working directory instead.
# NOTE(review): anything that later reads X_<feature>.csv must look here.
output_dir = '/kaggle/working/'

for feature in features_extracted:
    # Boolean mask marking which readings the selector kept
    support_mask = SFSs[feature].get_support()
    print(feature, support_mask)

    # Re-attach column names to this feature's training matrix, keep only the
    # selected readings and save them as X_<feature>.csv
    feature_matrix = SFS_params[feature]['X']
    reading_columns = [f'{feature}[{i}]' for i in range(feature_matrix.shape[1])]
    selected = pd.DataFrame(feature_matrix, columns=reading_columns).loc[:, support_mask]
    selected.to_csv(output_dir + f'X_{feature}.csv', index=False)