In [9]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.multioutput import MultiOutputRegressor

In [20]:
data = pd.read_csv('dataset/development.csv')
data.shape, data.columns

((385500, 92),
 Index(['x', 'y', 'pmax[0]', 'negpmax[0]', 'area[0]', 'tmax[0]', 'rms[0]',
        'pmax[1]', 'negpmax[1]', 'area[1]', 'tmax[1]', 'rms[1]', 'pmax[2]',
        'negpmax[2]', 'area[2]', 'tmax[2]', 'rms[2]', 'pmax[3]', 'negpmax[3]',
        'area[3]', 'tmax[3]', 'rms[3]', 'pmax[4]', 'negpmax[4]', 'area[4]',
        'tmax[4]', 'rms[4]', 'pmax[5]', 'negpmax[5]', 'area[5]', 'tmax[5]',
        'rms[5]', 'pmax[6]', 'negpmax[6]', 'area[6]', 'tmax[6]', 'rms[6]',
        'pmax[7]', 'negpmax[7]', 'area[7]', 'tmax[7]', 'rms[7]', 'pmax[8]',
        'negpmax[8]', 'area[8]', 'tmax[8]', 'rms[8]', 'pmax[9]', 'negpmax[9]',
        'area[9]', 'tmax[9]', 'rms[9]', 'pmax[10]', 'negpmax[10]', 'area[10]',
        'tmax[10]', 'rms[10]', 'pmax[11]', 'negpmax[11]', 'area[11]',
        'tmax[11]', 'rms[11]', 'pmax[12]', 'negpmax[12]', 'area[12]',
        'tmax[12]', 'rms[12]', 'pmax[13]', 'negpmax[13]', 'area[13]',
        'tmax[13]', 'rms[13]', 'pmax[14]', 'negpmax[14]', 'area[14]',
        'tmax[

In [21]:
oolumn_drop = ['pmax[0]', 'negpmax[0]', 'area[0]', 'tmax[0]', 'rms[0]',
        'pmax[7]', 'negpmax[7]', 'area[7]', 'tmax[7]', 'rms[7]',
        'pmax[12]', 'negpmax[12]', 'area[12]', 'tmax[12]', 'rms[12]',
        'pmax[16]', 'negpmax[16]', 'area[16]', 'tmax[16]', 'rms[16]',
        'pmax[17]', 'negpmax[17]', 'area[17]', 'tmax[17]', 'rms[17]' ]
data.drop(columns=oolumn_drop, inplace=True, axis=1)
data.columns

Index(['x', 'y', 'pmax[1]', 'negpmax[1]', 'area[1]', 'tmax[1]', 'rms[1]',
       'pmax[2]', 'negpmax[2]', 'area[2]', 'tmax[2]', 'rms[2]', 'pmax[3]',
       'negpmax[3]', 'area[3]', 'tmax[3]', 'rms[3]', 'pmax[4]', 'negpmax[4]',
       'area[4]', 'tmax[4]', 'rms[4]', 'pmax[5]', 'negpmax[5]', 'area[5]',
       'tmax[5]', 'rms[5]', 'pmax[6]', 'negpmax[6]', 'area[6]', 'tmax[6]',
       'rms[6]', 'pmax[8]', 'negpmax[8]', 'area[8]', 'tmax[8]', 'rms[8]',
       'pmax[9]', 'negpmax[9]', 'area[9]', 'tmax[9]', 'rms[9]', 'pmax[10]',
       'negpmax[10]', 'area[10]', 'tmax[10]', 'rms[10]', 'pmax[11]',
       'negpmax[11]', 'area[11]', 'tmax[11]', 'rms[11]', 'pmax[13]',
       'negpmax[13]', 'area[13]', 'tmax[13]', 'rms[13]', 'pmax[14]',
       'negpmax[14]', 'area[14]', 'tmax[14]', 'rms[14]', 'pmax[15]',
       'negpmax[15]', 'area[15]', 'tmax[15]', 'rms[15]'],
      dtype='object')

In [23]:
data.to_csv('dataset/2_development.csv')

In [16]:
# Combine 'x' and 'y' into a new 'target' column
data = data.assign(target=list(zip(data['x'], data['y'])))
# Drop the original 'x' and 'y' columns if needed
data = data.drop(['x', 'y'], axis=1)
# Convert 'target' to a 2D array
y = pd.DataFrame(data['target'].tolist(), columns=['x', 'y'])

In [13]:
# separate train and test sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((269850, 90), (115650, 90))

In [None]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

In [14]:

sfs = SFS(MultiOutputRegressor(RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)), 
           k_features=60, # the more features we want, the longer it will take to run
           forward=True, 
           floating=False, # see the docs for more details in this parameter
           verbose=2, # this indicates how much to print out intermediate steps
           scoring='roc_auc',
           cv=2,
         )

sfs = sfs.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\mine\Sinaptica\flask_project\dsl\Resistive_Silicon_Detector_2D_surface_detector_regression_predictor\.venv\lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
  File "C:\mine\Sinaptica\flask_project\dsl\Resistive_Silicon_Detector_2D_surface_detector_regression_predictor\.venv\lib\site-packages\sklearn\metrics\_scorer.py", line 452, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass-multioutput format is not supported

Traceback (most recent call last):
  File "C:\mine\Sinaptica\flask_project\dsl\Resistive_Silicon_Detector_2D_surface_detector_regression_predictor\.venv\lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
  File "C:\mine\Sinaptica\flask_project\dsl\Resistive_Silicon_Detector_2D_surface_detector_regression_predictor\.venv\lib\site-packages\sklearn\metrics\_scorer.py", line 452, in _score
    r