In [2]:
# General
# ! pip install biosppy
# ! pip install neurokit2
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import numpy as np

# ML - Processing, Metrics and Evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomOverSampler
# ML - Models
from sklearn.svm import SVC
import xgboost as xgb

# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching
import Components.Feature_Engineering as Feature_Engineering


# CAREFUL:
# If you make changes to a custom module, you have to reload it, i.e rerun this cell
import importlib
importlib.reload(Normalisation)
importlib.reload(data_fetching)
importlib.reload(Feature_Engineering)

<module 'Components.Feature_Engineering' from '../Components/Feature_Engineering.py'>

# Data Fetching

In [6]:
X, y = data_fetching.get_engineered_train_data()
y = np.ravel(y)
#x_test = data_fetching.get_test_data()

# Data Processing

In [None]:
X = StandardScaler().fit_transform(X)

In [16]:
# TODO:
# 1. Separate noise class in y from others: set all classes in y < 3 to 0, and 3 to 1
# 2. Get X noise and X other classes separately
# 3. Generate new X noise samples to balance the classes
# 
y_noise = np.array([int(x > 2) for x in y])
y_noise_df = pd.DataFrame(y_noise, columns=['y'])
data = pd.concat([X, y_noise_df], axis = 1)

data_other = data.loc[data.y == 0]
data_noise = data.loc[data.y == 1]

X_other = data_other.drop(['y'], axis=1)
X_noise = data_noise.drop(['y'], axis=1)

print(X_other)
print(X_noise)

          0           1            2             3           4             5   \
0        0.0   68.630849  1703.000000  9.262802e+05  320.555556  30973.802469   
1        1.0   61.726883  4091.680000  5.982348e+06  328.750000   9966.687500   
2        2.0   84.916201  4084.166667  5.727385e+06  229.314286   4131.929796   
3        3.0   67.432567  4591.500000  5.375670e+06  287.037037  11692.702332   
4        4.0   59.495192  1515.857143  1.277013e+06  553.666667  73455.555556   
...      ...         ...          ...           ...         ...           ...   
5110  5110.0   47.716592  4245.136364  7.260853e+06  396.333333  54141.936508   
5113  5113.0  100.600962  8503.159574  2.338257e+07  178.967742    125.364551   
5114  5114.0   83.731456  4222.263158  5.604869e+06  216.027027    224.945215   
5115  5115.0   72.080089  7115.640000  1.543488e+07  275.346939   5942.063307   
5116  5116.0   80.042689  2948.230769  2.859654e+06  225.000000    261.440000   

               6           

# Training

In [7]:
scaler = StandardScaler()
X_2 = scaler.fit_transform(X,y)

est = SVC(kernel='rbf')
cv_score = cross_val_score(est, X_2, y, cv=10, scoring='f1_micro')
print(cv_score)
print(np.mean(cv_score))

[0.66015625 0.67382812 0.66796875 0.65234375 0.67578125 0.65234375
 0.65429688 0.66144814 0.68493151 0.67123288]
0.6654331274461839
