In [None]:
import os

import pandas as pd
import numpy as np

# from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer#, IterativeImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import IsolationForest## exp needed!
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

## 0. Parameters

### 0.1 Global Params

In [None]:
# Directory holding the CSV files below; data_raw joins each file name onto it.
root_path = './'
X_train_path = 'X_train.csv'
X_test_path = 'X_test.csv'
y_train_path = 'y_train.csv'
# NOTE(review): not referenced in the visible cells — presumably the output
# file for the test-set predictions; confirm.
y_test_path = 'y_test_yutong_v8.csv'

# Forwarded to train_test_split as test_size. An integer there is an absolute
# row count, so 10 samples are held out for validation (not 10 %).
val_size = 10
# Seed shared by the train/validation split and the IsolationForest.
random_state = 30

### 0.2 Feature Selection

In [None]:
# k passed to SelectKBest (number of columns kept).
# Previously explored range: np.arange(175, 251); author's note: about 200 real features.
num_features = 225
# Number of trees in the IsolationForest used for outlier removal.
# Previously explored range: np.arange(100, 176).
n_estimators = 150

### 0.3 Regression & Model Selection

In [None]:
# Number of cross-validation folds used by the grid search.
num_KFold = 10

# Hyper-parameter grid for the SVR: integer C from 80 to 100 inclusive, and
# gamma starting at 1e-3 in steps of 1e-3 towards 1e-2 (np.arange treats the
# stop value as exclusive, subject to floating-point rounding).
svr_param_grid = dict(
    C=np.arange(80, 101),
    gamma=np.arange(1e-3, 1e-2, step=1e-3),
)

## 1. Data Loading

In [None]:
def data_raw(root_path, data_path):
    """Read a CSV file and return its values without the first column.

    Parameters
    ----------
    root_path : str
        Directory that contains the file.
    data_path : str
        File name (or relative path) inside ``root_path``.

    Returns
    -------
    numpy.ndarray
        Every row of the table with column 0 sliced off.
    """
    csv_file = os.path.join(root_path, data_path)
    table = pd.read_csv(csv_file)
    # Column 0 is discarded; presumably it is a row-id column — confirm
    # against the data files.
    return table.values[:, 1:]

# Load the three CSV inputs in one pass; data_raw strips the first column
# from each of them.
X_train_raw, X_test_raw, y_train = (
    data_raw(root_path, csv_name)
    for csv_name in (X_train_path, X_test_path, y_train_path)
)

## 2. Data Wrangling & Preprocessing

### 2.1 Imputation Methods

In [None]:
def median_imp(X_raw):
    """Replace NaN entries of ``X_raw`` with the median of their column.

    The imputer is fitted on ``X_raw`` itself and applied to it in the same
    call, so the medians always come from the array that is passed in.

    Returns the array produced by ``SimpleImputer.fit_transform``.
    """
    imputer = SimpleImputer(strategy='median', missing_values=np.nan)
    filled = imputer.fit_transform(X_raw)
    return filled

def knn_imp(X_raw):
    """Placeholder for k-nearest-neighbour imputation (not implemented yet).

    TODO: implement. Author's caveats: this is still a computationally
    intensive method and suffers from the curse of dimensionality and from
    outliers.

    Raises
    ------
    NotImplementedError
        Always. The previous stub silently returned ``None``, which would
        only surface later as an unrelated error in whatever consumed the
        "imputed" data.
    """
    raise NotImplementedError("knn_imp is not implemented yet")

def mice_imp(X_raw):
    """Impute NaN entries with scikit-learn's ``IterativeImputer`` (MICE-style).

    Author's guidance: use this only after feature selection — the method is
    computationally expensive, and the implementation is described as
    unstable (experimental) in the scikit-learn docs.

    Parameters
    ----------
    X_raw : array-like
        Data containing ``np.nan`` for missing values.

    Returns
    -------
    numpy.ndarray
        The imputed data, fitted on and applied to ``X_raw`` itself.
    """
    # The notebook's top-level import of IterativeImputer is commented out, so
    # the name was undefined here. Import locally instead: the experimental
    # enable_* module has to be imported before IterativeImputer is available.
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    imp_mice = IterativeImputer(missing_values=np.nan, initial_strategy='median')
    return imp_mice.fit_transform(X_raw)

In [None]:
# Learn the column medians from the training data only and reuse them for the
# test data. Fitting a second imputer on the test set would fill its gaps with
# different statistics than the model was trained on (and could drop a
# different set of all-NaN columns, breaking the column alignment).
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train_raw)
X_train_raw = median_imputer.transform(X_train_raw)
X_test_raw = median_imputer.transform(X_test_raw)

### 2.2 Feature Selection Methods

In [None]:
def feat_sel(X_raw, y, score_func, num_features):
    """Fit a univariate selector that keeps the ``num_features`` best columns.

    Parameters
    ----------
    X_raw : array-like
        Feature matrix to score.
    y : numpy.ndarray
        Targets; flattened with ``ravel()`` before fitting.
    score_func : callable
        Scoring function handed to ``SelectKBest`` (e.g. ``f_regression``).
    num_features : int
        Value passed as ``k``.

    Returns
    -------
    SelectKBest
        The fitted selector; call ``.transform`` on it to reduce a matrix.
    """
    target = y.ravel()
    k_best = SelectKBest(k=num_features, score_func=score_func)
    k_best.fit(X_raw, target)
    return k_best

In [None]:
# Score the features against the targets on the (imputed) training matrix.
selector = feat_sel(
    X_raw=X_train_raw,
    y=y_train,
    score_func=f_regression,
    num_features=num_features,
)
# Keep the same selected columns in both matrices. X_train_raw is overwritten
# here, so re-running this cell without re-running the cells above will not
# reproduce the same state.
X_train_raw, X_test = (
    selector.transform(matrix) for matrix in (X_train_raw, X_test_raw)
)

### 2.3 Outlier Detection Methods

In [None]:
def train_val_split(X_train, y_train, val_size):
    """Split features and targets into a training and a validation part.

    ``val_size`` is forwarded to ``train_test_split`` as ``test_size``; the
    seed is read from the module-level ``random_state``.

    Returns the list produced by ``train_test_split``:
    ``[X_train, X_val, y_train, y_val]``.
    """
    parts = train_test_split(
        X_train,
        y_train,
        random_state=random_state,
        test_size=val_size,
    )
    return parts

def iforest(X_raw, y, n_estimators):
    """Drop the rows that an IsolationForest labels as outliers.

    Parameters
    ----------
    X_raw : numpy.ndarray
        Feature matrix the forest is fitted on.
    y : numpy.ndarray
        Targets aligned row-by-row with ``X_raw``.
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    tuple
        ``(X_raw, y)`` restricted to the rows whose predicted label is 1.
        The seed comes from the module-level ``random_state``.
    """
    forest = IsolationForest(random_state=random_state, n_estimators=n_estimators)
    labels = forest.fit_predict(X_raw)
    # fit_predict labels inliers with 1; keep only those rows in both arrays.
    inlier_rows = np.where(labels == 1)
    return X_raw[inlier_rows], y[inlier_rows]

In [None]:
# Hold out val_size rows for validation before any outlier filtering.
X_train_raw, X_val, y_train, y_val = train_val_split(
    X_train=X_train_raw,
    y_train=y_train,
    val_size=val_size,
)
# Outliers are removed from the training part only; the validation rows are
# left untouched. y_train is overwritten, so this cell is not safe to re-run
# on its own.
X_train, y_train = iforest(X_raw=X_train_raw, y=y_train, n_estimators=n_estimators)

### 2.4 Scaling

In [None]:
# Fit the scaler on the training data only and apply the same mean/std to the
# validation and test data. Fitting a separate scaler per set would put each
# set on its own scale, which is especially distorting for the small
# validation set, and the SVR would then see inputs standardised differently
# from the data it was trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## 3. Regression & Param Tuning

In [None]:
# Exhaustive search over svr_param_grid, scored by cross-validated R^2 on all
# available cores.
svr = SVR()
reg = GridSearchCV(
    estimator=svr,
    param_grid=svr_param_grid,
    cv=num_KFold,
    scoring='r2',
    n_jobs=-1,
)
# fit returns the fitted search object itself; targets are flattened to 1-D.
reg = reg.fit(X_train, y_train.ravel())

## 4. Evaluation

In [None]:
# Report the winning configuration and its mean cross-validated score.
for label, value in (
    ("Best Estimator: ", reg.best_estimator_),
    ("Best Score: ", reg.best_score_),
):
    print(label, value)

In [None]:
# Predict the held-out validation rows with the fitted grid search, then
# report R^2 on them.
y_val_pred = reg.predict(X_val)
score = r2_score(y_true=y_val, y_pred=y_val_pred)
print(score)

## 5. Results