In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.linear_model import Lasso
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

## 0. Parameters

### 0.1 Global Params

In [2]:
# Directory that the input CSVs are read from and the prediction file is written to.
root_path = './'
# Input file names; each is joined onto root_path when loaded.
X_train_path = 'X_train.csv'
X_test_path = 'X_test.csv'
y_train_path = 'y_train.csv'
# File name for the test-set predictions written in the Results section.
y_test_path = 'y_test_yutong_v14.csv'


# Test-size argument for the train/validation split (that split is currently commented out).
val_size = 10
# Seed shared by the Lasso selector, the IsolationForest and the shuffle step.
random_state = 30

### 0.2 Feature Selection & Outlier Detection

In [3]:
# k passed to SelectKBest (univariate f_regression ranking).
num_features = 200  # author's alternative: np.arange(175, 251); note: about 200 real features
# alpha (L1 penalty strength) of the Lasso used for model-based selection.
l1_lambda = 0.2
# Upper bound on the number of features SelectFromModel may keep.
max_features = 25

# Number of trees in the IsolationForest outlier detector.
n_estimators = 100  # author's alternative: np.arange(100, 176)

### 0.3 Regression & Model Selection

In [4]:
# Number of cross-validation folds used by the grid search.
num_KFold = 10

# Hyper-parameter grid for the SVR.
# gamma uses np.linspace instead of np.arange: with a float step, arange's element
# count depends on rounding of (stop - start) / step and may include the stop value,
# so the grid size would not be guaranteed. linspace pins it to exactly
# 0.001, 0.002, ..., 0.009.
svr_param_grid = {
    'C': np.arange(80, 110),
    'gamma': np.linspace(1e-3, 9e-3, num=9)}

## 1. Data Loading

In [5]:
def data_raw(root_path, data_path):
    """Read a CSV file and return its contents without the first column.

    Parameters
    ----------
    root_path : str
        Directory containing the file.
    data_path : str
        File name, joined onto ``root_path``.

    Returns
    -------
    numpy.ndarray
        2-D array of every column except the first one.
    """
    csv_file = os.path.join(root_path, data_path)
    frame = pd.read_csv(csv_file)
    # Slice after converting to an array so the dtype matches the whole-frame conversion.
    return frame.values[:, 1:]

# Load feature matrices; data_raw drops the first CSV column of each file.
X_train_raw = data_raw(root_path, X_train_path)
X_test_raw = data_raw(root_path, X_test_path)
# Flatten what remains of the target file into a 1-D vector.
y_train = data_raw(root_path, y_train_path).ravel()

## 2. Data Wrangling & Preprocessing

### 2.1 Imputation Methods

In [6]:
def imputation(X_raw, imputer=None):
    """Replace NaN entries of ``X_raw`` with per-column medians.

    Parameters
    ----------
    X_raw : array-like of shape (n_samples, n_features)
        Data that may contain ``np.nan``.
    imputer : fitted SimpleImputer, optional
        When given, its already-learned medians are applied to ``X_raw``.
        When omitted, a new median imputer is fitted on ``X_raw`` itself
        (the original behaviour).

    Returns
    -------
    numpy.ndarray
        The imputed data.
    """
    if imputer is None:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_raw)
    return imputer.transform(X_raw)

# Learn the medians from the training set only and reuse them for the test set, so
# both matrices are filled with the same values (and keep the same columns).
train_imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit(X_train_raw)
X_train_raw = imputation(X_train_raw, imputer=train_imputer)
X_test_raw = imputation(X_test_raw, imputer=train_imputer)

### 2.2 Feature Selection Methods

In [7]:
def feature_selection_fabian(X_train, y_train):
    """Select features with a Lasso model fitted on standardised inputs.

    Uses the notebook-level ``l1_lambda``, ``random_state`` and ``max_features``.

    Returns
    -------
    numpy.ndarray
        Column indices kept by ``SelectFromModel``.
    """
    X_std = StandardScaler().fit_transform(X_train)

    lasso_model = Lasso(alpha=l1_lambda, random_state=random_state)
    lasso_model.fit(X_std, y_train)

    # prefit=True: reuse the Lasso fitted above instead of refitting inside the selector.
    selector = SelectFromModel(lasso_model, prefit=True, max_features=max_features)
    return selector.get_support(indices=True)

def feature_selection_yutong(X_train, y_train):
    """Select the ``num_features`` best columns by univariate ``f_regression`` score.

    Returns
    -------
    numpy.ndarray
        Column indices kept by ``SelectKBest``.
    """
    selector = SelectKBest(score_func=f_regression, k=num_features)
    selector.fit(X_train, y_train)
    return selector.get_support(indices=True)

def feature_selection(X_test, X_train, y_train):
    """Reduce train and test matrices to the union of both selectors' features.

    Column indices are chosen from the training data only (Lasso-based and
    k-best selection) and then applied to both matrices.

    Parameters
    ----------
    X_test, X_train : numpy.ndarray
        Feature matrices with the same column layout.
    y_train : numpy.ndarray
        Training targets.

    Returns
    -------
    tuple of numpy.ndarray
        ``(selected X_test, selected X_train)`` - test first, matching the
        argument order.
    """
    selectedFeaturesFabian = feature_selection_fabian(X_train, y_train)
    selectedFeaturesYutong = feature_selection_yutong(X_train, y_train)

    # Sorted, de-duplicated union of both index sets (the earlier, immediately
    # overwritten assignment of the k-best indices alone was dead code).
    selectedIdxs = np.union1d(selectedFeaturesFabian, selectedFeaturesYutong)
    print(f"selected {len(selectedIdxs)} out of {X_train.shape[1]} features")

    selectedX_train = X_train[:, selectedIdxs]
    selectedX_test = X_test[:, selectedIdxs]

    return selectedX_test, selectedX_train

# Mind the ordering: the test matrix is passed first and returned first.
# Both raw matrices are overwritten with their column-reduced versions.
X_test_raw, X_train_raw = feature_selection(X_test_raw, X_train_raw, y_train)

selected 209 out of 832 features


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms


### 2.3 Outlier Detection Methods

In [8]:
def outlier_detection(X_raw, y, n_estimators):
    """Keep only the samples an IsolationForest labels as inliers.

    Parameters
    ----------
    X_raw : numpy.ndarray
        Feature matrix the forest is fitted on.
    y : numpy.ndarray
        Targets aligned row-by-row with ``X_raw``.
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_raw, y)`` restricted to rows whose predicted label is 1.
    """
    forest = IsolationForest(n_estimators=n_estimators, random_state=random_state)
    inlier_mask = forest.fit_predict(X_raw) == 1
    return X_raw[inlier_mask], y[inlier_mask]

# def train_val_split(X_train, y_train, val_size):
#     return train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
# X_train_raw, X_val_raw, y_train, y_val = train_val_split(X_train_raw, y_train, val_size=val_size)

# Remove flagged training rows together with their targets; the test set is not filtered.
# X_train_raw / y_train are overwritten, so re-running this cell filters the
# already-filtered data again.
X_train_raw, y_train = outlier_detection(X_train_raw, y_train, n_estimators)

### 2.4 Scaling

In [9]:
# Shuffle the unscaled matrix together with its targets. Shuffling X_train_raw (not a
# freshly scaled copy) keeps rows and targets aligned even when this cell is re-run;
# previously a re-run paired unshuffled X_train with the already-shuffled y_train.
X_train_raw, y_train = shuffle(X_train_raw, y_train, random_state=random_state)

# Learn mean/std from the training set only and apply the same transformation to the
# test set, so the model sees test features on the scale it was trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
# X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

## 3. Regression & Param Tuning

In [10]:
# Exhaustive search over svr_param_grid, scored by R2 with num_KFold-fold
# cross-validation on all available cores.
svr = SVR()
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=svr_param_grid,
    scoring='r2',
    n_jobs=-1,
    cv=num_KFold,
)
# fit() returns the fitted search object itself.
reg = grid_search.fit(X_train, y_train)

## 4. Evaluation

# Hold-out evaluation is disabled: the train/validation split in section 2.3 is
# commented out, so X_val / y_val are never defined and running these lines raised a
# NameError on Restart & Run All. Re-enable them together with the split and the
# X_val scaling step; until then the cross-validated score of the grid search is
# the reported metric.
# y_val_pred = reg.predict(X_val)
# print("Val R2 Score: ", r2_score(y_val, y_val_pred))

In [11]:
# Report the winning SVR configuration and its mean cross-validated score
# (R2, as set by scoring='r2' in the grid search).
print("Best Estimator: ", reg.best_estimator_)
print("Best Score: ", reg.best_score_)

Best Estimator:  SVR(C=101, gamma=0.004)
Best Score:  0.6911622121838135


## 5. Results

In [12]:
# Predict with the best estimator found by the grid search.
y_test_pred = reg.predict(X_test)

# One 'y' column; the row index is written out as the 'id' column.
output_file = os.path.join(root_path, y_test_path)
df_result = pd.DataFrame({'y': y_test_pred})
df_result.to_csv(output_file, index_label='id')