# Project 1

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.pipeline import Pipeline
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.decomposition import PCA

# outlier
from sklearn.base import OutlierMixin
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

import xgboost as xgb
import catboost as cat
import lightgbm as lgb

import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid
from torch.optim import Adam, SGD

## Load Data

In [2]:
# load and split data (run)
data_X_train = pd.read_csv('Data/X_train.csv', header=0, index_col=0)
data_y_train = pd.read_csv('Data/y_train.csv', header=0, index_col=0)
data_X_test = pd.read_csv('Data/X_test.csv', header=0, index_col=0)

In [3]:
# data info (run)
data_X_train.describe()
"""
Data Shape: 1212 x 832
Data Lost: a lot
data scale: large
"""

'\nData Shape: 1212 x 832\nData Lost: a lot\ndata scale: large\n'

## Data Preprocessing

In [4]:
# 自建噪声探测函数
class OutlierDetector(object):
    def __init__(self, error_coefficient: float = 2) -> None:
        self.error_coefficient = error_coefficient
        self.outlier_ids = []
    
    def fit(self, X: np.array, y: np.array) -> None:
        self.outlier_ids = []
        errors = []
        for row_i in range(X.shape[0]):
            X_row = X[row_i]
            y_row = y[row_i]

            sub_y_X = X[(y >= y_row - 5) & (y <= y_row + 5)]
            if sub_y_X.shape[0] > 10 and (y_row < 80 and y_row > 50):
                sub_y_X_mean = np.average(sub_y_X, axis=0)
                sub_y_X_std = np.std(sub_y_X, axis=0)
                sub_y_X_error = (X_row > sub_y_X_mean + self.error_coefficient * sub_y_X_std) | (X_row < sub_y_X_mean - self.error_coefficient * sub_y_X_std)
                errors.append(sub_y_X_error.sum())
                if sub_y_X_error.sum() > X.shape[1] * 0.15:
                    self.outlier_ids.append(row_i)
            else:
                continue
    
    def transform(self, X: np.array, y: np.array) -> np.array:
        return np.delete(X, self.outlier_ids, axis=0), np.delete(y, self.outlier_ids, axis=0)
    
    def fit_transform(self, X: np.array, y: np.array) -> np.array:
        self.fit(X, y)
        return self.transform(X, y)

In [5]:
# transfer data to numpy (run)
X_train = data_X_train.to_numpy()
y_train = data_y_train.to_numpy().ravel()
X_test = data_X_test.to_numpy()

train_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy='median')), 
        ("scalar", RobustScaler()), 
        ("feature_selection", SelectKBest(f_regression, k=200)), 
    ]
)
X_train = train_pipeline.fit_transform(X_train, y_train)
X_test = train_pipeline.transform(X_test)

# outlier detection

# outlier_detector = LocalOutlierFactor(n_neighbors=3)
# outlier_ids = outlier_detector.fit_predict(X_train)
# X_train = X_train[outlier_ids == 1]
# y_train = y_train[outlier_ids == 1]

outlier_detector = OutlierDetector()
X_train, y_train = outlier_detector.fit_transform(X_train, y_train)

## Model Selection

In [6]:
# valid model class
class KfoldValidModel(object):
    def __init__(self, X: np.array, y: np.array, scorer: sklearn.metrics, n_splits: int = 5, random_state: int = 0):
        self.X = X
        self.y = y
        self.scorer = scorer
        self.kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    def valid(self, model: sklearn.base.BaseEstimator):
        scores = []
        pred_results = []
        y_valid_list = []
        for train_index, valid_index in self.kfold.split(self.X):
            X_train, X_valid = self.X[train_index], self.X[valid_index]
            y_train, y_valid = self.y[train_index], self.y[valid_index]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_valid)
            pred_results.append(y_pred)
            y_valid_list.append(y_valid)
            score = self.scorer(y_valid, y_pred)
            scores.append(score)
        return np.mean(scores), {"scores": scores, "pred_results": pred_results, "y_valid_list": y_valid_list}

    def __call__(self, model: sklearn.base.BaseEstimator):
        return self.valid(model)

kf_val = KfoldValidModel(X_train, y_train, r2_score, n_splits=5, random_state=0)

In [7]:
cat_val, _ = kf_val(cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, max_depth=5, subsample=0.6, rsm=0.6))
cat_val

0:	learn: 9.5656786	total: 155ms	remaining: 5m 9s
1:	learn: 9.3950106	total: 161ms	remaining: 2m 41s
2:	learn: 9.2457898	total: 167ms	remaining: 1m 51s
3:	learn: 9.0929801	total: 172ms	remaining: 1m 25s
4:	learn: 8.9592797	total: 176ms	remaining: 1m 10s
5:	learn: 8.8210191	total: 180ms	remaining: 59.9s
6:	learn: 8.6920110	total: 185ms	remaining: 52.5s
7:	learn: 8.5900871	total: 189ms	remaining: 47.1s
8:	learn: 8.4760565	total: 194ms	remaining: 42.9s
9:	learn: 8.3735985	total: 198ms	remaining: 39.5s
10:	learn: 8.2688615	total: 202ms	remaining: 36.6s
11:	learn: 8.1763745	total: 207ms	remaining: 34.3s
12:	learn: 8.0882128	total: 211ms	remaining: 32.2s
13:	learn: 7.9949648	total: 215ms	remaining: 30.5s
14:	learn: 7.8992710	total: 220ms	remaining: 29.1s
15:	learn: 7.8032855	total: 224ms	remaining: 27.8s
16:	learn: 7.7139935	total: 229ms	remaining: 26.7s
17:	learn: 7.6387170	total: 233ms	remaining: 25.7s
18:	learn: 7.5560806	total: 237ms	remaining: 24.7s
19:	learn: 7.4894215	total: 241ms	rem

0.5414546094190138