# Project 1

## Import Libraries

In [None]:
# import libraries (run)
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.decomposition import PCA

# outlier
from sklearn.base import OutlierMixin
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

import xgboost as xgb
import catboost as cat
import lightgbm as lgb

import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid
from torch.optim import Adam, SGD

## Load Data

In [None]:
# Load the training features, training targets and test features (run).
# The first CSV row is the header and the first column becomes the index.
data_X_train, data_y_train, data_X_test = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in ('Data/X_train.csv', 'Data/y_train.csv', 'Data/X_test.csv')
)

In [None]:
# data info (run)
# Observations recorded from the summary below:
#   - data shape: 1212 x 832
#   - missing values: a lot
#   - data scale: large
# describe() must stay the LAST expression of the cell: a notebook only renders
# the value of the final expression, so a trailing bare string would be shown
# instead of the summary table.
data_X_train.describe()

In [None]:
# Convert the three DataFrames to plain NumPy arrays (run).
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (data_X_train, data_y_train, data_X_test)
)

## Data Preprocessing

### 处理缺省值

In [None]:
# Median imputation of missing values (run).
# The imputer is fitted on the training features only and then applied to both
# the training and the test features.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

### 归一化

In [None]:
# Feature scaling with RobustScaler (run).
# The scaler statistics come from the training features only; the same
# transformation is then applied to the test features.
x_scalar = RobustScaler().fit(X_train)
X_train, X_test = x_scalar.transform(X_train), x_scalar.transform(X_test)

### 特征选择

#### 删除变化过小的列

In [None]:
# Drop constant feature columns (run).
# A column is uninformative when its maximum equals its minimum on the training
# data. Testing the spread directly (instead of "column sum == 0") does not rely
# on the data having been centred beforehand and cannot accidentally drop a
# varying column whose values merely cancel out.
del_columns_id_all0 = np.where(np.ptp(X_train, axis=0) == 0)
# The same column indices are removed from both sets to keep them aligned.
X_train = np.delete(X_train, del_columns_id_all0, axis=1)
X_test = np.delete(X_test, del_columns_id_all0, axis=1)

#### 保留特征

In [None]:
# Select features with SelectKBest (run).
# Keeps the 200 features with the best univariate f_regression score against
# the flattened training target; the test set is reduced to the same columns.
skb = SelectKBest(score_func=f_regression, k=200).fit(X_train, y_train.ravel())
X_train, X_test = skb.transform(X_train), skb.transform(X_test)
print(X_train.shape)

### 噪声探测

这里的噪声检测方法只适合用于验证，不适合用于最终的模型，都不需要运行

In [None]:
# EllipticEnvelope: keep only the training rows not labelled -1 (outlier).
outlier_detector = EllipticEnvelope()
outlier_ids = outlier_detector.fit_predict(X_train, y_train)
non_outlier_ids = np.nonzero(outlier_ids != -1)
X_train, y_train = X_train[non_outlier_ids], y_train[non_outlier_ids]
X_train.shape

In [None]:
# LocalOutlierFactor (3 neighbours): keep only the training rows not labelled
# -1 (outlier).
outlier_detector = LocalOutlierFactor(n_neighbors=3)
outlier_ids = outlier_detector.fit_predict(X_train, y_train)
non_outlier_ids = np.nonzero(outlier_ids != -1)
X_train, y_train = X_train[non_outlier_ids], y_train[non_outlier_ids]
X_train.shape

In [None]:
# OneClassSVM (linear kernel): keep only the training rows not labelled -1
# (outlier).
outlier_detector = OneClassSVM(kernel="linear")
outlier_ids = outlier_detector.fit_predict(X_train, y_train)
non_outlier_ids = np.nonzero(outlier_ids != -1)
X_train, y_train = X_train[non_outlier_ids], y_train[non_outlier_ids]
X_train.shape

## Model Selection

### Gaussian Process Regressor

In [None]:
# Gaussian Process Regressor (Matern + RBF kernel): 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

# The same hyperparameters are used for every fold.
param = {'alpha': 1e-09, 'kernel': Matern(length_scale=0.5, nu=1.5) + RBF(length_scale=1)}

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold.
    model = GaussianProcessRegressor(**param)
    model.fit(X_train[train_ids], y_train[train_ids])

    # Score it on the held-out part.
    fold_y_pred = model.predict(X_train[valid_ids])
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

In [None]:
# Grid search (5-fold CV, R^2 scoring) over the kernel and the alpha value of
# a Gaussian process regressor.
model = GaussianProcessRegressor(random_state=0)
params = {
    "kernel": [
        Matern(nu=1.5),
        Matern(nu=0.5),
        Matern(nu=2.5),
        Matern(nu=1.5) + Matern(nu=0.5),
        Matern(nu=1.5) + RBF(),
        Matern(nu=0.5) + RBF(),
        Matern(nu=2.5) + RBF(),
        ExpSineSquared(),
        RationalQuadratic(),
    ],
    "alpha": [1e-10, 2e-10, 5e-10, 1e-9, 2e-9, 5e-9, 1e-8, 2e-8, 5e-8],
}
scorer = make_scorer(r2_score, greater_is_better=True)
regressor = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    scoring=scorer,
)
regressor.fit(X_train, y_train)

In [None]:
# Gaussian Process Regressor (RBF kernel): 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold.
    model = GaussianProcessRegressor(kernel=RBF(length_scale=10), random_state=0)
    model.fit(X_train[train_ids], y_train[train_ids])

    # Score it on the held-out part.
    fold_y_pred = model.predict(X_train[valid_ids])
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

### Ensemble and Boosting Algorithms

In [None]:
# Random Forest Regressor: 5-fold cross-validated R^2.
# This cell previously evaluated IsolationForest, which is an outlier detector:
# its predict() returns +1/-1 inlier flags, so an R^2 score against the
# regression targets is meaningless. A tree-ensemble regressor with the same
# number of trees and the same seed is evaluated instead.
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model (the target is flattened, as in the other ensemble cells)
    model = RandomForestRegressor(n_estimators=150, random_state=0)
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

In [None]:
# Gradient Boosting Regressor: 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold (flattened target).
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(X_train[train_ids], y_train[train_ids].ravel())

    # Predictions are rounded to whole numbers before scoring.
    fold_y_pred = np.round(model.predict(X_train[valid_ids]))
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

In [None]:
# AdaBoost Regressor (square loss): 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold (flattened target).
    model = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, loss="square")
    model.fit(X_train[train_ids], y_train[train_ids].ravel())

    # Score it on the held-out part.
    fold_y_pred = model.predict(X_train[valid_ids])
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

### XGBoost

In [None]:
# XGBoost regressor: 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold (flattened target).
    model = xgb.XGBRegressor(n_estimators=150, max_depth=5, learning_rate=0.11, n_jobs=20)
    model.fit(X_train[train_ids], y_train[train_ids].ravel())

    # Predictions are rounded to whole numbers before scoring.
    fold_y_pred = np.round(model.predict(X_train[valid_ids]))
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

### CatBoost(最优)

In [None]:
# CatBoost regressor: 5-fold cross-validated R^2 (run).
# Idea for the model: try a different loss function.
fold_num = 5
kf = KFold(n_splits=fold_num)

# The same hyperparameters are used for every fold.
param = dict(
    iterations=1500,
    learning_rate=0.05,
    subsample=0.6,
    rsm=0.6,
    max_depth=5,
)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold (flattened target).
    model = cat.CatBoostRegressor(**param)
    model.fit(X_train[train_ids], y_train[train_ids].ravel())

    # Score it on the held-out part.
    fold_y_pred = model.predict(X_train[valid_ids])
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

In [None]:
# Grid search (5-fold CV, R^2 scoring) over CatBoost hyperparameters.
# The previous grid listed "kernel" and "alpha", which are Gaussian-process
# hyperparameters that CatBoostRegressor does not have. The grid below varies
# CatBoost's own parameters around the values used elsewhere in this notebook.
model = cat.CatBoostRegressor()
params = {
    "learning_rate": [0.03, 0.05, 0.1],
    "max_depth": [4, 5, 6],
    "l2_leaf_reg": [1, 3, 10],
}
scorer = make_scorer(r2_score, greater_is_better=True)
regressor = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring=scorer)
# The target is flattened, as in every other CatBoost fit in this notebook.
regressor.fit(X_train, y_train.ravel())
regressor.cv_results_["rank_test_score"]

### LGBM

In [None]:
# LightGBM regressor: 5-fold cross-validated R^2.
fold_num = 5
kf = KFold(n_splits=fold_num)

fold_scores = []
for train_ids, valid_ids in kf.split(X_train):
    # Fit a fresh model on the training part of the fold (flattened target).
    model = lgb.LGBMRegressor(n_estimators=50)
    model.fit(X_train[train_ids], y_train[train_ids].ravel())

    # Score it on the held-out part.
    fold_y_pred = model.predict(X_train[valid_ids])
    fold_scores.append(r2_score(y_train[valid_ids], fold_y_pred))

# Report the mean R^2 over all folds.
fold_score = np.mean(fold_scores)
print(fold_score)

### 组合模型(最终使用)

In [None]:
class MixModel(object):
    """Two-level stacking regressor.

    Every model in ``basic_models`` is fitted on the input features. Their
    predictions are stacked column-wise (one column per base model) and used
    as the input features of ``mix_model``, which produces the final output.
    """

    def __init__(self):
        # Prefer base models that performed well on their own.
        self.cat = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.xgb = xgb.XGBRegressor(n_estimators=150, max_depth=5, learning_rate=0.05, n_jobs=20)
        self.lgbm = lgb.LGBMRegressor(n_estimators=150)
        self.gp1 = GaussianProcessRegressor(alpha=1e-09, kernel=Sum(Matern(length_scale=0.5, nu=1.5), RBF(length_scale=1)))
        self.gp2 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=0.5, nu=0.5))
        self.gp3 = GaussianProcessRegressor(alpha=1e-09, kernel=RBF(length_scale=1))
        self.lr = LinearRegression()
        # NOTE: IsolationForest is an outlier detector; its predict() yields
        # +1/-1 inlier flags, so it contributes an outlier indicator (not a
        # regression estimate) to the meta-model.
        self.isf = IsolationForest(n_estimators=150, random_state=0)
        self.gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
        self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.isf, self.gbr]
        # Leaner alternative without the outlier flag and the sklearn booster:
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr]

        # Stacked base-model predictions from the most recent fit() call.
        self.intermediate_prediction = []

        # Meta-model trained on the stacked base-model predictions.
        self.mix_model = cat.CatBoostRegressor(iterations=1500, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

    def _stack_predictions(self, X: np.ndarray) -> np.ndarray:
        """Return the base-model predictions for X, one column per model."""
        columns = [model.predict(X).reshape((-1, 1)) for model in self.basic_models]
        return np.concatenate(columns, axis=1)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit all base models on (X, y), then fit the meta-model.

        The stacked predictions are rebuilt from scratch on every call, so the
        model can be re-fitted (previously a second call failed because the
        prediction list had been replaced by an array).

        Args:
            X: Feature matrix.
            y: Target values, one per row of X.
        """
        for model in self.basic_models:
            model.fit(X, y)

        # NOTE(review): the meta-model is trained on the base models'
        # predictions for the very rows they were fitted on; consider
        # out-of-fold predictions if the meta-model over-trusts models that
        # reproduce their training data closely.
        self.intermediate_prediction = self._stack_predictions(X)

        self.mix_model.fit(self.intermediate_prediction, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the meta-model prediction for every row of X."""
        return self.mix_model.predict(self._stack_predictions(X))

# This is the model used for the final submission.
class MixModelCL(object):
    """Stacking regressor with dedicated correction models for extreme targets.

    A stacked ensemble (base models feeding ``mix_model``) gives the main
    prediction. Two classifiers flag samples whose target is expected to be
    high (``>= high_threshold``) or low (``<= low_threshold``); for a sample
    flagged by exactly one of them, the main prediction is averaged with the
    prediction of the regressor trained only on that target range.
    """

    def __init__(self, high_threshold: float = 80, low_threshold: float = 50):
        """Create the unfitted models.

        Args:
            high_threshold: Targets at or above this value form the "high" group.
            low_threshold: Targets at or below this value form the "low" group.
        """
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold

        # Prefer base models that performed well on their own.
        self.cat = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.xgb = xgb.XGBRegressor(n_estimators=150, max_depth=5, learning_rate=0.05, n_jobs=20)
        self.lgbm = lgb.LGBMRegressor(n_estimators=150)
        self.gp1 = GaussianProcessRegressor(alpha=1e-09, kernel=Sum(Matern(length_scale=0.5, nu=1.5), RBF(length_scale=1)))
        self.gp2 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=0.5, nu=0.5))
        self.gp3 = GaussianProcessRegressor(alpha=1e-09, kernel=RBF(length_scale=1))
        self.lr = LinearRegression()
        # NOTE: IsolationForest is an outlier detector; its predict() yields
        # +1/-1 inlier flags, so it contributes an outlier indicator (not a
        # regression estimate) to the meta-model.
        self.isf = IsolationForest(n_estimators=150, random_state=0)
        self.gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
        self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.isf, self.gbr]
        # Leaner alternative without the outlier flag and the sklearn booster:
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr]

        # Stacked base-model predictions from the most recent fit() call.
        self.intermediate_prediction = []

        # Meta-model trained on the stacked base-model predictions.
        self.mix_model = cat.CatBoostRegressor(iterations=1500, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

        # Regressors trained only on the high / low target groups.
        self.high_model = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.low_model = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

        # Binary classifiers deciding whether a sample belongs to each group.
        self.classify_high_model = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.classify_low_model = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

    def _stack_predictions(self, X: np.ndarray) -> np.ndarray:
        """Return the base-model predictions for X, one column per model."""
        columns = [model.predict(X).reshape((-1, 1)) for model in self.basic_models]
        return np.concatenate(columns, axis=1)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the stacked ensemble, the group classifiers and group regressors.

        The stacked predictions are rebuilt from scratch on every call, so the
        model can be re-fitted (previously a second call failed because the
        prediction list had been replaced by an array).

        Args:
            X: Feature matrix.
            y: Target values, one per row of X; flattened to 1-D so that the
                group masks can index the rows of X.
        """
        y = np.asarray(y).ravel()

        for model in self.basic_models:
            model.fit(X, y)

        # NOTE(review): the meta-model is trained on the base models'
        # predictions for the very rows they were fitted on; consider
        # out-of-fold predictions if the meta-model over-trusts models that
        # reproduce their training data closely.
        self.intermediate_prediction = self._stack_predictions(X)
        self.mix_model.fit(self.intermediate_prediction, y)

        # Handle the imbalanced extremes of the target distribution: learn to
        # recognise the high / low groups and fit one regressor per group.
        is_high = y >= self.high_threshold
        is_low = y <= self.low_threshold
        self.classify_high_model.fit(X, is_high.astype(int))
        self.classify_low_model.fit(X, is_low.astype(int))

        self.high_model.fit(X[is_high], y[is_high])
        self.low_model.fit(X[is_low], y[is_low])

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the blended prediction for every row of X.

        Rows flagged as high only get the mean of the stacked prediction and
        the high-group regressor; rows flagged as low only get the mean of the
        stacked prediction and the low-group regressor; all other rows
        (flagged by neither or by both classifiers) keep the stacked
        prediction.
        """
        pred = self.mix_model.predict(self._stack_predictions(X))

        # Both classifier outputs are turned into flat boolean masks so the
        # two groups are handled the same way.
        is_high = np.asarray(self.classify_high_model.predict(X)).ravel() == 1
        is_low = np.asarray(self.classify_low_model.predict(X)).ravel() == 1
        # A sample claimed by both classifiers is treated as claimed by neither.
        only_high = is_high & ~is_low
        only_low = is_low & ~is_high

        high_pred = self.high_model.predict(X)
        low_pred = self.low_model.predict(X)

        result = np.where(only_high, (pred + high_pred) / 2, pred)
        result = np.where(only_low, (pred + low_pred) / 2, result)

        return result

## 预测

In [None]:
# Predict the test set with the stacked MixModel.
model = MixModel()
model.fit(X_train, y_train.ravel())

# Raw predictions are used; rounding them with np.round is left disabled.
y_pred = model.predict(X_test)

# Build the result table: the test index becomes a regular column, and its
# "id" values are cast to integers.
y_pred_df = pd.DataFrame(y_pred, columns=["y"], index=data_X_test.index)
y_pred_df = y_pred_df.reset_index()
y_pred_df = y_pred_df.astype({"id": int})
y_pred_df

In [None]:
# Predict the test set with the stacked model plus high/low classifiers.
model = MixModelCL()
model.fit(X_train, y_train.ravel())

# Raw predictions are used; rounding them with np.round is left disabled.
y_pred = model.predict(X_test)

# Build the result table: the test index becomes a regular column, and its
# "id" values are cast to integers.
y_pred_df = pd.DataFrame(y_pred, columns=["y"], index=data_X_test.index)
y_pred_df = y_pred_df.reset_index()
y_pred_df = y_pred_df.astype({"id": int})
y_pred_df

In [None]:
# Predict the test set with a single CatBoost regressor.
param = dict(
    learning_rate=0.05,
    subsample=0.6,
    rsm=0.6,
    max_depth=5,
    iterations=2000,
)
model = cat.CatBoostRegressor(**param)
model.fit(X_train, y_train.ravel())

# Raw predictions are used; rounding them with np.round is left disabled.
y_pred = model.predict(X_test)

# Build the result table: the test index becomes a regular column, and its
# "id" values are cast to integers.
y_pred_df = pd.DataFrame(y_pred, columns=["y"], index=data_X_test.index)
y_pred_df = y_pred_df.reset_index()
y_pred_df = y_pred_df.astype({"id": int})
y_pred_df

### 输出模型结果

In [None]:
# Write the current prediction table to CSV without the DataFrame index.
# NOTE(review): the file name says "withround", but the prediction cells above
# leave rounding disabled - confirm the name matches the intended output.
submission_path = "mix_model_withround.csv"
y_pred_df.to_csv(submission_path, index=False)