# Project 1

## Import Libraries

In [422]:
!pip install catboost



In [423]:
# import libraries (run)
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.decomposition import PCA

# outlier
from sklearn.base import OutlierMixin
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

import xgboost as xgb
import catboost as cat
import lightgbm as lgb

import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid
from torch.optim import Adam, SGD

## Load Data

In [424]:
# load and split data (run)
data_X_train = pd.read_csv('/content/X_train.csv', header=0, index_col=0)
data_y_train = pd.read_csv('/content/y_train.csv', header=0, index_col=0)
data_X_test = pd.read_csv('/content/X_test.csv', header=0, index_col=0)

In [425]:
X_train = data_X_train.to_numpy()
y_train = data_y_train.to_numpy()
X_test = data_X_test.to_numpy()

## Data Preprocessing

### 处理缺省值

In [426]:
# KNN Imputer (run)
# imputer = SimpleImputer(strategy='median')
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

### 归一化

In [427]:
# scalar (run)
x_scalar = RobustScaler()
X_train = x_scalar.fit_transform(X_train)
X_test = x_scalar.transform(X_test)

### 特征选择

#### 删除变化过小的列

In [428]:
# scalar (run)
del_columns_id_all0 = np.where(X_train.sum(axis=0) == 0)
X_train = np.delete(X_train, del_columns_id_all0, axis=1)
X_test = np.delete(X_test, del_columns_id_all0, axis=1)

#### 保留特征

In [429]:
# 使用selectkbest方法选择特征 (run)
skb = SelectKBest(f_regression, k=200)
X_train = skb.fit_transform(X_train, y_train.ravel())
X_test = skb.transform(X_test)
print(X_train.shape)

(1212, 200)


### 组合模型(最终使用)

In [441]:
class MixModel(object):
    def __init__(self):
        # 尽量选一些性能表现比较好的model
        self.cat = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.xgb = xgb.XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.05, n_jobs=20)
        self.lgbm = lgb.LGBMRegressor(n_estimators=150)
        self.gp1 = GaussianProcessRegressor(alpha=1e-09, kernel=Sum(Matern(length_scale=0.5, nu=1.5), RBF(length_scale=1)))
        self.gp2 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=0.5, nu=0.5))
        self.gp3 = GaussianProcessRegressor(alpha=1e-09, kernel=RBF(length_scale=1))
        self.lr = LinearRegression()
        self.isf = IsolationForest(n_estimators=150, random_state=0)
        self.gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1)
        self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.gbr]
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.isf, self.gbr]
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr]
        self.intermediate_prediction = []

        self.mix_model = cat.CatBoostRegressor(iterations=1500, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

    def fit(self, X: np.array, y: np.array) -> None:
        for model in self.basic_models:
            model.fit(X, y)
            self.intermediate_prediction.append(model.predict(X).reshape((-1, 1)))

        self.intermediate_prediction = np.concatenate(self.intermediate_prediction, axis=1)

        self.mix_model.fit(self.intermediate_prediction, y)

    def predict(self, X: np.array) -> np.array:
        intermediate_prediction = []
        for model in self.basic_models:
            intermediate_prediction.append(model.predict(X).reshape((-1, 1)))
        intermediate_prediction = np.concatenate(intermediate_prediction, axis=1)
        return self.mix_model.predict(intermediate_prediction)

# 最终使用该模型
class MixModelCL(object):
    def __init__(self):
        # 尽量选一些性能表现比较好的model
        self.cat = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.xgb = xgb.XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.05, n_jobs=20)
        self.lgbm = lgb.LGBMRegressor(n_estimators=200)
        self.gp1 = GaussianProcessRegressor(alpha=1e-09, kernel=Sum(Matern(length_scale=0.5, nu=1.5), RBF(length_scale=1)))
        self.gp2 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=0.5, nu=0.5))
        self.gp3 = GaussianProcessRegressor(alpha=1e-09, kernel=RBF(length_scale=1))
        self.gp4 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=1.0, nu=0.5))
        self.gp5 = GaussianProcessRegressor(alpha=1e-09, kernel=Matern(length_scale=1.5, nu=0.5))
        self.gp6 = GaussianProcessRegressor(alpha=1e-09, kernel=RBF(length_scale=0.5))
        self.lr = LinearRegression()
        self.isf = IsolationForest(n_estimators=200, random_state=0)
        self.gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1)
        self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.gbr]
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr, self.isf, self.gbr]
        # self.basic_models = [self.cat, self.xgb, self.lgbm, self.gp1, self.gp2, self.gp3, self.lr]
        self.intermediate_prediction = []

        self.mix_model = cat.CatBoostRegressor(iterations=1500, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

        self.high_model = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.low_model = cat.CatBoostRegressor(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)

        self.classify_high_model = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.classify_low_model = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, subsample=0.6, rsm=0.6, max_depth=5)
        self.skb = SelectKBest(f_regression, k=30)

    def fit(self, X: np.array, y: np.array) -> None:
        for model in self.basic_models:
            model.fit(X, y)
            self.intermediate_prediction.append(model.predict(X).reshape((-1, 1)))

        self.intermediate_prediction = np.concatenate(self.intermediate_prediction, axis=1)

        less_X = skb.fit_transform(X, y.ravel())
        self.intermediate_prediction = np.concatenate((less_X, self.intermediate_prediction), axis=1)

        self.mix_model.fit(self.intermediate_prediction, y)

        # process imbalanced data
        high_ids = np.where(y >= 80)
        low_ids = np.where(y <= 50)
        high_y_ones = (y >= 80).astype(int)
        low_y_ones = (y <= 50).astype(int)
        self.classify_high_model.fit(X, high_y_ones)
        self.classify_low_model.fit(X, low_y_ones)

        self.high_model.fit(X[high_ids], y[high_ids])
        self.low_model.fit(X[low_ids], y[low_ids])

    def predict(self, X: np.array) -> np.array:
        intermediate_prediction = []
        for model in self.basic_models:
            intermediate_prediction.append(model.predict(X).reshape((-1, 1)))
        intermediate_prediction = np.concatenate(intermediate_prediction, axis=1)

        less_X = skb.transform(X)
        intermediate_prediction = np.concatenate((less_X, intermediate_prediction), axis=1)

        pred = self.mix_model.predict(intermediate_prediction)

        # process imbalanced data
        high_id_mask = self.classify_high_model.predict(X)
        low_id_mask = self.classify_low_model.predict(X) == 1
        cooccur_mask = high_id_mask * low_id_mask
        high_id_mask = high_id_mask - cooccur_mask
        low_id_mask = low_id_mask - cooccur_mask

        high_pred = self.high_model.predict(X) * high_id_mask
        low_pred = self.low_model.predict(X) * low_id_mask
        cooccur_mask = high_id_mask * low_id_mask

        result = (pred + high_pred + low_pred + pred * (1 - high_id_mask) * (1 - low_id_mask)) / 2

        return result

## 预测

In [443]:
# 用mix model with classifier进行预测
model = MixModelCL()
model.fit(X_train, y_train.ravel())
# y_pred = np.round(model.predict(X_test), 2)
y_pred = model.predict(X_test)
decimal_part = y_pred - np.floor(y_pred)
y_pred = np.where((decimal_part < 0.15) | (decimal_part > 0.85), np.round(y_pred), np.round(y_pred, 3))
y_pred_df = pd.DataFrame(y_pred, columns=["y"], index=data_X_test.index).reset_index()
y_pred_df["id"] = y_pred_df["id"].astype(int)
y_pred_df

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
1000:	learn: 0.0016580	total: 33.2s	remaining: 33.1s
1001:	learn: 0.0016567	total: 33.2s	remaining: 33.1s
1002:	learn: 0.0016564	total: 33.2s	remaining: 33s
1003:	learn: 0.0016557	total: 33.3s	remaining: 33s
1004:	learn: 0.0016530	total: 33.3s	remaining: 33s
1005:	learn: 0.0016525	total: 33.3s	remaining: 32.9s
1006:	learn: 0.0016519	total: 33.3s	remaining: 32.9s
1007:	learn: 0.0016465	total: 33.4s	remaining: 32.8s
1008:	learn: 0.0016432	total: 33.4s	remaining: 32.8s
1009:	learn: 0.0016424	total: 33.4s	remaining: 32.8s
1010:	learn: 0.0016402	total: 33.5s	remaining: 32.7s
1011:	learn: 0.0016374	total: 33.5s	remaining: 32.7s
1012:	learn: 0.0016361	total: 33.5s	remaining: 32.7s
1013:	learn: 0.0016355	total: 33.5s	remaining: 32.6s
1014:	learn: 0.0016286	total: 33.6s	remaining: 32.6s
1015:	learn: 0.0016271	total: 33.6s	remaining: 32.5s
1016:	learn: 0.0016252	total: 33.6s	remaining: 32.5s
1017:	learn: 0.0016223	total: 33.7s	remaining: 32.5s
1018:	learn

Unnamed: 0,id,y
0,0,61.000
1,1,73.458
2,2,70.000
3,3,75.222
4,4,73.000
...,...,...
771,771,65.524
772,772,72.391
773,773,75.000
774,774,71.000


### 输出模型结果

In [445]:
y_pred_df.to_csv("result.csv", index=False)