# Project 1

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV, SelectKBest, r_regression
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import xgboost as xgb

import torch
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu
from torch.optim import Adam, SGD

## Load Data

In [2]:
# load and split data
data_X_train = pd.read_csv('Data/X_train.csv', header=0, index_col=0)
data_y_train = pd.read_csv('Data/y_train.csv', header=0, index_col=0)
data_X_test = pd.read_csv('Data/X_test.csv', header=0, index_col=0)

In [3]:
# data info
data_X_train.describe()
"""
Data Shape: 1212 x 832
Data Lost: a lot
data scale: large
"""

'\nData Shape: 1212 x 832\nData Lost: a lot\ndata scale: large\n'

In [4]:
# transfer data to numpy
X_train = data_X_train.to_numpy()
y_train = data_y_train.to_numpy()
X_test = data_X_test.to_numpy()

## Data Preprocessing

### 归一化

In [5]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

### 处理缺省值

In [6]:
# KNN Imputer
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

### 特征选择

#### 删除变化过小的列

In [7]:
del_columns_id_all0 = np.where(X_train.sum(axis=0) == 0)
X_train = np.delete(X_train, del_columns_id_all0, axis=1)
X_test = np.delete(X_test, del_columns_id_all0, axis=1)

#### 皮尔森系数

In [8]:
cc = r_regression(X_train, y_train.ravel())
del_columns_id_pearson = np.where(np.absolute(cc) <= 1e-2) # 删除pearson系数小于0.01的特征
X_train = np.delete(X_train, del_columns_id_pearson, axis=1)
X_test = np.delete(X_test, del_columns_id_pearson, axis=1)

#### 1. 删减特征

In [None]:
# RFECV
estimator = LinearRegression()
selector = RFECV(estimator, step=1, cv=5, scoring='r2')
selector.fit(X_train, y_train)
feature_ranks = selector.ranking_

In [7]:
# store the feature ranking
# pd.DataFrame(selector.ranking_).to_csv("Temp/feature_ranking.csv", header=False, index=False)
feature_ranks = pd.read_csv("Temp/feature_ranking.csv", header=None, index_col=None).to_numpy().reshape(-1)

In [8]:
# choose features (top 95%)
def select_features(x, rank, threshold=0.8):
    drop_feature_ids = np.where(rank > int(threshold * max(rank)))
    selected_x = np.delete(x, drop_feature_ids, axis=1)
    return selected_x

X_train = select_features(X_train, feature_ranks, 0.95)
X_test = select_features(X_test, feature_ranks, 0.95)

#### 保留特征

In [9]:
# 使用selectkbest方法选择特征
skb = SelectKBest(k=200)
X_train = skb.fit_transform(X_train,y_train.ravel())
X_test = skb.transform(X_test)
print(X_train.shape)

(1212, 200)


### 噪声探测

In [31]:
del_rows_id_all0 = np.where(X_train.std(axis=1) >= 1.5)
# X_train = np.delete(X_train, del_columns_id_all0, axis=1)
del_rows_id_all0[0]

array([167, 207, 213, 681, 740, 805], dtype=int64)

In [20]:
"""
！！！该方法具有较高的不确定性，不能保证有效
思路：
    1. 以每5年为一个period，计算数据到中心点的距离
    2. 剔除距离最大的5%的数据
"""
deleted_sample_ids = []
for start_year in np.arange(y_train.min(), y_train.max() + 1, 5):
    cluster_x_train = X_train[y_train >= start_year and y_train < start_year + 5] 
    center_point = np.mean(cluster_x_train, axis=0)

array([[1],
       [4]])

## Model Selection

### Linear Regression

In [15]:
# LR
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_score = 0
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = LinearRegression()
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_score += r2_score(fold_y_valid, fold_y_pred)
fold_score /= fold_num
print(fold_score)

-2.890191269931073


### Kernel Ridge Regression

In [20]:
# Ridge Regression
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_score = 0
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = KernelRidge(kernel="rbf")
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_score += r2_score(fold_y_valid, fold_y_pred)
fold_score /= fold_num
print(fold_score)

0.019787009919412422


### Gaussian Process Regressor

In [32]:
# Gaussian Process Regressor (Matern)
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = GaussianProcessRegressor(kernel=Matern(nu=0.5, length_scale=1), random_state=0)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.3749308857165471


In [38]:
# Gaussian Process Regressor (RBF)
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = GaussianProcessRegressor(kernel=RBF(length_scale=10), random_state=0)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

-0.01484695627455086


### SVR

In [47]:
# SVR
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = SVR(kernel="rbf")
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.27278034373557075


### Random Forest

In [62]:
# Random Forest Regressor
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=0, criterion="squared_error")
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.45761414755344837


In [67]:
# Isolation Forest Regressor
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = IsolationForest(n_estimators=100, max_features=0.6, random_state=0)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

-50.88352200279015


In [11]:
# Gradient Boosting Regressor
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = np.round(model.predict(fold_X_valid))

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.5033211490279413


In [72]:
# Adaboost Regressor
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, loss="square")
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.4110280800135076


### XGBoost

In [10]:
# XGBoost
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train)):
    # split validation data
    fold_X_train = X_train[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = xgb.XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, n_jobs=20)
    model.fit(fold_X_train, fold_y_train.ravel())
    fold_y_pred = np.round(model.predict(fold_X_valid))

    # calculate score
    fold_scores.append(r2_score(fold_y_valid, fold_y_pred))
fold_score = np.average(fold_scores)
print(fold_score)

0.5046748548918308


In [10]:
model = xgb.XGBRegressor(n_estimators=200, max_depth=7, learning_rate=0.1, n_jobs=20)
model.fit(X_train, y_train.ravel())
y_pred = np.round(model.predict(X_test))
y_pred_df = pd.DataFrame(y_pred, columns=["y"], index=data_X_test.index).reset_index()
y_pred_df["id"] = y_pred_df["id"].astype(int)
y_pred_df

Unnamed: 0,id,y
0,0,61.0
1,1,77.0
2,2,65.0
3,3,73.0
4,4,73.0
...,...,...
771,771,63.0
772,772,74.0
773,773,73.0
774,774,69.0


In [11]:
y_pred_df.to_csv("result.csv", index=False)