# Project 1

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import torch
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu
from torch.optim import Adam, SGD

## Load Data

In [2]:
# load and split data
data_X_train = pd.read_csv('Data/X_train.csv', header=0, index_col=0)
data_y_train = pd.read_csv('Data/y_train.csv', header=0, index_col=0)
data_X_test = pd.read_csv('Data/X_test.csv', header=0, index_col=0)

In [3]:
# data info
data_X_train.describe()
"""
Data Shape: 1212 x 832
Data Lost: a lot
data scale: large
"""

'\nData Shape: 1212 x 832\nData Lost: a lot\ndata scale: large\n'

In [4]:
# transfer data to numpy
X_train = data_X_train.to_numpy()
y_train = data_y_train.to_numpy()
X_test = data_X_test.to_numpy()

## Data Preprocessing

## 归一化

In [5]:
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

### 处理缺省值

In [6]:
# KNN Imputer
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)

### 特征选择

In [7]:
# RFECV
estimator = LinearRegression()
selector = RFECV(estimator, step=1, cv=5, scoring='r2')
selector.fit(X_train, y_train)
feature_ranks = selector.ranking_

In [24]:
# store the feature ranking
pd.DataFrame(selector.ranking_).to_csv("Temp/feature_ranking.csv", header=False, index=False)
feature_ranks = pd.read_csv("Temp/feature_ranking.csv", header=None, index_col=None).to_numpy().reshape(-1)

In [21]:
# choose features (top 95%)
def select_features(x, rank, threshold=0.95):
    drop_feature_ids = np.where(rank > int(threshold * max(rank)))
    selected_x = np.delete(x, drop_feature_ids, axis=1)
    return selected_x

X_train = select_features(X_train, feature_ranks, 0.95)
X_test = select_features(X_test, feature_ranks, 0.95)

### 噪声探测

In [8]:
test = np.array([[1, 2, 3], [4, 5, 6]])
test.mean(axis=0)

array([2.5, 3.5, 4.5])

In [20]:
"""
！！！该方法具有较高的不确定性，不能保证有效
思路：
    1. 以每5年为一个period，计算数据到中心点的距离
    2. 剔除距离最大的5%的数据
"""
deleted_sample_ids = []
for start_year in np.arange(y_train.min(), y_train.max() + 1, 5):
    cluster_x_train = X_train[y_train >= start_year and y_train < start_year + 5] 
    center_point = np.mean(cluster_x_train, axis=0)

array([[1],
       [4]])