In [26]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint
import datetime
from common import get_full_data
from ucimlrepo import fetch_ucirepo 
from df_encodings import label_encode

In [27]:
# DATASETS
def generated(noise_proportion=0.1, n_samples=10000, random_state=None):
    """Build a synthetic two-feature regression dataset with injected label noise.

    Features ``x`` and ``y`` are drawn uniformly from [-2, 2]. The clean target
    is ``x*y + cos(3*x*y)/2 - sin(2*y**2)/3``. Every target value then receives
    a uniform jitter of at most ``std/10`` (``std`` is the standard deviation of
    the clean target), and each sample is additionally hit, with probability
    ``noise_proportion``, by uniform noise drawn from [-1, 1].

    Parameters
    ----------
    noise_proportion : float, default 0.1
        Probability with which a sample receives the extra [-1, 1] noise.
    n_samples : int, default 10000
        Number of rows to generate.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. With ``None`` the global
        ``np.random`` state is used, so existing callers (including ones that
        call ``np.random.seed`` beforehand) see unchanged behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features with columns ``'x'`` and ``'y'`` and the noisy target named
        ``'f_xy'``.
    """
    # both the np.random module and a RandomState instance expose `.uniform`
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def f(x, y):
        xy = x * y
        return xy + np.cos(3 * xy) / 2 - np.sin(2 * y ** 2) / 3

    def add_noise(values):
        # boolean mask of the samples that receive the strong noise
        picked = rng.uniform(0, 1, len(values)) < noise_proportion
        noisy = np.copy(values)
        noisy[picked] += rng.uniform(-1, 1, size=int(picked.sum()))
        return noisy

    points = rng.uniform(-2, 2, (n_samples, 2))
    X = pd.DataFrame(points, columns=['x', 'y'])

    target = f(points[:, 0], points[:, 1])
    std = np.std(target)
    # light jitter on every sample, scaled by the spread of the clean target
    target = target + rng.uniform(-std, std, len(target)) / 10
    target = add_noise(target)
    return X, pd.Series(target, name='f_xy')
# load data
def steel_strength():
    """Load the steel strength CSV and hand features/target to ``get_full_data``.

    Features are every column except the first one and the last three; the
    target is the second-to-last column. The return value is whatever
    ``get_full_data`` produces for that pair.
    """
    frame = pd.read_csv("dataset/steel_strength.csv")
    # independent variables: skip the leading column and the three trailing ones
    features = frame.iloc[:, 1:-3]
    # dependent variable: second column from the end
    target = frame.iloc[:, -2]
    return get_full_data(features, target)

def renewable():
    """Load the renewable-energy CSV and expand its timestamp into numeric parts.

    The ISO-formatted ``Time`` column is parsed, removed, and replaced by
    ``month``, ``day``, ``hour`` and ``minute`` columns appended at the end.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        All remaining columns except the first as features, and the first
        remaining column as the target.
    """
    frame = pd.read_csv("dataset/Renewable.csv")
    stamps = frame["Time"].map(datetime.datetime.fromisoformat)
    frame = frame.drop(columns=["Time"])

    # one numeric column per calendar/clock component, in this fixed order
    for part in ("month", "day", "hour", "minute"):
        frame[part] = stamps.map(lambda t, part=part: getattr(t, part))

    target = frame.iloc[:, 0]
    features = frame.iloc[:, 1:]
    return features, target

def covertype():
    """Fetch UCI dataset 31 and turn it into a regression task on ``Elevation``.

    Features and targets delivered by ``fetch_ucirepo`` are joined column-wise;
    ``Elevation`` is then split off as the regression target, so the dataset's
    own target column(s) remain among the returned features.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Every column except ``Elevation``, and the ``Elevation`` column.
    """
    uci_id = 31
    repo_entry = fetch_ucirepo(id=uci_id)

    features: pd.DataFrame = repo_entry.data.features
    labels = repo_entry.data.targets

    # single table holding the original features and the original label(s)
    combined = pd.concat([features, labels], axis=1)

    # regress on Elevation instead of predicting the class label
    target = combined['Elevation']
    return combined.drop(columns=['Elevation']), target
def bikes():
    """Fetch UCI dataset 560 and build a regression task on ``Rented Bike Count``.

    The ``Date`` column is split on ``'/'`` into three string columns named
    ``days``, ``months`` and ``years`` (first, second and third piece). Features
    and targets are each passed through ``label_encode`` (first element of its
    result is used), joined column-wise, and ``Rented Bike Count`` is split off
    as the target.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Every encoded column except ``Rented Bike Count``, and that column.
    """
    uci_id = 560
    repo_entry = fetch_ucirepo(id=uci_id)

    features: pd.DataFrame = repo_entry.data.features
    labels = repo_entry.data.targets

    # list of the '/'-separated pieces of every date string
    date_parts = features['Date'].apply(lambda t: t.split('/'))
    features = features.drop(columns=["Date"])
    for position, column in enumerate(('days', 'months', 'years')):
        features[column] = date_parts.apply(lambda parts, position=position: parts[position])

    encoded = pd.concat([label_encode(features)[0], label_encode(labels)[0]], axis=1)
    target = encoded['Rented Bike Count']
    return encoded.drop(columns=['Rented Bike Count']), target

In [28]:
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
# Dataset under study: replace the call with another dataset loader to switch.
X, y = generated()

# Base regressor. For high-dimensional data use `gpu` for device if you have one.
special_model = XGBRegressor(n_jobs=-1, device='cpu')

# Robust-scaled copy of the feature matrix as a plain numpy array.
scaler = RobustScaler()
X_norm = scaler.fit_transform(X.to_numpy())

In [29]:
# search model optimal hyperparameters
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from common import XGB_search_params

params = XGB_search_params()
# One seed drives both the randomized search and the subsample split below;
# replace the randint call with a constant to make this cell reproducible.
state = randint(0,1000)
search = RandomizedSearchCV(
    special_model,
    params,
    n_iter=150,
    cv=5,
    random_state=state,
    n_jobs=-1
)

# amount of samples used for parameters search
search_space_samples=7000
# train_test_split needs at least one row on the discarded side
search_space_samples = min(search_space_samples, len(X) - 1)

# An integer test_size is an absolute row count, so exactly the requested
# number of rows is selected (a float ratio is rounded by train_test_split).
_,X_search,_,y_search = train_test_split(
    X,
    y,
    test_size=search_space_samples,
    random_state=state,
)

search.fit(X_search,y_search)
# keep the refitted best estimator as the model used by the following cells
special_model=search.best_estimator_

In [37]:
# do render of original data and model r2 score
from common import cross_val_score_mean_std
from kernel_pca_search import KernelPCASearchCV, kernel_pca_scorer
from render import *
from sklearn.model_selection import RepeatedKFold, cross_val_score

# metric reported for every filtering method, and the scorer built from it
scoring_metric = metrics.r2_score
scoring_name = scoring_metric.__name__
scoring = metrics.make_scorer(scoring_metric)

# rendering mode: "2D" or "3D"
setup = "2D"

# render_shuffle reorders the columns handed to the plot function; the target
# is appended after the KPCA columns in render_results, so its index is
# presumably n_components.
if setup == "3D":
    # 3d setup
    n_components = 5
    dot_size = 3
    plot_method = plot_3d_rgb
    axis_names = ['d1', 'd2', y.name]
    axis_sizes = [None, None, (min(y), max(y))]
    render_shuffle = [0, 1, 5, 2, 3, 4]
elif setup == "2D":
    # 2d setup
    n_components = 4
    dot_size = 5
    plot_method = plot_2d_rgb
    axis_names = ['d1', y.name]
    axis_sizes = [None, None]
    render_shuffle = [0, 4, 1, 2, 3]

# caps on the number of rows that are plotted / used to fit the kernel PCA
max_render = 10000
max_kpca_fit_values = 3000

# cross-validation scheme shared by every evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=50)

def render_results(X,y,scaler,outliers_mask,title="clean data"):
    """Score the model on (optionally filtered) data and plot its KPCA projection.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array``.
    y : pandas.Series
        Target values; ``y.name`` labels the printed scores.
    scaler : fitted transformer
        Applied to the features before the module-level ``pca`` transform.
    outliers_mask : boolean array-like or None
        ``True`` marks rows to drop. ``None`` means "use all rows".
    title : str, default "clean data"
        Label used in the printed header and passed to the plot function.

    Notes
    -----
    Relies on module-level names set up elsewhere in the notebook:
    ``special_model``, ``cv``, ``scoring_metric``, ``scoring_name``, ``pca``,
    ``max_render``, ``render_shuffle``, ``plot_method``, ``axis_names``,
    ``dot_size`` and ``axis_sizes``.

    When a mask is given but removes nothing, the function prints the removed
    fraction and returns immediately, before any cross-validation or
    projection is computed.
    """
    X_clean = np.array(X)
    y_clean = np.array(y)

    if outliers_mask is not None:
        # coerce to a plain boolean array so Series / list masks behave alike
        keep = ~np.asarray(outliers_mask, dtype=bool)
        X_clean = X_clean[keep]
        y_clean = y_clean[keep]

        removed_size = 1-len(y_clean)/len(y)
        print("removed ",removed_size)
        # nothing was filtered out: skip the costly CV, projection and plot
        if removed_size==0:
            return

    scorer = metrics.make_scorer(scoring_metric)
    cleaned_data_score=cross_val_score(special_model,X_clean,y_clean,cv=cv,scoring=scorer)

    # project at most max_render rows and append the target as the last column
    X_clean_small = pca.transform(scaler.transform(X_clean[:max_render]))
    to_render=np.concatenate([X_clean_small,y_clean[:max_render,np.newaxis]],axis=1)

    print(f"{scoring_name} on",title)
    cross_val_score_mean_std(cleaned_data_score,y.name)
    plot_method(to_render[:,render_shuffle],title,axis_names, template='plotly_dark',dot_size=dot_size,axis_sizes=axis_sizes)

# random row order; the first max_kpca_fit_values rows are used to fit the kernel PCA
indices = np.arange(len(X_norm))
np.random.shuffle(indices)
indices_small = indices[:max_kpca_fit_values]

# run the KPCA search on the subsample and keep only the `.kpca` object it exposes
pca = KernelPCASearchCV(
    kernel='rbf',
    n_iter=20,
    n_components=n_components,
).fit(X_norm[indices_small]).kpca

render_results(X, y, scaler, None, "original data")
# how well the chosen KPCA handles the dimensionality reduction: 0 is bad, 1 is perfect
print("Dim reduction quality",kernel_pca_scorer(pca,X_norm[indices[:max_render]]))

render.
r2_score on original data
-----------f_xy-----------
Mean  0.9760691620128465
Std  0.0015085568787799244


Dim reduction quality 0.9999994481108312


In [39]:
# find outliers by iterative filtering
from common import find_outliers
X_numpy = X.to_numpy()
y_numpy = y.to_numpy()

# fraction of samples to flag; also reused as the IsolationForest contamination
outliers_to_remove = 0.1

outliers_mask, pred_loss, score = find_outliers(
    X_numpy,
    y_numpy,
    special_model,
    outliers_to_remove=outliers_to_remove,
    evaluate_loss=metrics.mean_squared_error,
    gamma=0.99,
    iterations=5,
    repeats=3,
    cv=5,
    plot=False,
)

render_results(X, y, scaler, outliers_mask, "iterative filtering")

render.
removed  0.09970000000000001
r2_score on iterative filtering
-----------f_xy-----------
Mean  0.9941549841430204
Std  0.000389860354094782


In [32]:
# z-score method
from scipy import stats
# features and target side by side; the later detector cells reuse `data`
data = pd.concat([X, y], axis=1)

# a row is an outlier unless every column lies within `threshold` standard deviations
threshold = 2
z = np.abs(stats.zscore(data))
outliers_mask = ~((z < threshold).all(axis=1))

render_results(X, y, scaler, outliers_mask, "z-score filtering")

render.
removed  0.06000000000000005
r2_score on z-score filtering
-----------f_xy-----------
Mean  0.9694715861083127
Std  0.0015898675012498737


In [33]:
from sklearn.ensemble import IsolationForest

# contamination matches the fraction removed by the iterative filtering cell
clf = IsolationForest(contamination=outliers_to_remove, random_state=10)
# rows labelled -1 by fit_predict are treated as outliers
outliers_mask = clf.fit_predict(data) == -1
render_results(X, y, scaler, outliers_mask, "isolation forest filtering")

render.
removed  0.09999999999999998
r2_score on isolation forest filtering
-----------f_xy-----------
Mean  0.9648022110643044
Std  0.0023221085233968183


In [34]:
from sklearn.cluster import DBSCAN

def get_outliers_dbscan(X, eps=0.17, min_samples=15):
    """Return a boolean mask of the rows DBSCAN labels ``-1``.

    Parameters
    ----------
    X : array-like
        Data to cluster.
    eps, min_samples
        Forwarded unchanged to ``DBSCAN``.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
    return labels == -1


dbscan_outliers = get_outliers_dbscan(data)
render_results(X, y, scaler, dbscan_outliers, "dbscan filtering")

render.
removed  0.134
r2_score on dbscan filtering
-----------f_xy-----------
Mean  0.9916734000375007
Std  0.0005668238977198187


In [35]:
from sklearn.svm import OneClassSVM

def outliers_svm(X):
    """Return a boolean mask of the rows a one-class SVM labels ``-1``.

    The detector is an RBF-kernel ``OneClassSVM`` with ``nu=0.05`` and
    ``gamma=0.1``, fitted and evaluated on the same data ``X``.
    """
    detector = OneClassSVM(kernel="rbf", gamma=0.1, nu=0.05)
    return detector.fit_predict(X) == -1


svm_outliers = outliers_svm(data)
render_results(X, y, scaler, svm_outliers, "one class svm filtering")

render.
removed  0.050000000000000044
r2_score on one class svm filtering
-----------f_xy-----------
Mean  0.973779341398826
Std  0.002411889862620522
