In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from random import randint
from matplotlib import pyplot as plt

In [38]:
import datetime
from common import get_full_data
from ucimlrepo import fetch_ucirepo 
from df_encodings import label_encode

def generated(noise_proportion=0.1, n_samples=10000, seed=None):
    """Build a synthetic two-feature regression dataset with injected label noise.

    Points are drawn uniformly from the square [-2, 2] x [-2, 2]. The clean
    target is ``x*y + cos(3*x*y)/2 - sin(2*y**2)/3``. Every target value then
    receives a small uniform jitter of at most ``std(target)/10``, and each
    sample is additionally shifted by a uniform value in [-1, 1] with
    probability ``noise_proportion``.

    Parameters
    ----------
    noise_proportion : float, default 0.1
        Per-sample probability of receiving the extra [-1, 1] shift.
    n_samples : int, default 10000
        Number of rows to generate.
    seed : int or None, default None
        ``None`` draws from numpy's global random state (the historical
        behaviour, so ``np.random.seed`` still controls the result). Any other
        value seeds a private ``RandomState`` so the call is reproducible on
        its own.

    Returns
    -------
    X : pandas.DataFrame
        Shape ``(n_samples, 2)`` with columns ``'x'`` and ``'y'``.
    target : pandas.Series
        Noisy target values, named ``'f_xy'``.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    # Both branches expose the same `.uniform` API; the module itself stands
    # in for the global generator when no seed is requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def f(x, y):
        xy = x * y
        return xy + np.cos(3 * xy) / 2 - np.sin(2 * y ** 2) / 3

    def add_noise(y):
        # Boolean mask of the samples that receive the extra shift.
        random_ind = rng.uniform(0, 1, len(y)) < noise_proportion
        y = np.copy(y)
        y[random_ind] += rng.uniform(-1, 1, size=int(random_ind.sum()))
        return y

    xy = rng.uniform(-2, 2, (n_samples, 2))
    target = f(xy[:, 0], xy[:, 1])
    X = pd.DataFrame(xy, columns=['x', 'y'])

    # Small jitter on every sample, scaled by the spread of the clean target.
    std = np.std(target)
    target = target + rng.uniform(-std, std, len(target)) / 10
    target = add_noise(target)
    return X, pd.Series(target, name='f_xy')
# load data
def steel_strength():
    """Load the steel-strength CSV and pass features/target to ``get_full_data``.

    Features are every column except the first one and the last three; the
    target is the second-to-last column. The return value is whatever
    ``get_full_data`` produces for that pair.
    """
    table = pd.read_csv("dataset/steel_strength.csv")
    features = table.iloc[:, 1:-3]
    target = table.iloc[:, -2]
    return get_full_data(features, target)

def renewable():
    """Load the renewable-energy CSV and expand its timestamp into calendar parts.

    The ISO-formatted ``Time`` column is parsed, removed, and replaced by four
    trailing columns: ``month``, ``day``, ``hour`` and ``minute``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        All columns but the first (after ``Time`` is dropped) as features, and
        that first column as the target.
    """
    frame = pd.read_csv("dataset/Renewable.csv")
    stamps = frame["Time"].apply(datetime.datetime.fromisoformat)
    frame = frame.drop(columns=["Time"])

    # Appended in this order, so they end up as the last four columns.
    for part in ("month", "day", "hour", "minute"):
        frame[part] = stamps.apply(lambda t, part=part: getattr(t, part))

    return frame.iloc[:, 1:], frame.iloc[:, 0]

def covertype():
    """Fetch UCI repository dataset 31 and recast it as a regression on ``Elevation``.

    The repository's features and targets are joined column-wise, so the
    original target column(s) remain among the returned features; only
    ``Elevation`` is split off as the value to predict.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The joined frame without ``Elevation``, and the ``Elevation`` column.
    """
    repo_entry = fetch_ucirepo(id=31)

    features: pd.DataFrame = repo_entry.data.features
    labels = repo_entry.data.targets
    combined = pd.concat([features, labels], axis=1)

    target_column = 'Elevation'
    return combined.drop(columns=[target_column]), combined[target_column]
def bikes():
    """Fetch UCI repository dataset 560 and prepare it for regression on ``Rented Bike Count``.

    The ``Date`` strings are split on ``'/'`` into three pieces that replace the
    original column as ``days``, ``months`` and ``years`` (first, second and
    third piece respectively). Features and targets are each passed through
    ``label_encode`` (first element of its result is used) and joined
    column-wise before ``Rented Bike Count`` is split off.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The encoded frame without ``Rented Bike Count``, and that column.
    """
    repo_entry = fetch_ucirepo(id=560)

    features: pd.DataFrame = repo_entry.data.features
    labels = repo_entry.data.targets

    date_parts = features['Date'].apply(lambda t: t.split('/'))
    features = features.drop(columns=["Date"])
    for position, column in enumerate(('days', 'months', 'years')):
        features[column] = date_parts.apply(lambda t, position=position: t[position])

    encoded = pd.concat([label_encode(features)[0], label_encode(labels)[0]], axis=1)
    target_column = 'Rented Bike Count'
    return encoded.drop(columns=[target_column]), encoded[target_column]

In [39]:
from xgboost import XGBRegressor
# Dataset under study: the synthetic benchmark (feature frame + noisy target series).
X, y = generated()

# Base regressor used by the rest of the notebook. It runs on the CPU here;
# for high-dimensional data switch the device to `gpu` if you have one.
xgb_device = 'cpu'
special_model = XGBRegressor(device=xgb_device)

In [40]:
from sklearn.decomposition import PCA, KernelPCA as KPCA
from render import *
from sklearn.preprocessing import StandardScaler

# Rendering preset: either "3D" or "2D".
setup = "3D"

# Each preset fixes the kernel-PCA width, the marker size, the axis labels and
# the column order handed to the plotting function. The target is appended as
# the last column of the embedded matrix, i.e. at index `n_components`.
if setup == "3D":
    render_shuffle = [0, 1, 5, 2, 3, 4]
    dot_size = 3
    n_components = 5
    axis_names = ['d1', 'd2', y.name]
    plot_method = plot_3d_rgb
elif setup == "2D":
    render_shuffle = [0, 4, 1, 2, 3]
    dot_size = 5
    n_components = 4
    axis_names = ['d1', y.name]
    plot_method = plot_2d_rgb
else:
    # Fail here instead of silently reusing values left over from a previous
    # run of this cell (or hitting a NameError further down).
    raise ValueError(f"unknown setup {setup!r}; expected '3D' or '2D'")


scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

# Upper bound on the number of rows embedded and plotted.
max_render = 10000

pca = KPCA(n_components=n_components, kernel='rbf')
# pca = PCA(n_components=n_components)
X_small = pca.fit_transform(X_norm[:max_render])
y_n = y.to_numpy()[:max_render, np.newaxis]
X_small = np.concatenate([X_small, y_n], axis=1)
# print(sum(pca.explained_variance_ratio_))
plot_method(X_small[:, render_shuffle], "original data", axis_names, template='plotly_dark', dot_size=dot_size)

In [41]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from common import XGB_search_params

params = XGB_search_params()

# A fresh seed is drawn on every run; print it so a particular run can be
# reproduced, and reuse it for the subsampling split below.
state = randint(0, 1000)
print("random state:", state)
search = RandomizedSearchCV(
    special_model,
    params,
    n_iter=150,
    cv=5,
    random_state=state,
    n_jobs=-1
)

# amount of samples used for parameters search
search_space_samples = 7000

# train_test_split needs at least one row on the other side of the split
search_space_samples = min(search_space_samples, len(X) - 1)

# An integer test_size selects exactly that many rows (no float rounding).
_, X_search, _, y_search = train_test_split(
    X, y, test_size=search_space_samples, random_state=state
)

search.fit(X_search, y_search)
special_model = search.best_estimator_

In [42]:
# repeated (non-stratified) k-fold cross-validation of the regressor, scored with R^2
from sklearn.model_selection import RepeatedKFold, cross_val_score
from common import cross_val_score_mean_std

# 5 folds repeated twice with a fixed seed; the same splitter is reused by later cells.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=50)
r2_scoring = metrics.make_scorer(metrics.r2_score)

print("r2 scoring")
# R^2 of the tuned model on the full (uncleaned) data, one value per fold.
baseline_r2 = cross_val_score(special_model, X, y, cv=cv, scoring=r2_scoring)
cross_val_score_mean_std(baseline_r2, y.name)

r2 scoring
-----------f_xy-----------
Mean  0.9773188704974057
Std  0.0022086513736600767


In [43]:
# New method
from common import find_outliers
X_numpy, y_numpy = X.to_numpy(), y.to_numpy()

# Settings forwarded to `find_outliers` as keyword arguments.
outlier_search_options = dict(
    outliers_to_remove=0.1,
    iterations=5,
    gamma=0.99,
    evaluate_loss=metrics.mean_squared_error,
    cv=5,
    repeats=3,
    plot=False,
)
outliers_mask, score = find_outliers(X_numpy, y_numpy, special_model, **outlier_search_options)

# Fraction of rows flagged by the mask.
print("removed ",np.sum(outliers_mask)/len(y))

# Keep only the rows that were not flagged.
keep_mask = ~outliers_mask
X_clean = X_numpy[keep_mask]
y_clean = y_numpy[keep_mask]

# Same R^2 cross-validation as before, now on the cleaned data.
r2_scoring = metrics.make_scorer(metrics.r2_score)
print("r2 score")
clean_r2 = cross_val_score(special_model, X_clean, y_clean, cv=cv, scoring=r2_scoring)
cross_val_score_mean_std(clean_r2, y.name)


removed  0.0997
r2 score
-----------f_xy-----------
Mean  0.9941582692683436
Std  0.00025953804924918613


In [44]:
X_clean = X_numpy[~outliers_mask]
# Column vector so the target can be concatenated next to the embedding.
y_clean = y_numpy[~outliers_mask][:, np.newaxis]

# `scaler` was fitted on the DataFrame `X`; hand it a frame with the same
# column names, otherwise sklearn warns "X does not have valid feature names".
X_clean_frame = pd.DataFrame(X_clean[:max_render], columns=X.columns)
X_clean_small = pca.transform(scaler.transform(X_clean_frame))

# Both pieces are already capped at `max_render` rows.
to_render = np.concatenate([X_clean_small, y_clean[:max_render]], axis=1)
plot_method(to_render[:, render_shuffle], "clean data", axis_names, template='plotly_dark', dot_size=dot_size)


X does not have valid feature names, but StandardScaler was fitted with feature names



In [140]:
from common import test_ridge_fit
# Ridge-fit check before and after outlier removal.
# NOTE(review): the two calls use different `degree` values (3 vs 2), so the
# reported scores are not a like-for-like comparison — confirm this is intended.
print("original data:")
test_ridge_fit(X, y, degree=3)

print()

print("cleaned data:")
# `y_clean` is a column vector at this point; rebuild a named 1-D series from it.
y_clean_series = pd.Series(y_clean[:, 0], name=y.name)
test_ridge_fit(X_clean, y_clean_series, degree=2)

original data:
r2 score of poly-features ridge regression
-----------f_xy-----------
Mean  0.16138429262408432
Std  0.7508054671089902

cleaned data:
r2 score of poly-features ridge regression
-----------f_xy-----------
Mean  0.34583676627532794
Std  0.8909556822958924


In [25]:
# # z-score method
# from scipy import stats
# data = pd.concat([X,y],axis=1)
# z = np.abs(stats.zscore(data))
# threshold = 100
# data_clean = data[(z < threshold).all(axis=1)]
# X_clean=data_clean.iloc[:,:-1]
# y_clean=data_clean.iloc[:,-1]
# y_clean = np.array(y_clean)[:,np.newaxis]
# X_cleana=scaler.transform(X_clean)

# # # r2_scoring = metrics.make_scorer(metrics.r2_score)
# # # print("r2 score")
# # # cross_val_score_mean_std(cross_val_score(special_model,X_clean,y_clean,cv=cv,scoring=r2_scoring),y.name)

# X_clean_small = pca.transform(X_clean)
# to_render=np.concatenate([X_clean_small,y_clean],axis=1)
# plot_2d_rgb(to_render[:,[4,3,2,1,0]],"z-score data",["d1","d2",'d3'],template='plotly_dark',dot_size=8)

In [26]:
# X_clean_small.shape

In [27]:
# import pandas as pd

# # Assuming 'data' is a Pandas DataFrame
# Q1 = data.quantile(0.05)
# Q3 = data.quantile(0.95)
# IQR = Q3 - Q1
# data_clean = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

# X_clean=data_clean.iloc[:,:-2]
# y_clean=data_clean.iloc[:,-1]

# r2_scoring = metrics.make_scorer(metrics.r2_score)
# cross_val_score_mean_std(cross_val_score(special_model,X_clean,y_clean,cv=cv,scoring=r2_scoring),y.name)

In [28]:
# from sklearn.ensemble import IsolationForest

# clf = IsolationForest(random_state=50)
# outliers_pred=clf.fit_predict(data)

# data_clean = data[outliers_pred==1]

# X_clean=data_clean.iloc[:,:-2]
# y_clean=data_clean.iloc[:,-1]

# r2_scoring = metrics.make_scorer(metrics.r2_score)
# cross_val_score_mean_std(cross_val_score(special_model,X_clean,y_clean,cv=cv,scoring=r2_scoring),y.name)