In [1]:
import os
import numpy as np 
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import scipy
import math
from sklearn.model_selection import GridSearchCV
from scipy.stats import wilcoxon
from RTER import RegressionTree
from comparison.ensemble import RegressionTreeBoosting, RegressionTreeEnsemble
from comparison.EKNN import EKNN
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from time import time


data_file_dir = "./data/"
#data_file_name_seq = ['ionosphere.csv','adult.csv','abalone.csv', 'australian.csv', 'breast-cancer.csv', 'credit.csv', 'parkinsons.csv', 'winequality-red.csv', 'winequality-white.csv', 'winequality.csv']
data_file_name_seq = ['possum.csv']
#data_file_name_seq=["lympho.csv","cardio.csv", "thyroid.csv","vowels.csv", "glass.csv", "musk.csv","letter.csv", "pima.csv", "satellite.csv", "pendigits.csv", "yeast.csv", "heart.csv"]
#data_file_name_seq=['ionosphere.csv','adult.csv', 'winequality.csv']

log_file_dir = "./realdata_result/"

def anll(pdf):
    return -np.log(pdf).mean()


for data_file_name in data_file_name_seq:
    # load dataset
    data_file_path = os.path.join(data_file_dir, data_file_name)
    data = pd.read_csv(data_file_path)
    #data = np.array(data)
    # dataset status
    data_name = os.path.splitext(data_file_name)[0]
    if data_name == 'possum':
        data = data.drop(columns=['case','Pop','sex'])
        #print(data)
        #import pdb;pdb.set_trace()

    num_samples = data.shape[0]
    num_features = data.shape[1]-1
    colname = list(data.columns)
    # transformation
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)
    
    data = data[np.logical_not(np.isnan(data).any(axis=1))]
    #import pdb;pdb.set_trace()

    #
    repeat_times = 5


    ##################################缩放的时候是否把训练集测试集同时缩放了？
    for i in range(repeat_times):

        # pca
        train_X, test_X = train_test_split(data, train_size=0.7, test_size=0.3, random_state= i )
        if data_name == 'possum':
            #df_train = pd.DataFrame(train_X, columns = colname)
            train_ys = train_X[:,2]
            train_xs = np.delete(train_X,2, axis=1)
            #df_test = pd.DataFrame(test_X, columns = colname)
            test_ys = test_X[:,2]
            test_xs = np.delete(test_X,2, axis=1)

        #import pdb;pdb.set_trace()

        # estimation

        
        time_start=time()
        parameters={"truncate_ratio_low":[0], "truncate_ratio_up":[0.4,0.6,0.8 ],
           "min_samples_split":[10,30], "max_depth":[1,2,4,6],
           "order":[0,1,3,6],"splitter":["varreduction"],
            "estimator":["pointwise_extrapolation_estimator"],
           "r_range_low":[0],"r_range_up":[1],
           "step":[1,2,4,8],"lamda":[0.001,0.01,0.1,1,5]}
        RTER_model=RegressionTree(min_samples_split=30, max_depth=3,parallel_jobs=0)
        RTER_model.fit(train_xs, train_ys) ##############
        mse_score=-RTER_model.score(test_xs, test_ys)
        time_end=time()
     
        log_file_name = "{}.csv".format("RTER")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{}\n".format(data_name,
                                          mse_score, time_end-time_start,
                                          i)
            f.writelines(logs)
        
        # boosting
        time_start=time()
        parameters={"rho":[0.01,0.05,0.1], "boost_num":[50,100,200], "min_samples_split":[10], "max_depth":[2,5,8],"splitter":["maxedge"]}
        cv_model_boosting=GridSearchCV(estimator=RegressionTreeBoosting(),param_grid=parameters, cv=10, n_jobs=-1)
        cv_model_boosting.fit(train_xs, train_ys)
        boosting_model = cv_model_boosting.best_estimator_
        mse_score= - boosting_model.score(test_xs, test_ys)
        log_file_name = "{}.csv".format("boosting")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        time_end=time()
        
        log_file_name = "{}.csv".format("boosting")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{}\n".format(data_name,
                                          mse_score, time_end-time_start,
                                          i)
            f.writelines(logs)
         
        # ensemble
        time_start=time()
        parameters={ "ensemble_num":[50,100,200], "min_samples_split":[10], "max_depth":[2,5,8],"splitter":["maxedge"]}
        cv_model_ensemble=GridSearchCV(estimator=RegressionTreeEnsemble(),param_grid=parameters, cv=10, n_jobs=-1)
        cv_model_ensemble.fit(train_xs, train_ys)
        ensemble_model = cv_model_ensemble.best_estimator_
        mse_score= - ensemble_model.score(test_xs, test_ys)
        time_end=time()
        
        log_file_name = "{}.csv".format("ensemble")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{}\n".format(data_name,
                                          mse_score, time_end-time_start,
                                          i)
            f.writelines(logs)
         
        # GBRT
        time_start=time()
        parameters= {"n_estimators":[500,1000,2000], "learning_rate":[0.01,0.05]}
        cv_model_GBRT=GridSearchCV(estimator=GradientBoostingRegressor(),param_grid=parameters, cv=10, n_jobs=-1)
        cv_model_GBRT.fit(train_xs, train_ys)
        model_GBRT = cv_model_GBRT.best_estimator_
        model_GBRT.fit(train_xs, train_ys.ravel())
        y_hat=model_GBRT.predict(test_xs)
        mse_score = MSE(y_hat, test_ys)
        time_end=time()
        
        log_file_name = "{}.csv".format("GBRT")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{},{},{}\n".format(data_name,
                                          mse_score, time_end-time_start,
                                          i)
            f.writelines(logs)
            
            
        # RF
        time_start=time()
        parameters = {"n_estimators":[10,100,200]}
        cv_model_RFR = GridSearchCV(estimator=RandomForestRegressor(),param_grid=parameters, cv=10, n_jobs=-1) 
        cv_model_RFR.fit(train_xs, train_ys)
        model_RFR = cv_model_RFR.best_estimator_
        model_RFR.fit(train_xs, train_ys)
        y_hat=model_RFR.predict(test_xs)
        mse_score = MSE(y_hat, test_ys)
        time_end=time()
        
        log_file_name = "{}.csv".format("RFR")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{}\n".format(data_name,
                                          mse_score, time_end-time_start,
                                          i)
            f.writelines(logs)
            
            
        '''   
        # EKNN
        time_start=time()
        parameters = {"V":[4,8,12,16], "C":[1,3,5,7,9,11],"alpha":[0.01,0.05,0.1]}
        cv_model_EKNN = GridSearchCV(estimator=EKNN(),param_grid=parameters, cv=10, n_jobs=-1) 
        cv_model_EKNN.fit(X_train, Y_train)
        model_EKNN = cv_model_EKNN.best_estimator_
        model_EKNN.fit(X_train, Y_train)
        y_hat=model_EKNN.predict(X_test)
        mse_score = MSE(y_hat, Y_test)
        time_end=time()
        
        log_file_name = "{}.csv".format("EKNN")
        log_file_path = os.path.join(log_file_dir, log_file_name)
        
        with open(log_file_path, "a") as f:
            logs= "{},{},{},{},{},{}\n".format(distribution_index,
                                          mse_score, time_end-time_start,
                                          iterate,n_train,n_test)
            f.writelines(logs)
        
        '''  

FileNotFoundError: [Errno 2] No such file or directory: './realdata_result/RTER.csv'

In [2]:
train_xs

array([[0.83333333, 0.25      , 0.24731183, 0.25581395, 0.36363636,
        0.13068182, 0.20754717, 0.7       , 0.3       , 0.23333333],
       [1.        , 0.375     , 0.35483871, 0.62790698, 0.5       ,
        0.23863636, 0.32075472, 0.42      , 0.5       , 0.4       ],
       [0.66666667, 0.        , 0.12903226, 0.25581395, 0.27272727,
        0.09659091, 0.13207547, 0.26      , 0.35      , 0.33333333],
       [0.5       , 0.125     , 0.70967742, 0.81395349, 0.54545455,
        0.69318182, 0.28930818, 0.72      , 0.85      , 0.73333333],
       [0.        , 0.375     , 0.3655914 , 0.55813953, 0.22727273,
        0.73295455, 0.79874214, 0.5       , 0.5       , 0.33333333],
       [0.        , 0.5       , 0.29032258, 0.72093023, 0.27272727,
        0.74431818, 0.85534591, 0.48      , 0.6       , 0.46666667],
       [0.5       , 0.125     , 0.41397849, 0.60465116, 0.63636364,
        0.15909091, 0.42138365, 0.32      , 0.4       , 0.33333333],
       [0.66666667, 0.375     , 0.4086021

In [8]:
np.isnan(data).sum()

0