In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.utils import resample
from sklearn.neural_network import MLPRegressor
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

from time import time
import operator

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf

from scipy.stats import norm

tf.logging.set_verbosity(tf.logging.ERROR)

In [20]:

# IMPORT DATA

def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].replace(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

train, test = load_data()
random_seed = 8888
count = 0

In [21]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]


train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

In [23]:
cols_dnn = cols_orig

average_RMSE = 0.0
n_splits = 5
    

kf = KFold(n_splits=n_splits)
nb_fold = 0
for train_index, validation_index in kf.split(train):
    nb_fold += 1
    train_fold, validation_fold = train.loc[train_index], train.loc[validation_index]
        

    def input_fn(data_set):
        feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
        labels = tf.constant(data_set["y"].values)
        return feature_cols, labels
        
      
    model_dnn = MLPRegressor(hidden_layer_sizes=(10,),
                                 max_iter=1000,
                                 early_stopping=True,
                                 alpha=parameters["l2_reg"],
                                 random_state=random_seed)
    
    model_dnn.fit(train_fold[cols_dnn], train_fold["y"])

    train_pred = train_fold[["id"]].assign(y_hat=0)
        
    y_hat = model_dnn.predict(train_fold[cols_dnn])
        
    train_pred["y_hat"] = y_hat

    y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

    RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train_fold["y"]))
    print("Pruning {0} RMSE: {1}".format(count, RMSE))
        
    # Prune outliers
    RMSE_decreasing = True
    while (RMSE_decreasing):
        count +=1
        train_pred["y_med"] = train_pred["id"].map(y_hat_med)
        train_pred["y"] = train_pred["id"].map(train_fold["y"])

        # Distance from the median for each bag
        train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
        #Rank of each instance by bag
        train_pred["rank"] = train_pred.groupby("id")["score"].rank()
        bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
        train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
        train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

        # Remove outliers
        outliers_index = train_pred["rank"] > (1 - 0.02)
        train_pred = train_pred.loc[~outliers_index, :].reset_index(drop=True)
        train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
        
        # Remove bags
        y_hat_mean = train_pred.groupby("id").mean()["y_hat"].to_dict()
        train_pred["y_mean"] = train_pred["id"].map(y_hat_mean)
        train_pred["bag_score"] = (train_pred["y_mean"] - train_pred["y"])**2
        
        train_pred["bag_rank"] = train_pred.groupby("id")["bag_score"].rank()
        number_of_bags = len(train_pred.id.unique())
        train_pred["bag_rank"] = train_pred["bag_rank"] / number_of_bags
        
        outliers_index2 = train_pred["bag_rank"] > (1 - 0.02)
        #train_pred = train_pred.loc[~outliers_index2, :].reset_index(drop=True)
        train_fold = train_fold.loc[~outliers_index2, :].reset_index(drop=True)
            
        model_dnn = MLPRegressor(hidden_layer_sizes=(10,),
                                     max_iter=1000,
                                     early_stopping=True,
                                    alpha=parameters["l2_reg"],
                                     random_state=random_seed)
        
        model_dnn.fit(train_fold[cols_dnn], train_fold["y"])
            
            # Compute new RMSE
        train_pred = train_fold[["id"]].assign(y_hat=0)
            
            
        y_hat = model_dnn.predict(train_fold[cols_dnn])
            #np.array(list(itertools.islice(temp, 0, None)))
        train_pred["y_hat"] = y_hat

            # Use median value by id
        y_hat_mean = train_pred.groupby("id").mean()["y_hat"].to_dict()

        new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_mean), train_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(count, new_RMSE))
            
        if (abs(new_RMSE - RMSE) > 0.0025):
            
            RMSE = new_RMSE
        else:
            RMSE_decreasing = False
        
    validation_pred = validation_fold[["id"]].assign(y_hat=0)
    
    y_hat = model_dnn.predict(validation_fold[cols_dnn])
            #np.array(list(itertools.islice(temp, 0, None)))
        
    # PRUNE VAL SET
        
    validation_pred["y_hat"] = y_hat

    y_hat_med = validation_pred.groupby("id").median()["y_hat"].to_dict()

    RMSE = np.sqrt(mean_squared_error(validation_pred["id"].map(y_hat_med).values, validation_fold["y"]))
    print("Pruning {0} RMSE: {1}".format(count, RMSE))
        
    # Prune outliers
    RMSE_decreasing = True
    while (RMSE_decreasing):
        count +=1
        validation_pred["y_med"] = validation_pred["id"].map(y_hat_med)

        # Distance from the median for each bag
        validation_pred["score"] = (validation_pred["y_hat"] - validation_pred["y_med"])**2
        #Rank of each instance by bag
        validation_pred["rank"] = validation_pred.groupby("id")["score"].rank()
        bag_size_dict = validation_pred.groupby("id")["score"].count().to_dict()
        validation_pred["bag_size"] = validation_pred["id"].map(bag_size_dict)
        validation_pred["rank"] = validation_pred["rank"] / validation_pred["bag_size"]

        # Remove outliers
        outliers_index = validation_pred["rank"] > (1 - 0.02)
        #validation_pred = validation_pred.loc[~outliers_index, :].reset_index(drop=True)
        validation_fold = validation_fold.loc[~outliers_index, :].reset_index(drop=True)
            
            # Compute new RMSE
        validation_pred = validation_fold[["id"]].assign(y_hat=0)
            
            
        y_hat = model_dnn.predict(validation_fold[cols_dnn])
            #np.array(list(itertools.islice(temp, 0, None)))
        validation_pred["y_hat"] = y_hat

            # Use median value by id
        y_hat_mean = validation_pred.groupby("id").mean()["y_hat"].to_dict()

        new_RMSE = np.sqrt(mean_squared_error(validation_pred["id"].map(y_hat_mean), validation_fold["y"]))
        print("Pruning {0} RMSE: {1}".format(count, new_RMSE))
            
        if (abs(new_RMSE - RMSE) > 0.0025):
            
            RMSE = new_RMSE
        else:
            RMSE_decreasing = False
            
    y_hat = model_dnn.predict(validation_fold[cols_dnn])
    
    validation_pred["y_hat"] = y_hat
    y_hat_mean = validation_pred.groupby("id").mean()["y_hat"].to_dict()
    RMSE = np.sqrt(mean_squared_error(validation_fold["id"].map(y_hat_mean).values, validation_fold["y"]))
        
    average_RMSE += RMSE
    
    print("Validation fold {0} RMSE: {1}".format(nb_fold, RMSE))

average_RMSE /= n_splits

print("Cross-validation score: {0}\n".format(average_RMSE))
    


Pruning 22 RMSE: 0.664492983216352
Pruning 23 RMSE: 0.6592802332252014
Pruning 24 RMSE: 0.6488767212490174
Pruning 25 RMSE: 0.6440220044258622
Pruning 26 RMSE: 0.6448665249174638
Pruning 26 RMSE: 0.6337929111353886
Pruning 27 RMSE: 0.6338485033862755
Validation fold 1 RMSE: 0.6338485033862755
Pruning 27 RMSE: 0.6408909647727296
Pruning 28 RMSE: 0.6341245295759226
Pruning 29 RMSE: 0.6252991499928274
Pruning 30 RMSE: 0.6358270130724287
Pruning 31 RMSE: 0.6321307778980557
Pruning 32 RMSE: 0.6178406352471559
Pruning 33 RMSE: 0.6193021544132367
Pruning 34 RMSE: 0.6228607269236701
Pruning 35 RMSE: 0.6216035344986838
Pruning 36 RMSE: 0.6139450289933974
Pruning 37 RMSE: 0.612540027019624
Pruning 38 RMSE: 0.6048404001593066
Pruning 39 RMSE: 0.6113359998142704
Pruning 40 RMSE: 0.6164687194331823
Pruning 41 RMSE: 0.6149720517769527
Pruning 42 RMSE: 0.6112682501319839
Pruning 43 RMSE: 0.6132182251822867
Pruning 44 RMSE: 0.6038391880532306
Pruning 45 RMSE: 0.6140527032296857
Pruning 46 RMSE: 0.6202

In [None]:
# TRAIN THE MODEL 

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels
        
      
model_dnn = MLPRegressor(hidden_layer_sizes=(10,),
                                 max_iter=1000,
                                 early_stopping=True,
                                 alpha=parameters["l2_reg"],
                                 random_state=random_seed)
    
model_dnn.fit(train_fold[cols_dnn], train_fold["y"])

train_pred = train_fold[["id"]].assign(y_hat=0)
        
y_hat = model_dnn.predict(train_fold[cols_dnn])
        
train_pred["y_hat"] = y_hat

y_hat_med = train_pred.groupby("id").median()["y_hat"].to_dict()

RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_med).values, train_fold["y"]))
print("Pruning {0} RMSE: {1}".format(count, RMSE))
        
# Prune outliers
RMSE_decreasing = True
while (RMSE_decreasing):
    count +=1
    train_pred["y_med"] = train_pred["id"].map(y_hat_med)
    train_pred["y"] = train_pred["id"].map(train_fold["y"])

    # Distance from the median for each bag
    train_pred["score"] = (train_pred["y_hat"] - train_pred["y_med"])**2
    #Rank of each instance by bag
    train_pred["rank"] = train_pred.groupby("id")["score"].rank()
    bag_size_dict = train_pred.groupby("id")["score"].count().to_dict()
    train_pred["bag_size"] = train_pred["id"].map(bag_size_dict)
    train_pred["rank"] = train_pred["rank"] / train_pred["bag_size"]

    # Remove outliers
    outliers_index = train_pred["rank"] > (1 - 0.02)
    train_pred = train_pred.loc[~outliers_index, :].reset_index(drop=True)
    train_fold = train_fold.loc[~outliers_index, :].reset_index(drop=True)
        
    # Remove bags
    y_hat_mean = train_pred.groupby("id").mean()["y_hat"].to_dict()
    train_pred["y_mean"] = train_pred["id"].map(y_hat_mean)
    train_pred["bag_score"] = (train_pred["y_mean"] - train_pred["y"])**2
        
    train_pred["bag_rank"] = train_pred.groupby("id")["bag_score"].rank()
    number_of_bags = len(train_pred.id.unique())
    train_pred["bag_rank"] = train_pred["bag_rank"] / number_of_bags
        
    outliers_index2 = train_pred["bag_rank"] > (1 - 0.02)
    #train_pred = train_pred.loc[~outliers_index2, :].reset_index(drop=True)
    train_fold = train_fold.loc[~outliers_index2, :].reset_index(drop=True)
            
    model_dnn = MLPRegressor(hidden_layer_sizes=(10,),
                                     max_iter=1000,
                                     early_stopping=True,
                                    alpha=parameters["l2_reg"],
                                     random_state=random_seed)
        
    model_dnn.fit(train_fold[cols_dnn], train_fold["y"])
            
            # Compute new RMSE
    train_pred = train_fold[["id"]].assign(y_hat=0)
            
            
    y_hat = model_dnn.predict(train_fold[cols_dnn])
            #np.array(list(itertools.islice(temp, 0, None)))
    train_pred["y_hat"] = y_hat

            # Use median value by id
    y_hat_mean = train_pred.groupby("id").mean()["y_hat"].to_dict()

    new_RMSE = np.sqrt(mean_squared_error(train_pred["id"].map(y_hat_mean), train_fold["y"]))
    print("Pruning {0} RMSE: {1}".format(count, new_RMSE))
            
    if (abs(new_RMSE - RMSE) > 0.0025):
            
        RMSE = new_RMSE
    else:
        RMSE_decreasing = False
        
validation_pred = validation_fold[["id"]].assign(y_hat=0)
    
y_hat = model_dnn.predict(validation_fold[cols_dnn])

In [24]:
# SET UP MODEL FIT

feature_cols = [tf.contrib.layers.real_valued_column(k) for k in cols_dnn]
        
model_dnn = tf.contrib.learn.DNNRegressor(feature_columns=feature_cols, hidden_units=[10])

def input_fn(data_set):
    feature_cols = {k: tf.constant(data_set[k].values) for k in cols_dnn}
    labels = tf.constant(data_set["y"].values)
    return feature_cols, labels


KeyboardInterrupt: 

In [25]:
# MODEL FIT

model_dnn.fit(input_fn=lambda: input_fn(train), steps=900)

TypeError: fit() got an unexpected keyword argument 'input_fn'

In [173]:
# GET THE COMPUTED TRAINED PREDICTIONS

train_pred = train[["id"]].assign(y_hat=0)
temp = model_dnn.predict(input_fn=lambda: input_fn(train))
# .predict() returns an iterator; convert to an array
y_hat = np.array(list(itertools.islice(temp, 0, None)))
train_pred["y_hat"] = y_hat





In [375]:
# COMPUTE MEAN AND SD FOR EACH 100 Y_hat

mean_y_hat = []
sd_y_hat = []


for i in range(1,981):
    xx1 = (train_pred[train_pred["id"] == i])
    mean_y_hat.append((np.median(xx1.loc[:,"y_hat"])))
    
for i in range(1,981):
    xx2 = (train_pred[train_pred["id"] == i])
    sd_y_hat.append(np.std(xx2.loc[:,"y_hat"]))
    


In [376]:
# COMPUTE THE CI 80%

CI = []
CI_upper = []
CI_lower = []

for i in range(0,980):
    CI_upper.append(mean_y_hat[i]+(2.576*(sd_y_hat[i]/np.sqrt(np.count_nonzero([train_pred["id"] == (i+1)])))))
    CI_lower.append(mean_y_hat[i]-(2.576*(sd_y_hat[i]/np.sqrt(np.count_nonzero([train_pred["id"] == (i+1)])))))

    
CI = ([CI_lower,CI_upper])     
CI_t = np.transpose(CI)



In [377]:
# GET THE INDEX OF THE INSTANCES THAT ARE IN THE 80 % CI
kept_instances = []

for i in range(0,98000):
    if ((train_pred.iloc[i,1] > CI_lower[int(np.floor((i/100)))]) & (train_pred.iloc[i,1] < CI_upper[int(np.floor((i/100)))])):
        kept_instances.append(i)

np.shape(kept_instances)

(24523,)

In [378]:
id2 = []

for i in range(0,98000):
    id2.append(i)

In [379]:
ID2 = ["id2"]
id2_Array = np.asarray(id2)
df_id2 = pd.DataFrame(id2_Array, columns=ID2)
df_id2_y_hat = pd.concat([train_pred,df_id2], axis=1)

df_kept_y_hat = df_id2_y_hat[df_id2_y_hat['id2'].isin(np.asarray(kept_instances))]

In [380]:
# Put the y with the y_hat
y_values = []
count_array = []
count979 = []

for i in range(0,980):
    y_values.append(train.iloc[i*100,13])
    count_array.append((np.count_nonzero(df_kept_y_hat[df_kept_y_hat["id"] == (i+1)])/3))
    
new_y = np.repeat(y_values, count_array, axis=0)

for i in range (1,981):
    count979.append(i)
    
id3 = np.repeat(count979, count_array, axis=0)
    

In [381]:
#CONCATENATE AS A DATA FRAME
Y = ["y"]
new_index = []

new_y_Array = np.asarray(new_y)
df_new_y = pd.DataFrame(new_y_Array, columns=Y)
array1 = np.asarray(df_kept_y_hat.loc[:,"id"])
array2 = np.asarray(df_kept_y_hat.loc[:,"y_hat"])
array3 = np.asarray(df_kept_y_hat.loc[:,"id2"])

for i in range(0,np.shape(df_kept_y_hat)[0]):
    new_index.append(str(i))

# different indexing as df_kept_y_hat
df_new_index = pd.DataFrame({'id': array1 ,'y_hat': array2, 'id2': array3},index=new_index)



In [382]:
# SPREAD BETWEEN mean of y_hat and true y
id3_array = np.asarray(id3)
df_id3 = pd.DataFrame(id3_array, columns=["id"])
df_trueY_withID = pd.concat([df_id3,df_new_y], axis=1)

RMSE = []
Difference = []

for i in range(1,981):
    xxx = (df_trueY_withID[df_trueY_withID["id"] == i])
    xxx2 = (df_new_index[df_new_index["id"] == i])
    mean_tmp1 = np.mean(xxx2.loc[:,"y_hat"])
    mean_tmp2 = (xxx.loc[:,"y"])
    Difference.append(mean_tmp1-mean_tmp2)
    RMSE.append(np.sqrt(np.mean(np.sum(np.square(mean_tmp1-mean_tmp2)))))
    


In [409]:
# Reshape the training set without the outliers

RMSE_array = np.asarray(RMSE)
outliers = np.argwhere(RMSE_array > 7)
outliers = np.reshape(outliers, (np.shape(outliers)[0],))

train2 = pd.concat([train,df_id2], axis=1)

train2_with_instances_removed = train2[train2['id2'].isin(np.asarray(kept_instances))]
train2_with_bags_removed = train2_with_instances_removed[train2_with_instances_removed['id'].isin(outliers)==False]




Unnamed: 0,reflectance_0,reflectance_1,reflectance_2,reflectance_3,reflectance_4,reflectance_5,reflectance_6,solar_0,solar_1,solar_2,solar_3,solar_4,id,y,id2
7,-0.681429,-0.668911,-0.263843,-0.168792,-0.060708,-0.532096,-0.163504,0.674068,-1.148351,0.95523,0.205882,-1.105008,1,-3.998082,7
10,-0.415577,-0.441,-0.199816,0.010904,0.092415,-0.422702,-0.023154,0.674068,-1.148351,0.95523,0.205882,-1.105008,1,-3.998082,10
18,-0.685702,-0.733025,-0.28824,-0.133886,-0.057212,-0.629416,-0.146956,0.674068,-1.148351,0.95523,0.205882,-1.105008,1,-3.998082,18
24,-0.668289,-0.677004,-0.386083,-0.212368,-0.110722,-0.584201,-0.19925,0.674068,-1.148351,0.95523,0.205882,-1.105008,1,-3.998082,24
25,-0.821525,-0.894904,-0.339823,-0.192117,-0.14781,-0.735597,-0.222946,0.674068,-1.148351,0.95523,0.205882,-1.105008,1,-3.998082,25


In [None]:
# GET THE MEAN AND STD OF THE NEW TRAINING SET

#mean_kept_y = []
#sd_kept_y = []

#for i in range(1,981):
 #   xx3 = (df_kept_y_hat[df_kept_y_hat["id"] == i])
    #mean_kept_y.append((np.mean(xx3.loc[:,"y_hat"])))
     #sd_kept_y.append((np.std(xx3.loc[:,"y_hat"])))
    
#for i in range(0,980):
   # numpy.random.normal(loc = mean_kept_y[i], scale = sd_kept_y[i], size=100)

In [33]:
id_y_dict = dict(zip(train["id"], train["y"]))

train_pred["y"] = train_pred["id"].replace(id_y_dict)

np.shape(train_pred)

(784, 3)

In [8]:
true_y = []
predicted_y_training = []


for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    true_y.append(xx.iloc[0,2])
    
for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    predicted_y_training.append(xx.loc[:,"y_hat"])

    

In [9]:
np.shape(predicted_y_training)

(980, 100)

In [10]:
FEATURES2 = []
COLUMNS2 = []
for i in range(1,101):
    COLUMNS2.append(str(i))
    FEATURES2.append(str(i))
    
LABEL = ["y"]
ID = ["id"]

id_column = []
for i in range(1,981):
    id_column.append(i)
    

In [11]:
predicted_y_trainingArray = np.asarray(predicted_y_training)
true_yArray = np.asarray(true_y)
id_columnArray = np.asarray(id_column)

df_predicted_y = pd.DataFrame(predicted_y_trainingArray, columns=COLUMNS2)
df_true_y =  pd.DataFrame(true_yArray, columns=LABEL)
df_id_columnArray = pd.DataFrame(id_columnArray, columns=ID)

df_predicted_true = pd.concat([df_predicted_y,df_id_columnArray,df_true_y], axis=1)


In [12]:
cols_excl2 = ["y","id"]
cols_orig2 = [c for c in df_predicted_true.columns if c not in cols_excl2]

df_predicted_true.tail()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,93,94,95,96,97,98,99,100,id,y
975,-6.0214,-3.840708,-3.536518,-3.335321,-3.714761,-3.848545,-2.687434,-3.681495,-3.477635,-3.503855,...,-3.814636,-3.475733,-4.224454,-3.460755,-4.086445,-3.319243,-3.205924,-4.078343,976,-3.666263
976,-3.300596,-3.448747,-3.38578,-3.117596,-3.394598,-3.508353,-3.417411,-3.473523,-2.616889,-3.48086,...,-3.452243,-3.484011,-3.357871,-3.23227,-3.082731,-2.985777,-3.214181,-3.558757,977,-3.906247
977,-1.835551,-1.716611,-1.585653,-2.067531,-1.955666,-1.631939,-2.023438,-1.682725,-1.991886,-2.184844,...,-1.692677,-2.81205,-2.197767,-1.502378,-1.826462,-2.369208,-2.731704,-2.016426,978,-1.888213
978,-3.684743,-4.15468,-4.023054,-3.90961,-4.148894,-3.69253,-4.099753,-3.815801,-3.919218,-4.093242,...,-3.91741,-4.316907,-3.973176,-3.732786,-3.885842,-4.290096,-3.82215,-3.908524,979,-4.532433
979,-3.652737,-4.109573,-4.194172,-4.014184,-3.695538,-4.110124,-4.124703,-4.052232,-3.461169,-3.597208,...,-3.335089,-4.173898,-4.150982,-4.144796,-3.406513,-4.125513,-4.152604,-3.368629,980,-4.744636


In [28]:
y_hat_test = list(model_dnn.predict(input_fn=lambda: input_fn(test)))

In [None]:
id_y_dict2 = dict(zip(test["id"], test["y"]))

train_pred2["y"] = train_pred2["id"].replace(id_y_dict2)

In [None]:
true_y = []
predicted_y_training = []


for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    true_y.append(xx.iloc[0,2])
    
for i in range(1,981):
    xx = (train_pred[train_pred["id"] == i])
    predicted_y_training.append(xx.loc[:,"y_hat"])

    

In [None]:
predicted_y_trainingArray = np.asarray(predicted_y_training)
true_yArray = np.asarray(true_y)
id_columnArray = np.asarray(id_column)

df_predicted_y = pd.DataFrame(predicted_y_trainingArray, columns=COLUMNS2)
df_true_y =  pd.DataFrame(true_yArray, columns=LABEL)
df_id_columnArray = pd.DataFrame(id_columnArray, columns=ID)

df_predicted_true = pd.concat([df_predicted_y,df_id_columnArray,df_true_y], axis=1)


In [None]:
cols_excl2 = ["y","id"]
cols_orig2 = [c for c in df_predicted_true.columns if c not in cols_excl2]

df_predicted_true.tail()