# Import statements at the top in a separate cell

In [1]:
from modules.data_preparator import DataPreparator
from modules.scale_analyzer import ScaleAnalyzer
from modules.model_pipeline import MLModelsPipeline

# Set up the data parameters

In [2]:
# Survey item codes used as the independent variables (predictors), grouped
# here by item prefix. Order and spelling are significant: the lowercase 'c4m'
# matches the column name as it appears in the dataset.
features = [
    # A-items
    'A1a', 'A1b',
    'A2a', 'A2b', 'A2c', 'A2d', 'A2e', 'A2f', 'A2g', 'A2h',
    'A3a', 'A3b', 'A3c', 'A3d', 'A3e', 'A3f', 'A3g', 'A3h', 'A3i', 'A3j', 'A3k', 'A3l', 'A3m',
    'A4', 'A4ai', 'A4aii', 'A4aiii', 'A4aiv', 'A4av', 'A4avi', 'A4avii',
    # B-items
    'B1a', 'B1b',
    'B2a', 'B2b', 'B2c', 'B2d',
    'B3a', 'B3b', 'B3c', 'B3d',
    'B4a', 'B4b', 'B4c', 'B4d',
    'B5a', 'B5b', 'B5c', 'B5d',
    'B6',
    'B7a', 'B7b', 'B7c', 'B7d', 'B7e',
    'B8', 'B8ai', 'B8aii', 'B8aiii', 'B8aiv', 'B8av', 'B8avi', 'B8avii',
    'B9',
    # C-items
    'C1a', 'C1b', 'C1c', 'C1d', 'C1e', 'C1f', 'C1g',
    'C2', 'C3',
    'C4a', 'C4b', 'C4c', 'C4d', 'C4e', 'C4f', 'C4g', 'C4h', 'C4i',
    'C4j', 'C4k', 'C4l', 'c4m', 'C4n', 'C4o', 'C4ii',
    'C5',
    'C6', 'C6ai', 'C6aii', 'C6aiii', 'C6aiv', 'C6av', 'C6avi', 'C6avii',
    'C6aviii', 'C6aix', 'C6ax', 'C6axi',
    'C7', 'C8', 'C9',
    # D-items
    'D3', 'D8', 'D14',
]

# Name of the dependent (outcome) variable column.
dep_var = 'AgencySize'

# Set up the DataPreparator class

First, we instantiate the class. In this example, I will use the option of passing the path to the data file.

In [3]:
# Build the DataPreparator directly from the path to the survey CSV.
data_prep = DataPreparator(
    data="data/satisfaction/survey (1).csv",
    features=features,
    dep_var=dep_var,
    max_miss=100,  # cutoff used to drop variables from the dataset (see the message printed below)
)

The following variables contain fewer than the max_miss cutoff of [100] and were dropped from the dataset: {'B8avii', 'B8ai', 'B8aii', 'C6ax', 'C4ii', 'C6aii', 'C6axi', 'B7c', 'B7a', 'B8aiv', 'B8aiii', 'A4aiv', 'C6aiii', 'C6ai', 'B8avi', 'B7e', 'A4avii', 'A4av', 'A4aiii', 'A4aii', 'B7b', 'C6av', 'C6aix', 'A4', 'A4avi', 'C6aviii', 'C6aiv', 'A4ai', 'C6avi', 'B8av', 'C6avii', 'B7d', 'C9'}


Now, we make the method call to split the data, specifying the parameters for the split

In [4]:
# Train/test split only (no validation set), with 30% of the rows requested for
# the test set. random_state is fixed -- presumably so the split is reproducible.
data_prep.split_data(
    val_set=False,
    test_size=0.30,
    random_state=456,
)

We can access the training and testing data through the DataPreparator class if we need to. For the other classes we will use, we can just pass the instantiated DataPreparator (data_prep) and the rest is automatic!

In [5]:
# Show the prepared dataset held by the DataPreparator (feature columns plus the dependent variable).
data_prep.data

Unnamed: 0,A1a,A1b,A2a,A2b,A2c,A2d,A2e,A2f,A2g,A2h,...,C4n,C4o,C5,C6,C7,C8,D3,D8,D14,AgencySize
0,2.0,5.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,...,1.0,1.0,1.0,2.0,1.0,2.0,2,4.0,2.0,3
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.0,1.0,2,3.0,2.0,3
2,6.0,6.0,3.0,3.0,3.0,4.0,2.0,4.0,4.0,4.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1,3.0,2.0,3
3,7.0,7.0,5.0,7.0,5.0,7.0,1.0,5.0,7.0,6.0,...,1.0,2.0,1.0,2.0,1.0,2.0,1,2.0,2.0,3
4,1.0,3.0,2.0,1.0,2.0,5.0,2.0,3.0,2.0,1.0,...,1.0,1.0,1.0,2.0,1.0,2.0,2,3.0,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2896,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,...,1.0,1.0,1.0,2.0,1.0,3.0,2,2.0,2.0,2
2897,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.0,1.0,2,4.0,1.0,2
2898,3.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2,4.0,2.0,2
2899,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,...,1.0,1.0,1.0,2.0,1.0,2.0,2,3.0,2.0,2


In [6]:
# Show the feature columns of the training split.
data_prep.x_train

Unnamed: 0,D3,B5a,C4h,c4m,A2h,B2c,C1d,C4e,C4d,A1a,...,B4b,A3l,A3a,C4i,C8,C4g,D14,B3c,B9,A2a
1422,2,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,3.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0
2202,1,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,...,3.0,1.0,3.0,1.0,2.0,1.0,2.0,2.0,5.0,5.0
2249,2,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,2.0,3.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0
738,1,2.0,5.0,5.0,1.0,1.0,1.0,5.0,1.0,1.0,...,2.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,5.0,1.0
2629,2,2.0,1.0,2.0,2.0,3.0,4.0,1.0,1.0,3.0,...,4.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,2.0,1.0,4.0,1.0,2.0,1.0,2.0,1.0,4.0,6.0
75,2,8.0,1.0,2.0,7.0,5.0,6.0,1.0,1.0,6.0,...,2.0,7.0,7.0,2.0,2.0,1.0,2.0,3.0,1.0,5.0
925,2,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,2.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0
87,1,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,3.0,...,4.0,4.0,6.0,1.0,2.0,1.0,2.0,4.0,3.0,2.0


In [7]:
# Show the feature columns of the test split.
data_prep.x_test

Unnamed: 0,D3,B5a,C4h,c4m,A2h,B2c,C1d,C4e,C4d,A1a,...,B4b,A3l,A3a,C4i,C8,C4g,D14,B3c,B9,A2a
1973,1,2.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,...,2.0,1.0,2.0,1.0,3.0,1.0,2.0,3.0,3.0,1.0
179,1,3.0,1.0,2.0,1.0,6.0,4.0,2.0,1.0,2.0,...,4.0,2.0,3.0,1.0,3.0,1.0,2.0,2.0,3.0,2.0
2309,1,2.0,1.0,1.0,7.0,7.0,7.0,1.0,1.0,3.0,...,7.0,7.0,6.0,1.0,1.0,1.0,2.0,3.0,6.0,3.0
1218,2,2.0,1.0,1.0,2.0,1.0,2.0,3.0,2.0,3.0,...,2.0,3.0,5.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0
591,2,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712,1,2.0,1.0,2.0,4.0,1.0,1.0,1.0,2.0,3.0,...,4.0,4.0,5.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0
1532,2,2.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0
2615,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
1024,2,3.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,2.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0


In [8]:
# Show the dependent-variable values (dep_var) for the training split.
data_prep.y_train

1422    2
2202    3
2249    3
738     3
2629    2
       ..
1598    2
75      3
925     3
87      3
2457    2
Name: AgencySize, Length: 1421, dtype: int64

In [9]:
# Show the dependent-variable values (dep_var) for the test split.
data_prep.y_test

1973    3
179     3
2309    3
1218    3
591     3
       ..
712     3
1532    2
2615    2
1024    3
1720    2
Name: AgencySize, Length: 871, dtype: int64

# Exploratory Factor Analysis

The EFA is automatically performed on the training dataset. If you set n_factors to 'estimate', the process will extract as many factors as there are items and return the eigenvalues.

In [10]:
# First EFA pass: n_factors='estimate' is used to decide how many factors to keep.
# NOTE: in this mode the value displayed is an array of eigenvalues rather than
# a loadings table, despite the variable name.
sa = ScaleAnalyzer(data_preparator=data_prep)
efa_factor_loadings = sa.efa(
    rotation='oblimin',
    estimation='ml',
    n_factors='estimate',
)
efa_factor_loadings



array([22.99672096,  4.48378134,  3.42187001,  2.4944326 ,  2.08832157,
        1.90837882,  1.68759135,  1.4222784 ,  1.40202798,  1.34968674,
        1.26814922,  1.20957741,  1.10804609,  1.02610582,  0.98826576,
        0.94685702,  0.92386562,  0.89724608,  0.84929106,  0.83312517,
        0.79580192,  0.77492917,  0.76513262,  0.73130439,  0.70502681,
        0.69444574,  0.6740148 ,  0.65223305,  0.63102516,  0.60975828,
        0.57865778,  0.55917524,  0.55414351,  0.53854878,  0.5200498 ,
        0.51231506,  0.49508844,  0.48325153,  0.46486319,  0.46172752,
        0.44813209,  0.43422225,  0.42782786,  0.41237472,  0.40405219,
        0.40066601,  0.38368541,  0.37470819,  0.36734001,  0.36021531,
        0.35113819,  0.33227995,  0.32442684,  0.32373555,  0.30252876,
        0.27444178,  0.27178322,  0.2683454 ,  0.25653354,  0.24824433,
        0.23598002,  0.22903379,  0.21781735,  0.21561457,  0.19514438,
        0.19160273,  0.18154254,  0.16951237,  0.16360004,  0.15

Based on these results, I have decided to keep 5 factors (those with eigenvalues over 2). This choice is fairly unimportant in the grand scheme of things: there are statistical methods to help determine the number of factors, and you can also use the 'old reliable' scree plot. That isn't important for this tutorial, so we will be moving on, but maybe you want to add that in for yourself later!

Running the EFA again, but this time n_factors is set to 5.

In [11]:
# Second EFA pass with the number of factors fixed at 5. A fresh ScaleAnalyzer
# is created, and the displayed result is the item-by-factor loadings table.
sa = ScaleAnalyzer(data_preparator=data_prep)
efa_factor_loadings = sa.efa(
    rotation='oblimin',
    estimation='ml',
    n_factors=5,
)
efa_factor_loadings



Unnamed: 0,factor_1,factor_2,factor_3,factor_4,factor_5,max_load
D3,0.107094,0.023843,-0.055466,-0.020941,-0.071404,factor_1
B5a,0.404977,0.014853,-0.001257,0.057079,0.412273,factor_5
C4h,-0.020745,0.589163,-0.029585,-0.024532,-0.022806,factor_2
c4m,0.050330,0.538839,0.025215,0.091166,0.051743,factor_2
A2h,0.677056,0.003623,-0.073291,0.090305,0.008689,factor_1
...,...,...,...,...,...,...
C4g,-0.040795,0.536299,-0.063713,0.066333,-0.088829,factor_2
D14,0.048461,-0.102761,-0.006592,0.010604,0.209391,factor_5
B3c,0.137977,-0.057661,0.034876,0.510859,0.084612,factor_4
B9,-0.015084,-0.016199,0.355209,0.017013,0.151099,factor_3


At first glance, we can already see some items that have poor loadings across factors (e.g. D8). Again, this is not important for the tutorial so we will move on, but maybe it's possible to add a process that would automatically remove any items whose loadings are all below a set threshold. If you decide to do this, don't forget that you will need to update the scale_dict to reflect the dropped items for downstream procedures!

# Confirmatory Factor Analysis (but not my favorite package)

We can use the scale_dict that was created by our EFA. 

In [12]:
# Run the CFA using the item-to-factor mapping (scale_dict) produced by the EFA step.
cfa_output = sa.cfa(scale_dict=sa.scale_dict)
cfa_output



Unnamed: 0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_error_1,factor_error_2,factor_error_3,factor_error_4,factor_error_5
D3,0.556973,0.0,0.0,0.0,0.000000,0.031914,0.0,0.0,0.0,0.000000
A2h,0.892982,0.0,0.0,0.0,0.000000,0.036571,0.0,0.0,0.0,0.000000
A1a,0.938822,0.0,0.0,0.0,0.000000,0.036500,0.0,0.0,0.0,0.000000
A1b,1.108141,0.0,0.0,0.0,0.000000,0.038757,0.0,0.0,0.0,0.000000
A3i,1.013887,0.0,0.0,0.0,0.000000,0.038527,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
C2,0.000000,0.0,0.0,0.0,0.902095,0.000000,0.0,0.0,0.0,0.031145
C1f,0.000000,0.0,0.0,0.0,1.083919,0.000000,0.0,0.0,0.0,0.041530
C5,0.000000,0.0,0.0,0.0,0.913576,0.000000,0.0,0.0,0.0,0.031358
C8,0.000000,0.0,0.0,0.0,0.924701,0.000000,0.0,0.0,0.0,0.034045


Yikes, a warning that we didn't converge! OK, I admit that we should probably pare down the items based on the EFA results, but I'm still going to ignore it and leave the audience with something to think about. Don't worry though, you can create your own scale dictionary based on your observations and pass that to the methods instead, so it's still flexible!

# Polytomous IRT using the Graded Unfolding Model (GUM)

The warning that appears is raised because the items have different response-scale ranges (e.g., 1-7): items with a shorter range trigger it. Wherever you see a NaN value for a difficulty or tau parameter, it is because the range for that item did not extend to that value.

In [13]:
# Fit the IRT model using the EFA-derived scale_dict. Two results come back:
# the item statistics table (displayed here) and the ability scores.
irt_stats_res, abilities = sa.irt_gum(scale_dict=sa.scale_dict)
irt_stats_res

  fx = wrapped_fun(x)
  fx = wrapped_fun(x)
  fx = wrapped_fun(x)
  fx = wrapped_fun(x)


Unnamed: 0,Unnamed: 1,discrimination,difficulty_1,difficulty_2,difficulty_3,difficulty_4,difficulty_5,difficulty_6,difficulty_7,difficulty_8,difficulty_9,...,difficulty_13,difficulty_14,difficulty_15,tau1,tau2,tau3,tau4,tau5,tau6,tau7
factor_1,D3,0.300606,-0.575025,,,,,,,2.000000,,...,,,4.575025,-2.575025,,,,,,
factor_1,A2h,1.487029,-0.976915,-0.394861,-0.718178,0.495240,-0.255433,-0.294100,2.355977,-0.071677,-2.499332,...,0.574823,0.251506,0.833560,-0.905237,-0.323184,-0.646501,0.566917,-0.183755,-0.222422,2.427654
factor_1,A1a,1.907712,-1.651452,-0.543131,-0.145724,-0.717366,-0.388435,-0.208602,,-0.214219,,...,-0.282714,0.114693,1.223014,-1.437233,-0.328912,0.068495,-0.503146,-0.174216,0.005617,
factor_1,A1b,2.939091,-1.551974,-0.735983,-0.430125,-0.714569,-0.342603,-0.199019,,-0.034438,,...,0.361249,0.667107,1.483098,-1.517536,-0.701545,-0.395687,-0.680131,-0.308165,-0.164581,
factor_1,A3i,1.758896,-1.571671,-0.921100,-0.736454,-0.097149,-0.407235,-0.453635,1.969070,-0.061897,-2.092865,...,0.612660,0.797306,1.447877,-1.509774,-0.859203,-0.674557,-0.035252,-0.345338,-0.391738,2.030967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
factor_5,C2,2.636160,0.235232,,,,,,,-0.246351,,...,,,-0.727934,0.481583,,,,,,
factor_5,C1f,1.025940,0.016566,0.570682,0.093418,0.451338,0.449216,-0.363507,0.465765,0.237192,0.008620,...,0.380967,-0.096297,0.457819,-0.220626,0.333490,-0.143775,0.214145,0.212023,-0.600699,0.228573
factor_5,C5,3.518409,-0.115249,,,,,,,-0.167172,,...,,,-0.219096,0.051924,,,,,,
factor_5,C8,1.756845,-1.549832,-0.647122,,,,,,-0.609356,,...,,-0.571591,0.331119,-0.940476,-0.037765,,,,,


Notice that this is a multi-index DataFrame, where the factor is the first index level and the items in that factor are the second level. We can access Factor 1 as such...

In [14]:
# Select only the rows belonging to 'factor_1' (first level of the row MultiIndex).
irt_stats_res.loc['factor_1', :, :]

Unnamed: 0,discrimination,difficulty_1,difficulty_2,difficulty_3,difficulty_4,difficulty_5,difficulty_6,difficulty_7,difficulty_8,difficulty_9,...,difficulty_13,difficulty_14,difficulty_15,tau1,tau2,tau3,tau4,tau5,tau6,tau7
D3,0.300606,-0.575025,,,,,,,2.0,,...,,,4.575025,-2.575025,,,,,,
A2h,1.487029,-0.976915,-0.394861,-0.718178,0.49524,-0.255433,-0.2941,2.355977,-0.071677,-2.499332,...,0.574823,0.251506,0.83356,-0.905237,-0.323184,-0.646501,0.566917,-0.183755,-0.222422,2.427654
A1a,1.907712,-1.651452,-0.543131,-0.145724,-0.717366,-0.388435,-0.208602,,-0.214219,,...,-0.282714,0.114693,1.223014,-1.437233,-0.328912,0.068495,-0.503146,-0.174216,0.005617,
A1b,2.939091,-1.551974,-0.735983,-0.430125,-0.714569,-0.342603,-0.199019,,-0.034438,,...,0.361249,0.667107,1.483098,-1.517536,-0.701545,-0.395687,-0.680131,-0.308165,-0.164581,
A3i,1.758896,-1.571671,-0.9211,-0.736454,-0.097149,-0.407235,-0.453635,1.96907,-0.061897,-2.092865,...,0.61266,0.797306,1.447877,-1.509774,-0.859203,-0.674557,-0.035252,-0.345338,-0.391738,2.030967
A2g,1.369149,-2.030229,-1.294762,-0.917067,-0.86409,-0.779809,-1.034973,1.485722,-0.195359,-1.87644,...,0.526349,0.904043,1.63951,-1.834869,-1.099402,-0.721708,-0.668731,-0.58445,-0.839614,1.681081
A3f,1.395147,-1.426062,-0.644127,-0.203849,-0.523874,-0.161424,-0.432802,2.300908,-0.017194,-2.335296,...,0.169461,0.609738,1.391674,-1.408868,-0.626933,-0.186655,-0.50668,-0.14423,-0.415607,2.318102
C1b,1.267983,-1.166104,-0.368211,-0.622981,-0.220948,-0.236743,-0.717465,0.775097,-0.068136,-0.91137,...,0.486708,0.231938,1.029831,-1.097967,-0.300074,-0.554845,-0.152811,-0.168606,-0.649329,0.843233
A3e,1.186409,-2.149613,-0.976968,-1.031574,-0.55908,-0.479248,-0.784576,0.904036,-0.087262,-1.078561,...,0.85705,0.802444,1.975089,-2.062351,-0.889706,-0.944312,-0.471817,-0.391986,-0.697314,0.991299
B1a,1.48466,-1.822982,-0.87225,-0.199343,-1.066716,-0.534201,-0.619418,1.886012,-0.126837,-2.139686,...,-0.054331,0.618576,1.569308,-1.696145,-0.745413,-0.072506,-0.939879,-0.407364,-0.492581,2.012849


### Ability Scores

In [15]:
# Show the ability scores returned by the IRT step (one column per factor).
abilities

Unnamed: 0,factor_1_ability,factor_2_ability,factor_3_ability,factor_4_ability,factor_5_ability
0,0.678667,-0.309518,-0.012440,0.090528,-0.747208
1,2.645327,0.339038,-0.162486,0.090528,-1.056107
2,0.005664,0.008712,-0.369679,0.264533,0.407378
3,-0.261269,0.022423,-0.451135,-0.084308,0.132708
4,0.058929,-0.101978,0.518860,-0.056026,-0.465660
...,...,...,...,...,...
2896,-0.366993,-0.101978,-0.162486,0.090528,-0.653589
2897,1.769238,-0.101978,0.221810,0.113879,1.508761
2898,0.720803,0.162223,0.653650,-0.980381,0.897436
2899,0.693558,0.307141,-0.052466,0.090528,0.202527


# Create Scale Means from EFA output

In [16]:
# Compute the per-factor scale scores from the feature columns only, then merge
# the dependent variable back in on the row index.
scales_data = (
    sa.create_scales(
        scale_dict=sa.scale_dict,
        dataset=data_prep.data.drop(columns=[dep_var]),
    )
    .merge(data_prep.data[[dep_var]], left_index=True, right_index=True)
)
scales_data

Unnamed: 0,factor_1,factor_2,factor_3,factor_4,factor_5,AgencySize
0,3.00000,1.066667,1.500,1.166667,1.642857,3
1,1.09375,1.266667,1.000,1.166667,1.214286,3
2,4.75000,1.866667,1.375,2.500000,2.642857,3
3,5.25000,1.800000,1.750,2.833333,2.714286,3
4,2.25000,1.000000,2.750,1.833333,1.428571,3
...,...,...,...,...,...,...
2896,2.18750,1.000000,1.000,1.166667,1.357143,2
2897,1.43750,1.000000,1.125,1.833333,1.142857,2
2898,2.59375,1.266667,1.875,1.166667,1.571429,2
2899,2.59375,1.133333,1.125,1.166667,2.071429,2


Create a new DataPreparator using the scales_data

In [17]:
# Second DataPreparator built on the scale scores. Every column except the last
# is used as a feature (the dependent variable was merged in as the last column).
scales_data_prep = DataPreparator(
    data=scales_data,
    features=scales_data.columns[:-1].tolist(),
    dep_var=dep_var,
)
# Same split parameters as the item-level split.
scales_data_prep.split_data(val_set=False, test_size=0.30, random_state=456)

# Create a Machine Learning Ensemble to predict agency size based on satisfaction survey results

We need to create a dictionary with the models we want and the keyword arguments we want to pass to each. The only models available right now are: linear regression, support vector regressor, random forest regressor, and k-nearest neighbors.

In [18]:
 model_specs = {
        'lr': {},
        'knn': {
           'n_neighbors': 5,
            'leaf_size': 20
        },
        'svr': {
            'kernel': 'rbf'
        },
        'rf': {
            'n_estimators': 50,
            'max_depth': 10
        }
    }

In [20]:
# Build the pipeline on the scale-level data, using the keys of model_specs as
# the set of models.
pipeline = MLModelsPipeline(
    data_preparator=scales_data_prep,
    models=model_specs.keys(),
)

# Train each model with its keyword arguments from model_specs.
pipeline.train_models(model_specs=model_specs)

# Score the models (and the ensemble) with the three listed metrics.
evaluation_results = pipeline.evaluate_performance(
    scorer=[
        'mean_absolute_error',
        'mean_squared_error',
        'r2_score',
    ],
    ensemble='regressor',
)
print(evaluation_results)

Training for model lr is complete!
Training for model knn is complete!
Training for model svr is complete!
Training for model rf is complete!
Evaluation for model lr is complete!
Evaluation for model knn is complete!
Evaluation for model svr is complete!
Evaluation for model rf is complete!
                mean_absolute_error  mean_squared_error  r2_score
lr       train             0.541587            0.380199  0.017457
         test              0.536214            0.361172 -0.002759
knn      train             0.449683            0.310401  0.197835
         test              0.540987            0.430769 -0.195989
svr      train             0.442636            0.421691 -0.089771
         test              0.456709            0.414305 -0.150277
rf       train             0.390603            0.204319  0.471980
         test              0.531802            0.370459 -0.028542
ensemble train             0.455753            0.297244  0.231837
         test              0.515713            0

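The results below are left over from the previous version of the package. You can see that we are getting essentially the same results as before on the test dataset (lr, knn and svr match exactly; rf and the ensemble differ slightly, presumably because the random forest is not seeded), so the new pipeline appears to work!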

In [21]:
# NOTE(review): `ens_test_res` is not assigned in any visible cell of this notebook;
# it appears to be left over from an earlier session/package version, so this cell
# would raise NameError on a fresh kernel (Restart & Run All). TODO confirm and
# either define it or remove the cell.
ens_test_res

Unnamed: 0,MAE,MSE,r2
lr,0.536214,0.361172,-0.002759
knn,0.540987,0.430769,-0.195989
svr,0.456709,0.414305,-0.150277
rf,0.531401,0.368242,-0.022388
ensemble,0.515416,0.362678,-0.006941


So, the models are not good. Probably because I didn't remove some poorly performing items earlier... oops. Maybe you can do a better job than me by making some improvements?