In [7]:
import pandas as pd

from data import Dataset, get_transfer_dataset, get_random_datasets
from learner import PredictorLearner, TransferLearner
import analysis as a

# Training-set fractions swept by the experiments and the subject systems studied.
TRAINING_SET_SIZES = [0.2, 0.4, 0.6, 0.8]
SUBJECT_SYSTEMS = ['NODEJS', 'POPPLER', 'X264', 'XZ']

# Result tables, one per research question. The column names are generated
# instead of typed out by hand; the nesting order of each comprehension fixes
# the column order of the corresponding (initially empty) DataFrame.

# RQ1: error measure x learner (tgt / trans) x with or without optimisation.
rq1_results_fields = [
    '{}_accuracy_{}_{}'.format(measure, learner, mode)
    for learner in ('tgt', 'trans')
    for mode in ('no_cv', 'cv')
    for measure in ('mse', 'mape')
]
rq1_results = pd.DataFrame(columns=rq1_results_fields)

# RQ2: error measure x learner (pred / trans) x training-set percentage.
rq2_results_fields = [
    '{}_accuracy_{}_{}pct'.format(measure, learner, pct)
    for measure in ('mape', 'mse')
    for learner in ('pred', 'trans')
    for pct in (20, 40, 60, 80)
]
rq2_results = pd.DataFrame(columns=rq2_results_fields)

# RQ3: training time per learner x with or without optimisation x percentage.
rq3_results_fields = [
    'training_time_{}_{}pct_{}'.format(learner, pct, mode)
    for learner in ('pred', 'trans')
    for mode in ('no_cv', 'cv')
    for pct in (20, 40, 60, 80)
]
rq3_results = pd.DataFrame(columns=rq3_results_fields)

# Randomly select a source and target dataset for each subject system.
# get_random_datasets() yields one entry per system, in SUBJECT_SYSTEMS order.
nodejs_datasets, poppler_datasets, x264_datasets, xz_datasets = get_random_datasets()
datasets = dict(zip(
    SUBJECT_SYSTEMS,
    (nodejs_datasets, poppler_datasets, x264_datasets, xz_datasets),
))

In [2]:
### RQ1
# Compare the error (MAPE and MSE) of the predictor learner and the transfer
# learner, each fitted once with a model obtained from hyperparameter
# optimisation ("cv") and once without it ("no_cv"). One value per column is
# written into `rq1_results`.
#
# NOTE(review): `tgt_x264` and `src_x264` are not defined in any visible cell;
# presumably they should be taken from `datasets['X264']` - confirm, otherwise
# this cell fails on a fresh kernel (Restart & Run All).
# NOTE(review): every error below is measured on the same (X_train, y_train)
# the model was fitted on, while (X_validate, y_validate) is only used for the
# hyperparameter search - confirm this is the intended evaluation protocol.

mape_accuracy_pred_no_cv = []
mape_accuracy_trans_no_cv = []
mape_accuracy_pred_cv = []
mape_accuracy_trans_cv = []
mse_accuracy_pred_no_cv = []
mse_accuracy_trans_no_cv = []
mse_accuracy_pred_cv = []
mse_accuracy_trans_cv = []

predictor = PredictorLearner()
transferrer = TransferLearner()

# --- predictor learner (target data only) ---------------------------------

# get optimised predictor model using hyperparameter optimisation
X_train, X_validate, y_train, y_validate = tgt_x264.get_split_dataset()
optimised_model = predictor.get_optimal_params(X_validate, y_validate)

# get accuracy of optimised model for predictor learner
predictor.fit(X_train, y_train, premade_model=optimised_model)
mape_accuracy_pred_cv.append(predictor.get_error(X_train, y_train, measure='mape'))
mse_accuracy_pred_cv.append(predictor.get_error(X_train, y_train, measure='mse'))

# get accuracy of non-optimised model for predictor learner
predictor.fit(X_train, y_train)
mape_accuracy_pred_no_cv.append(predictor.get_error(X_train, y_train, measure='mape'))
mse_accuracy_pred_no_cv.append(predictor.get_error(X_train, y_train, measure='mse'))


# --- transfer learner (source + target data) ------------------------------

# get optimised transfer model using hyperparameter optimisation
X_train, X_validate, y_train, y_validate = get_transfer_dataset(src_x264, tgt_x264)
optimised_model = transferrer.get_optimal_params(X_validate, y_validate)

# get accuracy of optimised model for transfer learner
# (both measures must come from `transferrer`; the MSE was previously read
# from `predictor`, which reported the wrong model's error)
transferrer.fit(X_train, y_train, premade_model=optimised_model)
mape_accuracy_trans_cv.append(transferrer.get_error(X_train, y_train, measure='mape'))
mse_accuracy_trans_cv.append(transferrer.get_error(X_train, y_train, measure='mse'))

# get accuracy of non-optimised model for transfer learner
transferrer.fit(X_train, y_train)
mape_accuracy_trans_no_cv.append(transferrer.get_error(X_train, y_train, measure='mape'))
mse_accuracy_trans_no_cv.append(transferrer.get_error(X_train, y_train, measure='mse'))


# store the collected values in the RQ1 results table
# (the "tgt" columns hold the predictor learner's results)
rq1_results['mse_accuracy_tgt_no_cv'] = mse_accuracy_pred_no_cv
rq1_results['mape_accuracy_tgt_no_cv'] = mape_accuracy_pred_no_cv
rq1_results['mse_accuracy_tgt_cv'] = mse_accuracy_pred_cv
rq1_results['mape_accuracy_tgt_cv'] = mape_accuracy_pred_cv
rq1_results['mse_accuracy_trans_no_cv'] = mse_accuracy_trans_no_cv
rq1_results['mape_accuracy_trans_no_cv'] = mape_accuracy_trans_no_cv
rq1_results['mse_accuracy_trans_cv'] = mse_accuracy_trans_cv
rq1_results['mape_accuracy_trans_cv'] = mape_accuracy_trans_cv

In [3]:
### RQ2
# Measure the error (MAPE and MSE) of the predictor learner and the transfer
# learner for every training-set size in TRAINING_SET_SIZES, using models
# obtained from hyperparameter optimisation. Results are written into
# `rq2_results`, one column per (measure, learner, size) combination.
#
# NOTE(review): `tgt_x264` and `src_x264` are not defined in any visible cell;
# presumably they should be taken from `datasets['X264']` - confirm.

mape_accuracy_pred_20pct = []
mape_accuracy_pred_40pct = []
mape_accuracy_pred_60pct = []
mape_accuracy_pred_80pct = []
mape_accuracy_trans_20pct = []
mape_accuracy_trans_40pct = []
mape_accuracy_trans_60pct = []
mape_accuracy_trans_80pct = []
mse_accuracy_pred_20pct = []
mse_accuracy_pred_40pct = []
mse_accuracy_pred_60pct = []
mse_accuracy_pred_80pct = []
mse_accuracy_trans_20pct = []
mse_accuracy_trans_40pct = []
mse_accuracy_trans_60pct = []
mse_accuracy_trans_80pct = []

transferrer = TransferLearner()
predictor = PredictorLearner()

for train_size in TRAINING_SET_SIZES:

    # NOTE(review): unlike get_transfer_dataset below, this split is not given
    # `train_size`, so the predictor results may not vary with the training-set
    # size - confirm whether get_split_dataset accepts such a parameter.
    X_train, X_validate, y_train, y_validate = tgt_x264.get_split_dataset()
    optimised_model = predictor.get_optimal_params(X_validate, y_validate)

    # get accuracy of predictor model for current training set size
    predictor.fit(X_train, y_train, premade_model=optimised_model)
    mape_accuracy_pred = predictor.get_error(X_train, y_train, measure='mape')
    mse_accuracy_pred = predictor.get_error(X_train, y_train, measure='mse')


    X_train, X_validate, y_train, y_validate = get_transfer_dataset(src_x264,
                                                                    tgt_x264,
                                                                    train_size=train_size)
    optimised_model = transferrer.get_optimal_params(X_validate, y_validate)

    # get accuracy of transfer model for current training set size
    transferrer.fit(X_train, y_train, premade_model=optimised_model)
    mape_accuracy_trans = transferrer.get_error(X_train, y_train, measure='mape')
    mse_accuracy_trans = transferrer.get_error(X_train, y_train, measure='mse')


    # record accuracy in appropriate results column
    # (exact float comparison is safe here: train_size is one of the literals
    # listed in TRAINING_SET_SIZES, not a computed value)
    if train_size == 0.2:
        mape_accuracy_pred_20pct.append(mape_accuracy_pred)
        mape_accuracy_trans_20pct.append(mape_accuracy_trans)
        mse_accuracy_pred_20pct.append(mse_accuracy_pred)
        mse_accuracy_trans_20pct.append(mse_accuracy_trans)
    elif train_size == 0.4:
        mape_accuracy_pred_40pct.append(mape_accuracy_pred)
        mape_accuracy_trans_40pct.append(mape_accuracy_trans)
        mse_accuracy_pred_40pct.append(mse_accuracy_pred)
        mse_accuracy_trans_40pct.append(mse_accuracy_trans)
    elif train_size == 0.6:
        mape_accuracy_pred_60pct.append(mape_accuracy_pred)
        mape_accuracy_trans_60pct.append(mape_accuracy_trans)
        mse_accuracy_pred_60pct.append(mse_accuracy_pred)
        mse_accuracy_trans_60pct.append(mse_accuracy_trans)
    elif train_size == 0.8:
        mape_accuracy_pred_80pct.append(mape_accuracy_pred)
        mape_accuracy_trans_80pct.append(mape_accuracy_trans)
        mse_accuracy_pred_80pct.append(mse_accuracy_pred)
        mse_accuracy_trans_80pct.append(mse_accuracy_trans)


# store the collected values in the RQ2 results table
rq2_results['mape_accuracy_pred_20pct'] = mape_accuracy_pred_20pct
rq2_results['mape_accuracy_pred_40pct'] = mape_accuracy_pred_40pct
rq2_results['mape_accuracy_pred_60pct'] = mape_accuracy_pred_60pct
rq2_results['mape_accuracy_pred_80pct'] = mape_accuracy_pred_80pct
rq2_results['mse_accuracy_pred_20pct'] = mse_accuracy_pred_20pct
rq2_results['mse_accuracy_pred_40pct'] = mse_accuracy_pred_40pct
rq2_results['mse_accuracy_pred_60pct'] = mse_accuracy_pred_60pct
rq2_results['mse_accuracy_pred_80pct'] = mse_accuracy_pred_80pct
rq2_results['mape_accuracy_trans_20pct'] = mape_accuracy_trans_20pct
rq2_results['mape_accuracy_trans_40pct'] = mape_accuracy_trans_40pct
rq2_results['mape_accuracy_trans_60pct'] = mape_accuracy_trans_60pct
rq2_results['mape_accuracy_trans_80pct'] = mape_accuracy_trans_80pct
rq2_results['mse_accuracy_trans_20pct'] = mse_accuracy_trans_20pct
rq2_results['mse_accuracy_trans_40pct'] = mse_accuracy_trans_40pct
# the 60% column previously received the 40% list by mistake
rq2_results['mse_accuracy_trans_60pct'] = mse_accuracy_trans_60pct
rq2_results['mse_accuracy_trans_80pct'] = mse_accuracy_trans_80pct

In [4]:
### RQ3
# Record the training time of the predictor learner and the transfer learner
# for every training-set size in TRAINING_SET_SIZES, both excluding ("no_cv")
# and including ("cv") the hyperparameter-optimisation time. Results are
# written into `rq3_results`, one column per (learner, size, mode) combination.
#
# NOTE(review): `tgt_x264` and `src_x264` are not defined in any visible cell;
# presumably they should be taken from `datasets['X264']` - confirm.

training_time_pred_20pct_no_cv = []
training_time_pred_40pct_no_cv = []
training_time_pred_60pct_no_cv = []
training_time_pred_80pct_no_cv = []
training_time_pred_20pct_cv = []
training_time_pred_40pct_cv = []
training_time_pred_60pct_cv = []
training_time_pred_80pct_cv = []
training_time_trans_20pct_no_cv = []
training_time_trans_40pct_no_cv = []
training_time_trans_60pct_no_cv = []
training_time_trans_80pct_no_cv = []
training_time_trans_20pct_cv = []
training_time_trans_40pct_cv = []
training_time_trans_60pct_cv = []
training_time_trans_80pct_cv = []

transferrer = TransferLearner()
predictor = PredictorLearner()

for train_size in TRAINING_SET_SIZES:

    # NOTE(review): this split is not given `train_size`, so the predictor
    # timings may not vary with the training-set size - confirm whether
    # get_split_dataset accepts such a parameter.
    X_train, X_validate, y_train, y_validate = tgt_x264.get_split_dataset()
    # get optimised predictor model using hyperparameter optimisation
    optimised_model = predictor.get_optimal_params(X_validate, y_validate)
    predictor.fit(X_train, y_train, premade_model=optimised_model)

    # gather results
    training_time_pred_no_cv = predictor.get_training_time()
    training_time_pred_cv = predictor.get_training_time(include_optimisation_time=True)


    # build the transfer dataset for the current training-set size; without
    # `train_size` every per-size column would time the same default split
    X_train, X_validate, y_train, y_validate = get_transfer_dataset(src_x264,
                                                                    tgt_x264,
                                                                    train_size=train_size)
    # get optimised transfer model using hyperparameter optimisation
    optimised_model = transferrer.get_optimal_params(X_validate, y_validate)
    transferrer.fit(X_train, y_train, premade_model=optimised_model)

    # gather results
    training_time_trans_no_cv = transferrer.get_training_time()
    training_time_trans_cv = transferrer.get_training_time(include_optimisation_time=True)


    # record training time in appropriate results column
    # (exact float comparison is safe here: train_size is one of the literals
    # listed in TRAINING_SET_SIZES, not a computed value)
    if train_size == 0.2:
        training_time_pred_20pct_no_cv.append(training_time_pred_no_cv)
        training_time_pred_20pct_cv.append(training_time_pred_cv)
        training_time_trans_20pct_no_cv.append(training_time_trans_no_cv)
        training_time_trans_20pct_cv.append(training_time_trans_cv)
    elif train_size == 0.4:
        training_time_pred_40pct_no_cv.append(training_time_pred_no_cv)
        training_time_pred_40pct_cv.append(training_time_pred_cv)
        training_time_trans_40pct_no_cv.append(training_time_trans_no_cv)
        training_time_trans_40pct_cv.append(training_time_trans_cv)
    elif train_size == 0.6:
        training_time_pred_60pct_no_cv.append(training_time_pred_no_cv)
        training_time_pred_60pct_cv.append(training_time_pred_cv)
        training_time_trans_60pct_no_cv.append(training_time_trans_no_cv)
        training_time_trans_60pct_cv.append(training_time_trans_cv)
    elif train_size == 0.8:
        training_time_pred_80pct_no_cv.append(training_time_pred_no_cv)
        training_time_pred_80pct_cv.append(training_time_pred_cv)
        training_time_trans_80pct_no_cv.append(training_time_trans_no_cv)
        training_time_trans_80pct_cv.append(training_time_trans_cv)


# store the collected values in the RQ3 results table
rq3_results['training_time_pred_20pct_no_cv'] = training_time_pred_20pct_no_cv
rq3_results['training_time_pred_40pct_no_cv'] = training_time_pred_40pct_no_cv
rq3_results['training_time_pred_60pct_no_cv'] = training_time_pred_60pct_no_cv
rq3_results['training_time_pred_80pct_no_cv'] = training_time_pred_80pct_no_cv
rq3_results['training_time_pred_20pct_cv'] = training_time_pred_20pct_cv
rq3_results['training_time_pred_40pct_cv'] = training_time_pred_40pct_cv
rq3_results['training_time_pred_60pct_cv'] = training_time_pred_60pct_cv
rq3_results['training_time_pred_80pct_cv'] = training_time_pred_80pct_cv
rq3_results['training_time_trans_20pct_no_cv'] = training_time_trans_20pct_no_cv
rq3_results['training_time_trans_40pct_no_cv'] = training_time_trans_40pct_no_cv
rq3_results['training_time_trans_60pct_no_cv'] = training_time_trans_60pct_no_cv
rq3_results['training_time_trans_80pct_no_cv'] = training_time_trans_80pct_no_cv
rq3_results['training_time_trans_20pct_cv'] = training_time_trans_20pct_cv
rq3_results['training_time_trans_40pct_cv'] = training_time_trans_40pct_cv
rq3_results['training_time_trans_60pct_cv'] = training_time_trans_60pct_cv
rq3_results['training_time_trans_80pct_cv'] = training_time_trans_80pct_cv