In [2]:
import math
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.regressor import StackingRegressor
from xgboost import XGBRegressor, XGBRFRegressor

In [3]:
def read_data(filename):

    data=pd.read_csv('~/Desktop/Capstone/data/lucene/' + filename)
    data=data.drop(['name', 'version', 'name.1'], axis='columns')

    features=data.drop(['bug'], axis='columns')
    label=data['bug']

    test_features=features.values.reshape(-1,20)
    test_label=label.values.reshape(-1,1)

    return test_features,test_label,filename

In [4]:
def get_train_data(file1,file2):
    df1=pd.read_csv('~/Desktop/Capstone/data/lucene/'+file1)
    df2=pd.read_csv('~/Desktop/Capstone/data/lucene/'+file2)
    merged=df1.append(df2)

    merged=merged.drop(['name','version','name.1'], axis='columns')

    train_features=merged.drop(['bug'], axis='columns')
    train_label=merged['bug']

    train_features=train_features.values.reshape(-1,20)
    train_label=train_label.values.reshape(-1,1)

    return train_features,train_label

In [5]:
def mean_relative_error(actual, predicted):
    rel_err=[]
    actual,predicted=np.array(actual), np.array(predicted)
    for i in range(0,len(actual)):
        err=abs(actual[i]-predicted[i])/(actual[i]+1)
        rel_err.append(err)
    return np.mean(rel_err)

In [6]:
def run_techniques(): 

    test_features,test_label,filename=read_data('lucene-2.4.csv')
    train_features, train_label=get_train_data('lucene-2.2.csv','lucene-2.0.csv')

    stravgmre,stravgmretest=[],[]
    stravgmae,stravgmaetest=[],[]
    stravgrmse,stravgrmsetest=[],[]

    dtravgmre,dtravgmretest=[],[]
    dtravgmae,dtravgmaetest=[],[]
    dtravgrmse,dtravgrmsetest=[],[]

    rfravgmre,rfravgmretest=[],[]
    rfravgmae,rfravgmaetest=[],[]
    rfravgrmse,rfravgrmsetest=[],[]

    etravgmre,etravgmretest=[],[]
    etravgmae,etravgmaetest=[],[]
    etravgrmse,etravgrmsetest=[],[]

    knnavgmre,knnavgmretest=[],[]
    knnavgmae,knnavgmaetest=[],[]
    knnavgrmse,knnavgrmsetest=[],[]

    svravgmre,svravgmretest=[],[]
    svravgmae,svravgmaetest=[],[]
    svravgrmse,svravgrmsetest=[],[]

    bavgmre,bavgmretest=[],[]
    bavgmae,bavgmaetest=[],[]
    bavgrmse,bavgrmsetest=[],[]

    import warnings
    warnings.filterwarnings("ignore")

    for run in range(0,30):
        print('run: ', run+1)

        kf=KFold(n_splits=10,shuffle=True)
        i=1

        strmre,strmretest=[],[]
        strmae,strmaetest=[],[]
        strrmse,strrmsetest=[],[]

        dtrmre,dtrmretest=[],[]
        dtrmae,dtrmaetest=[],[]
        dtrrmse,dtrrmsetest=[],[]

        rfrmre,rfrmretest=[],[]
        rfrmae,rfrmaetest=[],[]
        rfrrmse,rfrrmsetest=[],[]

        etrmre,etrmretest=[],[]
        etrmae,etrmaetest=[],[]
        etrrmse,etrrmsetest=[],[]

        knnmre,knnmretest=[],[]
        knnmae,knnmaetest=[],[]
        knnrmse,knnrmsetest=[],[]

        svrmre,svrmretest=[],[]
        svrmae,svrmaetest=[],[]
        svrrmse,svrrmsetest=[],[]

        bmre,bmretest=[],[]
        bmae,bmaetest=[],[]
        brmse,brmsetest=[],[]

        for train_index,test_index in kf.split(train_features):
            print('fold number: ', i)

            x_train,x_test=train_features[train_index],train_features[test_index]
            y_train,y_test=train_label[train_index],train_label[test_index]
            
            params_tree = {
                'n_estimators':[50,126],
                'min_samples_leaf':[30,50],
                'max_depth':[20,50]
                }
            
            params_d_tree = {
                'max_depth':[20,50]
                }
            
            params_knn = {
                'n_neighbors':[5,10]
                }
            
            params_svr ={
                'C':[5,10],
                'gamma':[0.001,0.1,1]
                }
            
            params_xgb = {
                'max_depth':[10,50],
                'min_child_weight':[1,6]
                }
            
            params = {
                'svr__C':[1,5,10],
                'svr__gamma':[0.001, 0.1, 0.5,1],
                'extratreesregressor__n_estimators':[50,100,126,200],
                'extratreesregressor__min_samples_leaf':[20,30,50,10],
                'kneighborsregressor__n_neighbors':[5,7,10,15],
                'randomforestregressor__max_depth':[10,20,30,50],
                'randomforestregressor__n_estimators':[20,100],
                'meta_regressor__C':[1,5,10],
                'meta_regressor__gamma':[0.001,0.1,0.5,1],
                'meta_regressor__kernel':['rbf','linear']
                }
            
            print('DTR')
            rand_search_dtr = RandomizedSearchCV(DecisionTreeRegressor(), params_d_tree, cv=5)
            rand_search_dtr.fit(x_train, y_train)
            best_tuned_dtr = rand_search_dtr.best_estimator_
            y_pred = best_tuned_dtr.predict(test_features)
            test_pred = best_tuned_dtr.predict(x_test)
            #testing errors
            dtrmretest.append(mean_relative_error(y_test,test_pred))
            dtrmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            dtrrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            #validation errors
            dtrmre.append(mean_relative_error(test_label,y_pred))
            dtrmae.append(metrics.mean_absolute_error(test_label,y_pred))
            dtrrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('RFR')
            rand_search_rfr = RandomizedSearchCV(RandomForestRegressor(), params_tree, cv=5)
            rand_search_rfr.fit(x_train, y_train)
            best_tuned_rfr = rand_search_rfr.best_estimator_
            y_pred = best_tuned_rfr.predict(test_features)
            test_pred = best_tuned_rfr.predict(x_test)
            #testing errors
            rfrmretest.append(mean_relative_error(y_test,test_pred))
            rfrmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            rfrrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            #validation errors
            rfrmre.append(mean_relative_error(test_label,y_pred))
            rfrmae.append(metrics.mean_absolute_error(test_label,y_pred))
            rfrrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('ETR')
            rand_search_etr = RandomizedSearchCV(ExtraTreesRegressor(), params_tree, cv=5)
            rand_search_etr.fit(x_train, y_train)
            best_tuned_etr = rand_search_etr.best_estimator_
            y_pred = best_tuned_etr.predict(test_features)
            test_pred = best_tuned_etr.predict(x_test)
            #testing errors
            etrmretest.append(mean_relative_error(y_test,test_pred))
            etrmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            etrrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            #validation errors
            etrmre.append(mean_relative_error(test_label,y_pred))
            etrmae.append(metrics.mean_absolute_error(test_label,y_pred))
            etrrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('KNN')
            rand_search_knn = RandomizedSearchCV(KNeighborsRegressor(), params_knn, cv=5)
            rand_search_knn.fit(x_train, y_train)
            best_tuned_knn = rand_search_knn.best_estimator_
            y_pred = best_tuned_knn.predict(test_features)
            test_pred = best_tuned_knn.predict(x_test)
            #testing errors
            knnmretest.append(mean_relative_error(y_test,test_pred))
            knnmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            knnrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            #validation errors
            knnmre.append(mean_relative_error(test_label,y_pred))
            knnmae.append(metrics.mean_absolute_error(test_label,y_pred))
            knnrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('SVR')
            rand_search_svr = RandomizedSearchCV(SVR(), params_svr, cv=5, n_jobs = 4)
            rand_search_svr.fit(x_train, y_train)
            best_tuned_svr = rand_search_svr.best_estimator_
            y_pred = best_tuned_svr.predict(test_features)
            test_pred = best_tuned_svr.predict(x_test)
            #testing errors
            svrmretest.append(mean_relative_error(y_test,test_pred))
            svrmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            svrrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            #validation errors
            svrmre.append(mean_relative_error(test_label,y_pred))
            svrmae.append(metrics.mean_absolute_error(test_label,y_pred))
            svrrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('XGB')
            rand_search_xgb = RandomizedSearchCV(XGBRegressor(silent=True), params_xgb, cv=5, n_jobs = 4)
            print('fitting')
            rand_search_xgb.fit(x_train, y_train)
            print('best estimator')
            best_tuned_xgb = rand_search_xgb.best_estimator_
            print('predicting')
            y_pred = best_tuned_xgb.predict(test_features)
            test_pred = best_tuned_xgb.predict(x_test)
            print('testing')
            #testing errors
            bmretest.append(mean_relative_error(y_test,test_pred))
            bmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            brmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            print('validation')
            #validation errors
            bmre.append(mean_relative_error(test_label,y_pred))
            bmae.append(metrics.mean_absolute_error(test_label,y_pred))
            brmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))
            
            print('SR')
            reg1=SVR()
            reg2=ExtraTreesRegressor()
            reg3=KNeighborsRegressor()
            reg4=RandomForestRegressor()
            rand_search_sr = RandomizedSearchCV(StackingRegressor(regressors=[reg1,reg2,reg3,reg4], meta_regressor=SVR()), params, cv=5, n_jobs = 6)
            rand_search_sr.fit(x_train, y_train)
            best_tuned_sr = rand_search_sr.best_estimator_
            y_pred = best_tuned_sr.predict(test_features)
            test_pred = best_tuned_sr.predict(x_test)
            print('testing')
            #testing errors
            strmretest.append(mean_relative_error(y_test,test_pred))
            strmaetest.append(metrics.mean_absolute_error(y_test,test_pred))
            strrmsetest.append(math.sqrt(metrics.mean_squared_error(y_test,test_pred)))
            print('validation')
            #validation errors
            strmre.append(mean_relative_error(test_label,y_pred))
            strmae.append(metrics.mean_absolute_error(test_label,y_pred))
            strrmse.append(math.sqrt(metrics.mean_squared_error(test_label,y_pred)))

            i=i+1

        dtravgmre.append(np.mean(dtrmre))
        dtravgmae.append(np.mean(dtrmae))
        dtravgrmse.append(np.mean(dtrrmse))
        dtravgmretest.append(np.mean(dtrmretest))
        dtravgmaetest.append(np.mean(dtrmaetest))
        dtravgrmsetest.append(np.mean(dtrrmsetest))

        rfravgmre.append(np.mean(rfrmre))
        rfravgmae.append(np.mean(rfrmae))
        rfravgrmse.append(np.mean(rfrrmse))
        rfravgmretest.append(np.mean(rfrmretest))
        rfravgmaetest.append(np.mean(rfrmaetest))
        rfravgrmsetest.append(np.mean(rfrrmsetest))

        etravgmre.append(np.mean(etrmre))
        etravgmae.append(np.mean(etrmae))
        etravgrmse.append(np.mean(etrrmse))
        etravgmretest.append(np.mean(etrmretest))
        etravgmaetest.append(np.mean(etrmaetest))
        etravgrmsetest.append(np.mean(etrrmsetest))

        knnavgmre.append(np.mean(knnmre))
        knnavgmae.append(np.mean(knnmae))
        knnavgrmse.append(np.mean(knnrmse))
        knnavgmretest.append(np.mean(knnmretest))
        knnavgmaetest.append(np.mean(knnmaetest))
        knnavgrmsetest.append(np.mean(knnrmsetest))

        svravgmre.append(np.mean(svrmre))
        svravgmae.append(np.mean(svrmae))
        svravgrmse.append(np.mean(svrrmse))
        svravgmretest.append(np.mean(svrmretest))
        svravgmaetest.append(np.mean(svrmaetest))
        svravgrmsetest.append(np.mean(svrrmsetest))

#         bavgmre.append(np.mean(bmre))
#         bavgmae.append(np.mean(bmae))
#         bavgrmse.append(np.mean(brmse))
#         bavgmretest.append(np.mean(bmretest))
#         bavgmaetest.append(np.mean(bmaetest))
#         bavgrmsetest.append(np.mean(brmsetest))

        stravgmre.append(np.mean(strmre))
        stravgmae.append(np.mean(strmae))
        stravgrmse.append(np.mean(strrmse))
        stravgmretest.append(np.mean(strmretest))
        stravgmaetest.append(np.mean(strmaetest))
        stravgrmsetest.append(np.mean(strrmsetest))

        print('dtr: mre: ', np.mean(dtrmre), ' mae: ', np.mean(dtrmae), ' rmse: ', np.mean(dtrrmse))
        print('test dtr: mre: ', np.mean(dtrmretest), ' mae: ', np.mean(dtrmaetest), ' rmse: ', np.mean(dtrrmsetest))        
        print('rfr: mre: ', np.mean(rfrmre), ' mae: ', np.mean(rfrmae), ' rmse: ', np.mean(rfrrmse))
        print('test rfr: mre: ', np.mean(rfrmretest), ' mae: ', np.mean(rfrmaetest), ' rmse: ', np.mean(rfrrmsetest))
        print('etr: mre: ', np.mean(etrmre), ' mae: ', np.mean(etrmae), ' rmse: ', np.mean(etrrmse))
        print('test etr: mre: ', np.mean(etrmretest), ' mae: ', np.mean(etrmaetest), ' rmse: ', np.mean(etrrmsetest))
        print('knn: mre: ', np.mean(knnmre), ' mae: ', np.mean(knnmae), ' rmse: ', np.mean(knnrmse))
        print('test knn: mre: ', np.mean(knnmretest), ' mae: ', np.mean(knnmaetest), ' rmse: ', np.mean(knnrmsetest))
        print('svr: mre: ', np.mean(svrmre), ' mae: ', np.mean(svrmae), ' rmse: ', np.mean(svrrmse))
        print('test svr: mre: ', np.mean(svrmretest), ' mae: ', np.mean(svrmaetest), ' rmse: ', np.mean(svrrmsetest))
#         print('xgb: mre: ', np.mean(bmre), ' mae: ', np.mean(bmae), ' rmse: ', np.mean(brmse))
#         print('test xgb: mre: ', np.mean(bmretest), ' mae: ', np.mean(bmaetest), ' rmse: ', np.mean(brmsetest))
        print('str: mre: ', np.mean(strmre), ' mae: ', np.mean(strmae), ' rmse: ', np.mean(strrmse))
        print('test str: mre: ', np.mean(strmretest), ' mae: ', np.mean(strmaetest), ' rmse: ', np.mean(strrmsetest))

    print('-------------------------------------------------------------------------------------------')
    print('avg dtr: mre: ', np.mean(dtravgmre), ' mae: ', np.mean(dtravgmae), ' rmse: ', np.mean(dtravgrmse))
    print('test avg dtr: mre: ', np.mean(dtravgmretest), ' mae: ', np.mean(dtravgmaetest), ' rmse: ', np.mean(dtravgrmsetest))
    print('avg rfr: mre: ', np.mean(rfravgmre), ' mae: ', np.mean(rfravgmae), ' rmse: ', np.mean(rfravgrmse))
    print('test avg rfr: mre: ', np.mean(rfravgmretest), ' mae: ', np.mean(rfravgmaetest), ' rmse: ', np.mean(rfravgrmsetest))
    print('avg etr: mre: ', np.mean(etravgmre), ' mae: ', np.mean(etravgmae), ' rmse: ', np.mean(etravgrmse))
    print('test avg etr: mre: ', np.mean(etravgmretest), ' mae: ', np.mean(etravgmaetest), ' rmse: ', np.mean(etravgrmsetest))
    print('avg knn: mre: ', np.mean(knnavgmre), ' mae: ', np.mean(knnavgmae), ' rmse: ', np.mean(knnavgrmse))
    print('test avg knn: mre: ', np.mean(knnavgmretest), ' mae: ', np.mean(knnavgmaetest), ' rmse: ', np.mean(knnavgrmsetest))
    print('avg svr: mre: ', np.mean(svravgmre), ' mae: ', np.mean(svravgmae), ' rmse: ', np.mean(svravgrmse))
    print('test avg svr: mre: ', np.mean(svravgmretest), ' mae: ', np.mean(svravgmaetest), ' rmse: ', np.mean(svravgrmsetest))
#     print('avg xgb: mre: ', np.mean(bavgmre), ' mae: ', np.mean(bavgmae), ' rmse: ', np.mean(bavgrmse))
#     print('test avg xgb: mre: ', np.mean(bavgmretest), ' mae: ', np.mean(bavgmaetest), ' rmse: ', np.mean(bavgrmsetest))
    print('avg str: mre: ', np.mean(stravgmre), ' mae: ', np.mean(stravgmae), ' rmse: ', np.mean(stravgrmse))
    print('test avg str: mre: ', np.mean(stravgmretest), ' mae: ', np.mean(stravgmaetest), ' rmse: ', np.mean(stravgrmsetest))

    with open('results-'+ os.path.splitext(filename)[0] +'.txt', 'w') as f:
                        f.write("validation results:\n")
                        f.write("mre, mae, rmse\n")
                        f.write("dtr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(dtravgmre),np.std(dtravgmre),
                                                                            np.mean(dtravgmae), np.std(dtravgmae),
                                                                            np.mean(dtravgrmse), np.std(dtravgrmse)))
                        f.write("rfr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(rfravgmre),np.std(rfravgmre),
                                                                            np.mean(rfravgmae), np.std(rfravgmae),
                                                                            np.mean(rfravgrmse), np.std(rfravgrmse)))
                        f.write("etr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(etravgmre),np.std(etravgmre),
                                                                            np.mean(etravgmae), np.std(etravgmae),
                                                                            np.mean(etravgrmse), np.std(etravgrmse)))
                        f.write("knn: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(knnavgmre),np.std(knnavgmre),
                                                                            np.mean(knnavgmae), np.std(knnavgmae),
                                                                            np.mean(knnavgrmse), np.std(knnavgrmse)))
                        f.write("svr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(svravgmre),np.std(svravgmre),
                                                                            np.mean(svravgmae), np.std(svravgmae),
                                                                            np.mean(svravgrmse), np.std(svravgrmse)))
#                         f.write("xgb: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(bavgmre),np.std(bavgmre),
#                                                                             np.mean(bavgmae), np.std(bavgmae),
#                                                                             np.mean(bavgrmse), np.std(bavgrmse)))
                        f.write("str: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(stravgmre),np.std(stravgmre),
                                                                            np.mean(stravgmae), np.std(stravgmae),
                                                                            np.mean(stravgrmse), np.std(stravgrmse)))
                        f.write("testing results:\n")
                        f.write("mre, mae, rmse\n")
                        f.write("dtr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(dtravgmretest),np.std(dtravgmretest),
                                                                            np.mean(dtravgmaetest), np.std(dtravgmaetest),
                                                                            np.mean(dtravgrmsetest), np.std(dtravgrmsetest)))
                        f.write("rfr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(rfravgmretest),np.std(rfravgmretest),
                                                                            np.mean(rfravgmaetest), np.std(rfravgmaetest),
                                                                            np.mean(rfravgrmsetest), np.std(rfravgrmsetest)))
                        f.write("etr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(etravgmretest),np.std(etravgmretest),
                                                                            np.mean(etravgmaetest), np.std(etravgmaetest),
                                                                            np.mean(etravgrmsetest), np.std(etravgrmsetest)))
                        f.write("knn: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(knnavgmretest),np.std(knnavgmretest),
                                                                            np.mean(knnavgmaetest), np.std(knnavgmaetest),
                                                                            np.mean(knnavgrmsetest), np.std(knnavgrmsetest)))
                        f.write("svr: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(svravgmretest),np.std(svravgmretest),
                                                                            np.mean(svravgmaetest), np.std(svravgmaetest),
                                                                            np.mean(svravgrmsetest), np.std(svravgrmsetest)))
#                         f.write("xgb: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(bavgmretest),np.std(bavgmretest),
#                                                                             np.mean(bavgmaetest), np.std(bavgmaetest),
#                                                                             np.mean(bavgrmsetest), np.std(bavgrmsetest)))
                        f.write("str: %s(%s) & %s(%s) & %s(%s)\n"% (np.mean(stravgmretest),np.std(stravgmretest),
                                                                            np.mean(stravgmaetest), np.std(stravgmaetest),
                                                                            np.mean(stravgrmsetest), np.std(stravgrmsetest)))

In [7]:
run_techniques()

run:  1
fold number:  1
DTR
RFR
ETR
KNN
SVR
XGB
fitting
best estimator
predicting
testing
validation
SR
testing
validation
fold number:  2
DTR
RFR
ETR
KNN
SVR
XGB
fitting
best estimator
predicting
testing
validation
SR
testing
validation
fold number:  3
DTR
RFR
ETR
KNN
SVR
XGB
fitting
best estimator
predicting
testing
validation
SR
testing
validation
fold number:  4
DTR
RFR


KeyboardInterrupt: 