In [None]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error

In [None]:
# Result tables for RMSE and correlation: one row per model. They start with
# no columns; the evaluation loop adds one column per chromosome.
models = ['Linear', 'Rf', 'MLPR']

# Two independent, identically indexed empty frames.
rmse_df, corr_df = (pd.DataFrame(index=models) for _ in range(2))

In [None]:
# For each chromosome: load its test split and the three pickled regressors,
# predict on the test features, and store each model's RMSE and correlation
# as a new column (named after the chromosome) in rmse_df / corr_df.
#
# NOTE(review): `joblib` is imported from `sklearn.externals` at the top of
# this notebook; that shim was removed in scikit-learn 0.23. On newer
# versions use `import joblib` instead.
# NOTE(security): pd.read_pickle / joblib.load execute arbitrary code while
# unpickling; only point DATA_DIR at files you trust.
DATA_DIR = '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Data/CV_t15_rotate_chrs'

# Row label used in rmse_df / corr_df -> file-name prefix of the saved regressor.
MODEL_FILE_PREFIX = {'Linear': 'linear', 'Rf': 'RF', 'MLPR': 'MLPR'}

# Chromosome labels '1'..'22' plus 'X', as strings (the original mixed
# int/str list was coerced to strings by np.array anyway).
chromosomes = np.array([str(c) for c in range(1, 23)] + ['X'])


for i in chromosomes:
    # Read in test data as plain numpy arrays; the target is flattened to 1-D.
    X_test = pd.read_pickle('%s/X_test_chr%s.pkl' % (DATA_DIR, i)).values
    y = pd.read_pickle('%s/y_test_chr%s.pkl' % (DATA_DIR, i)).values.flatten()

    rmse = []
    corr = []
    # Iterate in the row order of the result tables so the list assignment
    # below lines up with the index (corr_df is built with the same index).
    for row_label in rmse_df.index:
        regressor = joblib.load(
            '%s/regressors/%s_%s.pkl' % (DATA_DIR, MODEL_FILE_PREFIX[row_label], i)
        )
        y_pred = regressor.predict(X_test).flatten()

        # mean_squared_error returns the MSE; take the square root so the
        # stored value really is the RMSE that the tables and plots claim.
        rmse.append(np.sqrt(mean_squared_error(y, y_pred)))
        # Series.corr defaults to the Pearson correlation coefficient.
        corr.append(pd.Series(y).corr(pd.Series(y_pred)))

    # Save metrics to dataframes.
    rmse_df['%s'%i] = rmse
    corr_df['%s'%i] = corr

In [None]:
# Line plot of each model's RMSE per chromosome.
# The columns of rmse_df are the chromosome labels, so derive both the x
# positions and the tick labels from them instead of hard-coding 23 points
# (which also showed chromosome X as "23").
# NOTE(review): the metrics are computed from the X_test/y_test files, yet the
# axis label and title say "Train set" — confirm the intended wording.
chrom_labels = list(rmse_df.columns)
x = np.arange(1, len(chrom_labels) + 1)
y1 = rmse_df.loc['Linear']
y2 = rmse_df.loc['Rf']
y3 = rmse_df.loc['MLPR']

fig, ax = plt.subplots()
ax.plot(x, y1, label = 'Linear')
ax.plot(x, y2, label = 'RF')
ax.plot(x, y3, label = 'MLPR')
ax.set_xticks(x)
ax.set_xticklabels(chrom_labels)
ax.set_xlabel('Train set CHR')
ax.set_ylabel('RMSE')
# Spelling fixed ("Comparision" -> "Comparison") to match the bar-chart titles.
ax.set_title('Comparison of RMSE Between Models for Train Set')
ax.legend()
plt.show()

In [None]:
# Line plot of each model's prediction/target correlation per chromosome.
# The columns of corr_df are the chromosome labels, so derive both the x
# positions and the tick labels from them instead of hard-coding 23 points
# (which also showed chromosome X as "23").
# NOTE(review): the metrics are computed from the X_test/y_test files, yet the
# axis label and title say "Train set" — confirm the intended wording.
chrom_labels = list(corr_df.columns)
x = np.arange(1, len(chrom_labels) + 1)
y1 = corr_df.loc['Linear']
y2 = corr_df.loc['Rf']
y3 = corr_df.loc['MLPR']

fig, ax = plt.subplots()
ax.plot(x, y1, label = 'Linear')
ax.plot(x, y2, label = 'RF')
ax.plot(x, y3, label = 'MLPR')
ax.set_xticks(x)
ax.set_xticklabels(chrom_labels)
ax.set_xlabel('Train set CHR')
ax.set_ylabel('CORR')
# Spelling fixed ("Comparision" -> "Comparison") to match the bar-chart titles.
ax.set_title('Comparison of Correlation Between Models for Train Set')
ax.legend()
plt.show()

In [None]:
# One weight per chromosome column, in column order (23 values).
# Presumably the number of test samples per chromosome — TODO confirm.
counts = [1562,1270,1046,723,850,1049,820,640,678,779,779,808,375,478,494,556,685,327,528,463,183,284,343]


def _count_weighted_mean(per_chr_scores):
    """Return the `counts`-weighted mean of one model's per-chromosome scores.

    per_chr_scores: a row of rmse_df / corr_df (one value per chromosome).
    """
    weighted = per_chr_scores * counts
    return weighted.sum()/sum(counts)


# Weighted RMSE per model.
linear_weighted_rmse = _count_weighted_mean(rmse_df.loc['Linear'])
RF_weighted_rmse = _count_weighted_mean(rmse_df.loc['Rf'])
MLPR_weighted_rmse = _count_weighted_mean(rmse_df.loc['MLPR'])


# Weighted correlation per model.
linear_weighted_corr = _count_weighted_mean(corr_df.loc['Linear'])
RF_weighted_corr = _count_weighted_mean(corr_df.loc['Rf'])
MLPR_weighted_corr = _count_weighted_mean(corr_df.loc['MLPR'])

In [None]:
# Horizontal bar chart of the count-weighted RMSE, one bar per model.
label = ['Linear', 'RF', 'MLPR']
rmse = [
    linear_weighted_rmse,
    RF_weighted_rmse,
    MLPR_weighted_rmse,
]

plt.barh(label, rmse)
plt.title('Comparison of RMSE for Different Models')

# Write each bar's value as text, anchored at the end of the bar.
for bar_pos, score in zip(range(len(rmse)), rmse):
    plt.text(score, bar_pos, str(score))

In [None]:
# Horizontal bar chart of the count-weighted correlation, one bar per model.
label = ['Linear', 'RF', 'MLPR']
corr = [
    linear_weighted_corr,
    RF_weighted_corr,
    MLPR_weighted_corr,
]

plt.barh(label, corr)
plt.title('Comparison of Correlation for Different Models')

# Write each bar's value as text, anchored at the end of the bar.
for bar_pos, score in zip(range(len(corr)), corr):
    plt.text(score, bar_pos, str(score))