In [24]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
from sklearn.externals import joblib
import scipy
from scipy.stats.stats import pearsonr 

In [25]:
# Regressor families to evaluate; each name is also used as the directory
# and file-name prefix when the saved per-tile models are loaded.
models = ['Linear', 'RF']

# Tile labels 't1' .. 't31', in numeric order.
tiles = ['t%d' % tile_number for tile_number in range(1, 32)]

# Full table of tile scores; only its index (the complete set of regions)
# is used further down.
tile_counts = pd.read_pickle('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/tile_scores.pkl')

# Raw model inputs, exactly as stored on disk (rows may contain NaN).
base_input = pd.read_pickle('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/base.pkl')
inference_input = pd.read_pickle('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/inference.pkl')

# Row-wise NaN filter: a row is discarded as soon as any one of its values
# is missing. The raw frames above are left untouched.
base_input_all = base_input.dropna(axis=0, how='any')
inference_input_all = inference_input.dropna(axis=0, how='any')

In [26]:
# Empty, column-less placeholders for the four prediction tables.
# Frames for the base models share the index of the NaN-free base inputs ...
Linear_predictions = pd.DataFrame(index=base_input_all.index)
RF_predictions = pd.DataFrame(index=base_input_all.index)

# ... and frames for the inference models share the index of the NaN-free
# inference inputs.
Linear_inference_predictions = pd.DataFrame(index=inference_input_all.index)
RF_inference_predictions = pd.DataFrame(index=inference_input_all.index)

In [27]:
# For every regressor family, run each saved per-tile model over the NaN-free
# inputs and collect one prediction column per tile.
for model in models:
    base_predictions = pd.DataFrame(index=base_input_all.index)
    inference_predictions = pd.DataFrame(index=inference_input_all.index)

    for tile in tiles:
        # Two saved regressors exist per (family, tile): the "base" one and
        # the "all_inference" one.
        base_model_path = ('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/%s/%s_%s.pkl'
                           % (model, model, tile))
        inference_model_path = ('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/%s_all_inference/%s_all_inference_%s.pkl'
                                % (model, model, tile))
        base_model = joblib.load(base_model_path)
        inference_model = joblib.load(inference_model_path)

        # Base-model feature columns: the tile's own column first, followed by
        # 'actual_tN' for each of the other tiles in numeric order (N = 1..31).
        other_tile_columns = ['actual_t%s' % number
                              for number in range(1, 32)
                              if 't%s' % number != tile]
        tiles_in_base = [tile] + other_tile_columns

        # NOTE: these assignments rebind the notebook-level names `base_input`
        # and `inference_input` (originally the raw loaded frames). The names
        # are kept as-is because code outside this loop may read them.
        base_input = base_input_all[tiles_in_base]
        inference_input = inference_input_all.drop(columns=[tile])

        base_predictions[tile] = base_model.predict(base_input)
        inference_predictions[tile] = inference_model.predict(inference_input)

    # Publish the finished tables under the family-specific names.
    if model != 'RF':
        Linear_predictions = base_predictions
        Linear_inference_predictions = inference_predictions
    else:
        RF_predictions = base_predictions
        RF_inference_predictions = inference_predictions
        

In [28]:
# Sanity checks on the four prediction tables. Each check prints one bool.

# 1) All four tables must cover the same regions in the same order.
#    Index.equals is used instead of all(a == b): an element-wise comparison
#    of two indexes with different lengths raises ValueError, whereas
#    Index.equals simply reports False in that case.
index_checks = [
    Linear_predictions.index.equals(Linear_inference_predictions.index),
    RF_predictions.index.equals(RF_inference_predictions.index),
    Linear_inference_predictions.index.equals(RF_predictions.index),
]
print(all(index_checks))

# 2) No prediction table may contain a NaN. Testing the NaN mask directly
#    avoids building a dropna() copy of every table just to compare it.
nan_checks = [
    not frame.isna().values.any()
    for frame in (
        Linear_predictions,
        Linear_inference_predictions,
        RF_predictions,
        RF_inference_predictions,
    )
]
print(all(nan_checks))

True
True


In [34]:
# Build the coordinate table for exactly the regions that received
# predictions, then cache it as a pickle.
# The coords file is tab-separated, has no header row, and its first column
# is used as the row index. reindex() makes the rows follow
# Linear_predictions.index; labels absent from the file come back as
# all-NaN rows.
coord_df = (
    pd.read_csv('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/coords_all.txt',
                sep='\t', index_col=0, header=None)
    .reindex(Linear_predictions.index)
)

coord_df.to_pickle('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/coords_SHARPR.pkl')

In [30]:
# Write each prediction table to its own tab-separated file: index column
# included, no header row, missing values written as empty fields.
prediction_outputs = [
    (Linear_predictions,
     '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/predictions/linear.tsv'),
    (Linear_inference_predictions,
     '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/predictions/linear_inference.tsv'),
    (RF_predictions,
     '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/predictions/RF.tsv'),
    (RF_inference_predictions,
     '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Combined/data/predictions/RF_inference.tsv'),
]

for predictions_frame, tsv_path in prediction_outputs:
    predictions_frame.to_csv(tsv_path, sep='\t', header=False, na_rep="")

In [31]:
#figure out which regions are missing
# i.e. regions present in the full tile-score table but absent from the
# NaN-free base inputs.
all_regions = tile_counts.index

# Read the index from base_input_all explicitly. The name `base_input` is
# rebound inside the prediction loop, so what it holds depends on which cells
# have already been executed; base_input_all is always the NaN-filtered frame.
non_null = base_input_all.index

# sort=False keeps the result in the order the labels appear in all_regions.
missing = all_regions.difference(non_null, sort=False)