In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
from sklearn.externals import joblib
import scipy
from scipy.stats.stats import pearsonr 

In [None]:
# Column names of the 31 tiles: 't1' through 't31'.
tiles = ['t%d' % tile_number for tile_number in range(1, 32)]

# Load the base table holding the scores of every tile.
tile_counts = pd.read_pickle('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/tile_scores.pkl')

# Pick out every row that has at least one missing tile value.
has_missing_tile = tile_counts.isnull().any(axis=1)
incomplete_rows = tile_counts.loc[has_missing_tile]

# Persist those rows as an all-NaN table with the same index and columns.
# Partially filled rows are blanked as well, e.g. K562_9_95_chr1_59594155,
# which had the values 0.276589, 2.762016 and 0.655101.
missing = pd.DataFrame(index=incomplete_rows.index, columns=incomplete_rows.columns)
missing.to_pickle('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Data/missing_tiles.pkl')

# Everything downstream works on the complete rows only.
tile_counts = tile_counts.dropna(axis=0, how='any')


In [None]:
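# Optional: generate predictions only on the test set (the regions listed in chr8.txt and chr18.txt). Uncomment the lines below to enable; remove this cell if it is not needed.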
# chr_8 = np.genfromtxt('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/DNACOUNTS/chr8.txt', 
#                         dtype=str, delimiter ='\n')
# chr_18 = np.genfromtxt('/u/home/m/mudiyang/scratch/Scaleup_counts_sequences/DNACOUNTS/chr18.txt', 
#                         dtype=str, delimiter ='\n')
# drop_set = np.concatenate((chr_8, chr_18), axis=None)

# tile_counts = tile_counts[tile_counts.index.isin(drop_set)]

In [None]:
# Display the tile_counts table as it stands after the loading/cleaning cell (rows with NaN dropped).
tile_counts

In [None]:
# Table that collects one column of predicted values per tile,
# sharing its row index with tile_counts.
all_predictions = pd.DataFrame(index=tile_counts.index)

# Leave-one-tile-out prediction: for each tile, the features are all the
# other tile columns, and the tile-specific regressor loaded from disk
# predicts the column that was held out.
for tile in tiles:
    other_tiles = tile_counts.drop(columns=[tile]).values

    # NOTE(review): joblib.load unpickles the file, so these regressor
    # files must come from a trusted location.
    regressor = joblib.load('/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Linear_%s.pkl'%tile)

    all_predictions[tile] = regressor.predict(other_tiles)

# Scale every prediction by a constant factor of 3 to increase the
# amplitude of the variations.
all_predictions = all_predictions * 3

In [None]:
# Put the prediction rows in the same order as the tile_counts rows,
# then persist both tables side by side as pickles.
all_predictions = all_predictions.reindex(index=tile_counts.index)

for frame, pickle_path in (
    (all_predictions, '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Data/all_predictions_linear.pkl'),
    (tile_counts, '/u/home/m/mudiyang/scratch/Tile_Prediction_Regressors/Data/tile_counts.pkl'),
):
    frame.to_pickle(pickle_path)

In [None]:
# Heatmap of the predicted tile scores for a window of regions.
start = 0
# Order the regions by their predicted t15 score and keep up to 31 rows
# beginning at `start`. The slice is already sorted, so no second sort is
# needed; change the sort key here to order by another tile instead.
matrix = all_predictions.sort_values(by = ['t15']).iloc[start:start+31]

plt.figure(figsize = (10,10))
plt.imshow(matrix, cmap = 'plasma')
plt.title("Scores of predicted Tiles")

# Tick positions are derived from the actual shape of the displayed window,
# so the labels always match even when fewer than 31 rows are available.
plt.xlabel("Tile")
plt.xticks(np.arange(matrix.shape[1]),matrix.columns)

plt.ylabel("Region")
plt.yticks(np.arange(matrix.shape[0]),matrix.index)
plt.show()

In [None]:
# Heatmap of the experimentally measured tile counts for a window of regions.
start = 0
# Order the regions by their measured t15 value and keep up to 31 rows
# beginning at `start`. The slice is already sorted, so no second sort is
# needed; change the sort key here to order by another tile instead.
matrix = tile_counts.sort_values(by = ['t15']).iloc[start:start+31]

plt.figure(figsize = (10,10))
plt.imshow(matrix, cmap = 'plasma')
plt.title("Counts of actual Tiles")

# Tick positions are derived from the actual shape of the displayed window,
# so the labels always match even when fewer than 31 rows are available.
plt.xlabel("Tile")
plt.xticks(np.arange(matrix.shape[1]),matrix.columns)

plt.ylabel("Region")
plt.yticks(np.arange(matrix.shape[0]),matrix.index)
plt.show()

In [None]:
# Correlation between the predicted and the measured column of each tile,
# in the order given by `tiles`.
correlation_tile = [all_predictions[tile].corr(tile_counts[tile]) for tile in tiles]

# Line plot of the per-tile correlation against tile number (1..31).
fig, ax = plt.subplots()
ax.plot(range(1, 32), correlation_tile)
ax.set_xlabel('tile')
ax.set_ylabel('correlation')
ax.set_title('correlation by tile')
plt.show()

In [None]:
# With axis=0, corrwith correlates each column of all_predictions with the
# same-named column of tile_counts, giving one value per tile.
# NOTE(review): despite its name, `correlation_region` holds per-tile
# correlations in this cell; the plot title reflects that.
correlation_region = all_predictions.corrwith(tile_counts, axis=0)

# Histogram of those per-tile correlations.
fig, ax = plt.subplots()
ax.hist(correlation_region)
ax.set_xlabel('correlation')
ax.set_ylabel('number')
ax.set_title('correlation by tile')
plt.show()

In [None]:
# With axis=1, corrwith correlates each row of all_predictions with the
# matching row of tile_counts, giving one value per region.
correlation_region = all_predictions.corrwith(tile_counts, axis=1)

# Histogram of those per-region correlations.
fig, ax = plt.subplots()
ax.hist(correlation_region)
ax.set_xlabel('correlation')
ax.set_ylabel('number')
ax.set_title('correlation by region')
plt.show()

In [None]:
# import scipy
# from scipy.stats.stats import pearsonr 
# indices = []
# for i in range(0,1500, 50):
#     plt.plot(range(1,32,1), all_predictions.iloc[i], label = 'predicted region')
#     plt.plot(range(1,32,1), tile_counts.iloc[i], label = 'experimental region')
#     plt.plot(range(1,32,1), [np.average(tile_counts.iloc[i])]*31, label = 'mean experimental region')
#     correlation, p = pearsonr(all_predictions.iloc[i], tile_counts.iloc[i]) 
#     plt.xlabel('tile number \n\n corr = %s'%(correlation))
#     plt.ylabel('tile value')
#     plt.title(all_predictions.index[i])
#     plt.legend()
#     plt.show()
#     indices.append(all_predictions.index[i])

In [None]:
# Flatten both tables into one long Series each (indexed by tile and region)
# and compute a single correlation across all predicted/measured pairs.
predicted_flat = pd.Series(all_predictions.unstack())
measured_flat = pd.Series(tile_counts.unstack())
predicted_flat.corr(measured_flat)

In [None]:
# One scatter plot per tile of predicted vs. experimental values, followed
# by the Pearson correlation coefficient of that tile printed underneath.
for tile in tiles:
    predicted = all_predictions[tile]
    experimental = tile_counts[tile]

    plt.scatter(predicted, experimental, edgecolor = 'w')
    plt.xlabel('predicted %s value'%tile)
    plt.ylabel('experimental %s value'%tile)
    plt.show()

    # pearsonr returns the coefficient and its p-value; only the
    # coefficient is reported.
    r_value, p_value = pearsonr(predicted, experimental)
    print(r_value)

In [None]:
# Display the tile_counts table once more for reference at the end of the notebook.
tile_counts