In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.gridspec as gridspec
from matplotlib import patches
import seaborn as sn
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE
from scipy.stats import friedmanchisquare, kruskal
from sklearn.cluster import k_means
from sklearn.preprocessing import normalize 
#from fancyimpute import KNN
import itertools
#from adjustText import adjust_text
import multiprocessing as mp
from numpy import savetxt
from numpy import loadtxt

## Load in data


In [2]:
%%time
# Read the barcode matrix from its CSV export and hold it as a float64 array.
barcode_csv = r'/home/mwjacobs/mapseq_data_preprocess/p3_nbcm_combined_for-preprocess - Sheet1.csv'
barcodematrix = np.genfromtxt(barcode_csv, delimiter=',').astype(np.float64)
print(barcodematrix.shape)

(4910, 8)
CPU times: user 11.9 ms, sys: 991 µs, total: 12.8 ms
Wall time: 12.5 ms


In [3]:
# Independent float64 copy of the loaded matrix; the preprocessing cells below
# read from `rawbarcodematrix`. Note that this cell still needs `barcodematrix`
# from the load cell above to exist in the kernel.
rawbarcodematrix = barcodematrix.astype(np.float64)
print(rawbarcodematrix.shape)

(4910, 8)


## Preprocessing

## 1) Organize into separate animals for processing

In [4]:
## Column selection for each animal, following the sample key.
## Old order, for reference: PFC, NAc, LS, DLS, BNST, LH, BA, CeA, vCA1
# Current order: RSP, PM, AM, A, RL, AL, LM, Cerebellum
i1 = list(range(8))
#i2 = [9,10,11,12,13,14,15,16]

## One float64 matrix per animal. Indexing with a list of columns already
## returns a new array; astype keeps the result explicitly float64.
a1 = rawbarcodematrix[:, i1].astype(np.float64)
#a2 = rawbarcodematrix[:, i2].astype(np.float64)


print(a1.shape)
#print(a2.shape)


(4910, 8)


## 2) Cleaning 

In [5]:
#remove empty barcoded cells
def clean_up_zeros(arr=None):
    """Remove rows that contain only zeros.

    Parameters
    ----------
    arr : array-like, optional
        2-D matrix to filter. When omitted, the function keeps its original
        behaviour: the module-level ``a1`` is re-bound to its filtered
        version and ``None`` is returned.

    Returns
    -------
    numpy.ndarray or None
        A new array holding only the rows of ``arr`` that have at least one
        non-zero entry, or ``None`` in the legacy no-argument mode.
    """
    # a1 is the only global this function ever assigns, so it is the only
    # name that needs a global declaration.
    global a1
    if arr is None:
        # Legacy mode: filter the global animal matrix in place (by rebinding).
        a1 = a1[np.any(a1, axis=1)]
        return None
    arr = np.asarray(arr)
    # np.any(..., axis=1) is True for every row with a non-zero entry.
    return arr[np.any(arr, axis=1)]
    #a2 = a2[np.any(a2, axis=1)]
clean_up_zeros()


# Keep only the rows that have a non-zero value somewhere in columns 0-7.
a1 = a1[a1[:, 0:8].any(axis=1)]
#a2 = a2[a2[:, 0:8].any(axis=1)]


print("Animal 1", a1.shape)
#print("Animal 2", a2.shape)

# Total # of Cells
print(len(a1)) # + len(a2))

Animal 1 (4910, 8)
4910


In [6]:
# Discard every barcode with a positive count in the negative target area
# (column index 7, i.e. Cerebellum in the column order listed in this notebook).
d1 = a1[:, 7] > 0
a1 = a1[np.logical_not(d1)]
#d2 = a2[:, 7] > 0
#a2 = a2[np.logical_not(d2)]


print("Animal 1", a1.shape)
#print("Animal 2", a2.shape)

# Total # of Cells
print(len(a1)) # + len(a2))

Animal 1 (4910, 8)
4910


## Filter Thresholds

In [7]:
## Minimum count a barcode must reach in at least one target column
minim2 = 2

# A row is flagged when even its largest value across columns 0-6 stays
# below the threshold; flagged rows are left out of caa1.
filter01 = a1[:, 0:7].max(axis=1) < minim2
caa1 = a1[np.logical_not(filter01)]
#filter02 = a2[:, 0:7].max(axis=1) < minim2
#caa2 = a2[np.logical_not(filter02)]


print("Animal 1", caa1.shape)
#print("Animal 2", caa2.shape)


print(len(caa1)) #+ len(caa2))

Animal 1 (4910, 8)
4910


In [8]:
## Copy of the thresholded matrix under the name the following cell reads
na1 = np.copy(caa1)
#na2 = np.copy(caa2)



In [9]:
# Keep the per-animal matrix as float64
f_a1 = np.array(na1, dtype=np.float64)
#f_a2 = np.array(na2, dtype=np.float64)


# Drop all-zero rows from f_a1 itself. This cell used to call
# clean_up_zeros(), which only re-filters the global a1 and never touches
# f_a1, so the shape printed below was not that of a freshly cleaned array.
f_a1 = f_a1[np.any(f_a1, axis=1)]
#f_a2 = f_a2[np.any(f_a2, axis=1)]

print("Animal 1", f_a1.shape)
#print("Animal 2", f_a2.shape)


## Total # of Cells
print(f_a1.shape[0]) # + f_a2.shape[0])

Animal 1 (4910, 8)
4910


In [10]:
# Pool every animal into a single matrix.
#matrix = np.concatenate((f_a1, f_a2), axis=0) # use this form when there are more animals
# Single-animal case: a float64 copy of f_a1.
matrix = f_a1.astype(np.float64)
print(matrix.shape)

(4910, 8)


In [11]:
## Keep only the barcodes (rows) whose count in column 7 is exactly zero,
## then remove column 7 itself.
# Column order: RSP, PM, AM, A, RL, AL, LM, Cerebellum (column 7)
#old order for reference: PFC, NAc, LS, DLS, BNST, LH, BA, CeA, vCA1
nondlsers = np.nonzero(matrix[:, 7] == 0)
datas = matrix[nondlsers]
# np.delete returns a new array without column 7; datas is left as is.
ndata = np.delete(datas, 7, axis=1)
fdata = ndata #np.array(np.delete(ndata, 6, 1))
print(fdata.shape)

(4910, 7)


In [12]:
##importing and adding first cohort data (kept for reference only; the next cell performs this load with a missing-file fallback)
#cohort1 = loadtxt(r'/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/C1_fdata.csv', delimiter=',')
#print(cohort1.shape)
#print(fdata.shape)

#combine with current data into new matrix
#ffdata = np.concatenate((cohort1, fdata), axis=0)
#finaldata = np.array(ffdata, dtype=np.float64)

#print(finaldata.shape)

In [13]:
# Try to read the first-cohort matrix; when the file is absent, continue with
# the current data only.
try:
    cohort1 = loadtxt(r'/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/C1_fdata.csv', delimiter=',')
except FileNotFoundError:
    print("Cohort1 data not found. Proceeding with only current data.")
    # Zero-row placeholder with as many columns as fdata, so the row-wise
    # concatenation below still works.
    cohort1 = np.empty((0, fdata.shape[1]))
else:
    print("Cohort1 loaded successfully:", cohort1.shape)

# Stack cohort1 on top of the current data and keep the result as float64.
ffdata = np.concatenate([cohort1, fdata])
finaldata = ffdata.astype(np.float64)

print("Final combined data shape:", finaldata.shape)


Cohort1 data not found. Proceeding with only current data.
Final combined data shape: (4910, 7)


In [14]:
## Scale every barcode (row) so that its largest value becomes 1.
## (The L1 alternative, where each row sums to 1, would be:
##  nmatrix = normalize(fdata, axis=1, norm='l1'))
d1 = finaldata.copy()

# Per-row maximum, kept as a column vector so it broadcasts across each row.
rowmax = np.amax(d1, axis=1, keepdims=True)
# Divide in place, skipping rows whose maximum is 0: dividing those would
# turn the row into NaN (0/0). Skipped rows are left unchanged.
np.divide(d1, rowmax, out=d1, where=rowmax != 0)


print(d1.shape)
print(np.amax(d1))

(4910, 7)
1.0


In [15]:
## Wrap the normalized matrix in a labelled DataFrame
# Column order: RSP, PM, AM, A, RL, AL, LM (the Cerebellum column was removed earlier)
#old order for reference: PFC, NAc, LS, DLS, BNST, LH, BA, CeA, vCA1
df = pd.DataFrame(d1, columns=["RSP", "PM", "AM","A", "RL", "AL", "LM"])
# Write d1 to both output locations. Note that d1 holds the per-row
# normalized values produced by the previous cell, not raw counts.
for out_path in ('finalmatrix.csv', '/home/mwjacobs/mapseq_data_preprocess/p3final.csv'):
    savetxt(out_path, d1, delimiter=',')
print(df.shape)
print(df)

(4910, 7)
      RSP        PM   AM    A        RL   AL   LM
0     0.0  0.000000  0.0  0.0  1.000000  0.0  0.0
1     0.0  0.000000  0.0  0.0  1.000000  0.0  0.0
2     0.0  0.000000  0.0  0.0  1.000000  0.0  0.0
3     0.0  0.000000  0.0  0.0  0.000000  0.0  1.0
4     0.0  0.000000  0.0  1.0  0.000000  0.0  0.0
...   ...       ...  ...  ...       ...  ...  ...
4905  0.0  1.000000  0.0  0.0  0.000000  0.0  0.0
4906  0.0  0.000601  0.0  0.0  0.000633  0.0  1.0
4907  0.0  0.000000  0.0  0.0  0.000000  0.0  1.0
4908  0.0  1.000000  0.0  0.0  0.000000  0.0  0.0
4909  0.0  0.000000  0.0  0.0  0.000000  0.0  1.0

[4910 rows x 7 columns]


In [16]:
# Sanity check: largest value anywhere in the normalized matrix
print(d1.max())

1.0


## END OF PREPROCESSING DATA