In [48]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import LinearRegression
import sys
import pickle
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import HuberRegressor

In [91]:
# Ordinary least-squares model; it is re-fit on every fold of the leave-one-OR-out loop below.
model = LinearRegression()

In [107]:
# Seed handed to the BorderlineSMOTE oversampler used in the leave-one-OR-out loop.
random_state = 12
# ORs with fewer than this many cells in the expression table are dropped before modelling.
min_cells = 7

In [108]:
# Load the tab-separated coordinate table; the file has no header row, so the
# column names are supplied here.
df = pd.read_csv(
    "data/transformed_coords_1to2.txt",
    sep="\t",
    header=None,
    names=["OR", "X", "Y", "EXP"],
)

In [109]:
# Collapse to one row per OR: coordinates shared between the two experiments are
# averaged and EXP is summed.
df = df.groupby("OR").agg({"X": "mean", "Y": "mean", "EXP": "sum"})
# Quick-and-dirty way to drop the OR that falls off the slide without knowing
# its ID: keep only the rows with a negative Y.
df = df.loc[df["Y"] < 0]
ORs = sorted(df.index)
print("{} ORs with Slide-seq coordinates".format(len(ORs)))
# Per-OR lookup: coords[or_name] -> {"X": ..., "Y": ...}
xy_coords = df[["X", "Y"]]
coords = xy_coords.to_dict("index")
# Fitting the scaler makes it store the min and max of the X and Y coordinates.
scaler_y = MinMaxScaler().fit(xy_coords.values)

77 ORs with Slide-seq coordinates


In [110]:
# Load the per-cell table used for the regression; the first CSV column becomes
# the row index. Note: this rebinds `df`, replacing the coordinate table.
df = pd.read_csv(
    "data/LR_OSN_all_50PCs_3000G.csv",
    index_col=0,
)

In [111]:

# Rename the "observed" column to "OR" so it matches the coordinate table's key.
df = df.rename(columns={"observed": "OR"})
print(df.head())

                       harmony_1  harmony_2  harmony_3  harmony_4  harmony_5  \
V3F1_CAACGGCAGTAAACAC   2.558849   0.506802  -2.369886   1.874797   1.657120   
V3F3_GAGACCCTCTGAGCAT  -0.964222  -6.844951  -1.793788  -0.045801  -0.452054   
V3F1_GAGGGATGTAGCCCTG  10.467781   2.561541   2.176045   1.047865  -2.936098   
V3F3_ACGTAGTCAGGACAGT   3.855414  -3.486225   2.411903  -5.956743  -2.668481   
V3M1_ACCAAACCAAGTGACG   3.137225   3.418350   4.027432  -0.138194   0.723199   

                       harmony_6  harmony_7  harmony_8  harmony_9  harmony_10  \
V3F1_CAACGGCAGTAAACAC  -0.746696   0.330887   1.457195  -0.677314   -0.956127   
V3F3_GAGACCCTCTGAGCAT  -0.311725  -0.178832  -0.569134   0.065686    0.931873   
V3F1_GAGGGATGTAGCCCTG  -1.162711   0.215706   0.705806   0.393240    0.137744   
V3F3_ACGTAGTCAGGACAGT   2.818753  -3.793611   2.078421  -2.440817   -1.163426   
V3M1_ACCAAACCAAGTGACG   4.358617  -0.906037  -0.747231  -2.364545    4.069403   

                       ...  harm

In [112]:
#df.drop(df.columns[40:50], axis=1,inplace=True)
#print(df.head())

In [113]:
# Discard cells whose OR is not in ORs, i.e. has no Slide-seq coordinates.
df = df.loc[df["OR"].isin(ORs)]

In [114]:
# Count cells per OR once, then flag every OR that has fewer than min_cells
# cells (ORs absent from the table count as zero).
cells_per_or = df["OR"].value_counts()
ORs_to_exclude = [name for name in ORs if cells_per_or.get(name, 0) < min_cells]
print(ORs_to_exclude)
# Drop the cells that belong to the flagged ORs.
df = df.loc[~df["OR"].isin(ORs_to_exclude)]

['Olfr1000', 'Olfr1031', 'Olfr1086', 'Olfr118', 'Olfr1357', 'Olfr1424', 'Olfr1477', 'Olfr1496', 'Olfr433', 'Olfr517', 'Olfr749', 'Olfr822', 'Olfr994']


In [115]:
# Report how many ORs were dropped, then narrow ORs to the ones that are kept.
# The excluded list holds ORs with fewer than min_cells cells (not only ORs with
# no expression data at all), so the message states that criterion.
print("{} ORs with Slide-seq coordinates but fewer than {} cells".format(len(ORs_to_exclude), min_cells))
# Single pass over the table instead of building one filtered copy of df per OR.
cells_per_or = df["OR"].value_counts()
ORs = [name for name in ORs if cells_per_or.get(name, 0) >= min_cells]
print("{} ORs remaining".format(len(ORs)))

13 ORs with Slide-seq coordinates but no gene expression
64 ORs remaining


In [116]:
# Leave-one-OR-out evaluation.
# For each OR in turn: hold out all of its cells, oversample the remaining
# cells by OR label, fit `model` to predict the (X, Y) coordinates of each
# training cell's OR, then predict coordinates for the held-out cells.
# Each held-out cell contributes two records to appended_data:
#   ["x", OR, observed_x, predicted_x] and ["y", OR, observed_y, predicted_y].
n_col = len(df.columns)  # kept in case a later cell still refers to it

# Pick the feature columns by name rather than assuming "OR" is the last
# column, and cast to float so the model never sees an object-dtype array.
feature_cols = [col for col in df.columns if col != "OR"]
features = df[feature_cols].to_numpy(dtype=float)
or_labels = df["OR"].to_numpy()

# Table form of `coords` so the targets for a whole fold can be looked up at once.
coord_lookup = pd.DataFrame.from_dict(coords, orient="index")[["X", "Y"]]

appended_data = []
for OR in ORs:
    # Build the held-out mask once per fold and reuse it for features and labels.
    held_out = or_labels == OR
    X_train, train_ors = features[~held_out], or_labels[~held_out]
    X_test, test_ors = features[held_out], or_labels[held_out]

    # Oversample the training fold only; the held-out cells are left untouched.
    oversample = BorderlineSMOTE(random_state=random_state)
    X_train, train_ors = oversample.fit_resample(X_train, train_ors)

    # Regression targets: the [X, Y] coordinates of each cell's OR.
    y_train = coord_lookup.loc[train_ors].to_numpy()
    y_test = coord_lookup.loc[test_ors].to_numpy()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    for observed, predicted in zip(y_test, y_pred):
        appended_data.append(["x", OR, observed[0], predicted[0]])
        appended_data.append(["y", OR, observed[1], predicted[1]])

In [123]:
# Turn the records collected in the loop into a long-format table.
# Note: this rebinds appended_data from a list to a DataFrame.
appended_data = pd.DataFrame(
    appended_data,
    columns=["axis", "OR", "observed", "predicted"],
)

In [124]:
# Split the long-format results by axis and average the observed and predicted
# coordinates per OR.
# The numeric columns are selected explicitly: the string "axis" column cannot
# be averaged, and pandas >= 2.0 raises a TypeError instead of silently dropping
# it. The column order (observed, predicted) is kept because later cells index
# these frames by position.
value_cols = ["observed", "predicted"]
datax = appended_data.loc[appended_data["axis"] == "x"].groupby("OR")[value_cols].mean()
datay = appended_data.loc[appended_data["axis"] == "y"].groupby("OR")[value_cols].mean()

In [125]:
# R2 between the first and second columns of the per-OR mean tables
# (expected to be observed and predicted), computed separately for each axis.
r2_x = r2_score(datax.iloc[:, 0], datax.iloc[:, 1])
r2_y = r2_score(datay.iloc[:, 0], datay.iloc[:, 1])
print("X R2 = {} \t Y R2 = {}".format(r2_x, r2_y))

X R2 = 0.719421325081337 	 Y R2 = 0.8341772968537957


In [127]:
# Save the per-OR observed/predicted tables for each axis.
# NOTE(review): the original wrote datax2/datay2, names that are never defined
# in this notebook, so a fresh "Restart & Run All" raises NameError here.
# datax/datay are assumed to be the intended tables; confirm before relying on
# the saved files.
datax.to_csv("data/LR_LOO_x.csv")
datay.to_csv("data/LR_LOO_y.csv")

In [126]:
# Mean absolute error between the first and second columns of the per-OR mean
# tables (expected to be observed and predicted), rescaled per axis.
# NOTE(review): 130 (x) and 0.65 (y) are unexplained scale factors; confirm
# what units they convert to.
mae_x = mean_absolute_error(datax.iloc[:, 0], datax.iloc[:, 1]) * 130
mae_y = mean_absolute_error(datay.iloc[:, 0], datay.iloc[:, 1]) * 0.65
print("X MAE = {} \t Y MAE = {}".format(mae_x, mae_y))

X MAE = 202.31505870197458 	 Y MAE = 230.41276041725965
