In [1564]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import LinearRegression
import sys
import pickle
from sklearn.metrics import r2_score
from sklearn.linear_model import HuberRegressor

In [1565]:
# Ordinary least-squares regressor. This single instance is re-fit from scratch
# for every held-out OR in the leave-one-OR-out loop further down.
model = LinearRegression()

In [1566]:
# An OR must have at least this many cells to stay in the analysis.
min_cells = 7
# Seed handed to the oversampler so that resampling is repeatable.
random_state = 12

In [1567]:
# Slide-seq coordinate table: tab-separated with no header row, so the column
# names are supplied here.
coord_columns = ["OR", "X", "Y", "EXP"]
df = pd.read_csv("transformed_coords_1to2.txt", sep="\t", header=None, names=coord_columns)

In [1568]:
# Collapse to one row per OR: coordinates shared between the two experiments
# are averaged, and EXP is summed.
df = df.groupby("OR").agg(X=("X", "mean"), Y=("Y", "mean"), EXP=("EXP", "sum"))
# Quick-and-dirty way to remove the OR that falls off the slide without
# knowing its ID: only rows with a negative Y are kept.
on_slide = df["Y"] < 0
df = df.loc[on_slide]
# Alphabetically sorted OR names (the index after the groupby).
ORs = sorted(df.index)
print("{} ORs with Slide-seq coordinates".format(len(ORs)))
# Lookup table: OR -> {"X": ..., "Y": ...}
coords = df[["X", "Y"]].to_dict("index")
# Fitting the scaler records the min and max of the X and Y coordinates.
# NOTE(review): `scaler_y` is not used in the cells visible here.
scaler_y = MinMaxScaler().fit(df[["X", "Y"]].values)

77 ORs with Slide-seq coordinates


In [1569]:
# Load the second input table, using its first CSV column as the row index
# (the head() output below suggests these are cell barcodes — TODO confirm).
# NOTE: this rebinds `df`; the coordinate table loaded earlier is no longer
# reachable under that name (its contents were already extracted into `ORs`,
# `coords` and `scaler_y`).
df = pd.read_csv("OSN_all_50SPCs_2000G.csv", index_col=0)

In [1570]:
# Whichever of the two source names the label column carries, call it "OR";
# then discard the trailing column of the table.
df = df.rename(columns={"observed": "OR", "OR_identity": "OR"})
df = df.drop(columns=df.columns[-1])


In [1571]:
# Remove the columns at positions 40-49 (presumably SPC_41..SPC_50, leaving
# the first 40 components — TODO confirm against the CSV header).
# NOTE(review): this drop is positional, so re-running the cell on the
# already-trimmed frame removes further columns.
df = df.drop(columns=df.columns[40:50])
print(df.head())

                          SPC_1      SPC_2     SPC_3     SPC_4     SPC_5  \
V3F1_CAACGGCAGTAAACAC  2.204747   2.702510  1.306022 -5.855968 -0.499848   
V3F3_GAGACCCTCTGAGCAT  7.649368  -3.261961  1.164442 -4.717881 -3.080400   
V3F1_GAGGGATGTAGCCCTG  3.425824  10.987341  0.713193 -0.977275 -3.692072   
V3F3_ACGTAGTCAGGACAGT  7.682670   3.251003 -5.206459 -1.062002 -3.356139   
V3M1_ACCAAACCAAGTGACG -2.742237   3.922319 -2.571446  8.210359  5.703357   

                          SPC_6     SPC_7     SPC_8     SPC_9    SPC_10  ...  \
V3F1_CAACGGCAGTAAACAC -0.473495 -0.340626 -0.443413  0.688596  1.315484  ...   
V3F3_GAGACCCTCTGAGCAT -2.028103  0.584709  0.125866 -1.427942  0.957374  ...   
V3F1_GAGGGATGTAGCCCTG -3.463886 -2.170145  0.206280 -1.082710 -0.005097  ...   
V3F3_ACGTAGTCAGGACAGT  0.702868 -5.473109  2.582588 -2.513732  3.346870  ...   
V3M1_ACCAAACCAAGTGACG  7.302127 -3.774360 -3.498602 -2.846654  6.789442  ...   

                         SPC_32    SPC_33    SPC_34    SPC_35 

In [1572]:
# Keep only the rows whose OR has Slide-seq coordinates.
has_coords = df["OR"].isin(ORs)
df = df.loc[has_coords]

In [1573]:
# Count rows (cells) per OR once, rather than re-filtering the whole frame
# for every OR in the list.
cells_per_or = df["OR"].value_counts()
# ORs that have fewer than `min_cells` rows, including ORs absent from the
# table altogether (count defaults to 0).
ORs_to_exclude = [x for x in ORs if cells_per_or.get(x, 0) < min_cells]
print(ORs_to_exclude)
# Drop every row belonging to an excluded OR.
df = df[~df["OR"].isin(ORs_to_exclude)]

['Olfr1000', 'Olfr1031', 'Olfr1086', 'Olfr118', 'Olfr1357', 'Olfr1424', 'Olfr1477', 'Olfr1496', 'Olfr433', 'Olfr517', 'Olfr749', 'Olfr822', 'Olfr994']


In [1574]:
# NOTE(review): the message says "no gene expression", but `ORs_to_exclude`
# is built from ORs with fewer than `min_cells` rows — consider rewording.
print("{} ORs with Slide-seq coordinates but no gene expression".format(len(ORs_to_exclude)))
# Keep the ORs that still have at least `min_cells` rows in `df`; a single
# value_counts pass replaces one full-frame scan per OR.
cells_per_or = df["OR"].value_counts()
ORs = [x for x in ORs if cells_per_or.get(x, 0) >= min_cells]
print("{} ORs remaining".format(len(ORs)))

13 ORs with Slide-seq coordinates but no gene expression
64 ORs remaining


In [1575]:
# Leave-one-OR-out evaluation: for each OR, fit the model on the (oversampled)
# rows of every other OR and predict the (X, Y) coordinates for the held-out
# OR's rows. Results are collected in long format in `appended_data`.
n_col = len(df.columns)
# The last column holds the OR label; every column before it is a feature.
# Slicing before converting keeps the string label column out of the feature
# matrix, so it is not forced to object dtype.
X_all = df.iloc[:, :n_col - 1].to_numpy()
or_labels = df.iloc[:, n_col - 1].to_numpy()
appended_data = []
for OR in ORs:
    # Build the held-out mask once per OR instead of filtering the frame
    # separately for each of the four train/test arrays.
    is_held_out = (df["OR"] == OR).to_numpy()
    X_train, train_ors = X_all[~is_held_out], or_labels[~is_held_out]
    X_test, test_ors = X_all[is_held_out], or_labels[is_held_out]

    # Oversample the training rows, using OR identity as the class label.
    # Alternatives previously tried here:
    #   BorderlineSMOTE(random_state=random_state, k_neighbors=4)
    #   ADASYN(sampling_strategy='minority')
    oversample = BorderlineSMOTE(random_state=random_state)
    X_train, train_ors = oversample.fit_resample(X_train, train_ors)

    # Regression targets: the Slide-seq (X, Y) coordinates of each row's OR.
    y_train = np.array([[coords[o]["X"], coords[o]["Y"]] for o in train_ors])
    y_test = np.array([[coords[o]["X"], coords[o]["Y"]] for o in test_ors])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Two output rows per held-out cell, one per coordinate axis.
    for (obs_x, obs_y), (pred_x, pred_y) in zip(y_test, y_pred):
        appended_data.append(["x", OR, obs_x, pred_x])
        appended_data.append(["y", OR, obs_y, pred_y])

In [1576]:
# Turn the accumulated [axis, OR, observed, predicted] rows into a long-format
# DataFrame. NOTE: this rebinds `appended_data` from a list to a DataFrame.
appended_data = pd.DataFrame(appended_data, columns = ["axis","OR","observed","predicted"])

In [1577]:
# Split the long-format results into one frame per coordinate axis.
axis_label = appended_data["axis"]
datax = appended_data.loc[axis_label == "x"]
datay = appended_data.loc[axis_label == "y"]

In [1578]:
# R2 between observed and predicted coordinates over all individual rows,
# computed separately for each axis.
r2_x = r2_score(datax["observed"], datax["predicted"])
r2_y = r2_score(datay["observed"], datay["predicted"])
print("X R2 = {} \t Y R2 = {}".format(r2_x, r2_y))

X R2 = 0.5221460554177 	 Y R2 = 0.6278000297971973


In [1579]:
# Average the observed and predicted coordinates per OR.
# The numeric columns are selected explicitly: the frames also carry the
# string "axis" column, which pandas >= 2.0 refuses to average (TypeError)
# instead of silently dropping it as older releases did.
value_columns = ["observed", "predicted"]
datax2 = datax.groupby("OR")[value_columns].mean()
datay2 = datay.groupby("OR")[value_columns].mean()

In [1580]:
# R2 on the per-OR averages (one observed/predicted pair per OR and axis).
r2_x_mean = r2_score(datax2["observed"], datax2["predicted"])
r2_y_mean = r2_score(datay2["observed"], datay2["predicted"])
print("X R2 = {} \t Y R2 = {}".format(r2_x_mean, r2_y_mean))

X R2 = 0.6960795967404123 	 Y R2 = 0.830374525137265


In [1583]:
# Write the per-OR averages for each axis to CSV (the data/ directory must
# already exist).
for axis_means, out_path in ((datax2, "data/LR_LOO_x.csv"), (datay2, "data/LR_LOO_y.csv")):
    axis_means.to_csv(out_path)