In [2]:
import pandas as pd
import numpy as np
import seaborn as sns



In [9]:
de_train = pd.read_parquet("../data/de_train.parquet") # main dataset used throughout this notebook
# Display the loaded frame (pandas truncates the rendering of wide/long frames).
de_train

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


# Описание данных:
- Гены A1BG, A1BG-AS1, …, ZZEF1 (всего 18 211) — значение дифференциальной экспрессии (-log10(p-value) * sign(LFC)) для каждого гена. Здесь LFC — оценка логарифмического изменения экспрессии (log fold change) между обработанным и контрольным состоянием после сжатия (shrinkage), рассчитанная пакетом limma. Положительный LFC означает, что экспрессия гена повышается при обработке соединением по сравнению с контролем.

- cell_type- Аннотированный тип каждой клетки на основе экспрессии РНК.

- sm_name- Основное имя (родительского) соединения (в стандартизированном представлении), выбранное LINCS. Это предусмотрено для сопоставления данных этого эксперимента с данными карты связности LINCS.

- sm_lincs_id- Глобальный идентификатор LINCS ID (родительского) соединения (в стандартизированном представлении). Это предусмотрено для сопоставления данных этого эксперимента с данными карты связности LINCS.

- SMILES — запись соединений, использованных в эксперименте, в упрощённой системе линейного представления молекул (Simplified Molecular-Input Line-Entry System, SMILES). Это одномерное (строковое) представление молекулярной структуры. Эти SMILES предоставлены Cellarity на основе конкретных соединений, заказанных для этого эксперимента.

- control — логическое значение, указывающее, использовался ли этот образец в качестве контроля.

In [10]:
# Metadata columns only; all of the remaining 18211 columns of de_train describe genes.
more_info = de_train.loc[
    :, ["cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"]
]
more_info.describe()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control
count,614,614,614,614,614
unique,6,146,146,146,2
top,NK cells,R428,LSM-45574,Nc1nc(Nc2ccc3c(c2)CC[C@@H](N2CCCC2)CC3)nn1-c1c...,False
freq,146,6,6,6,602


In [11]:
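# Disabled missing-value check. NOTE: de_train[de_train.isna()].count().sum() is always 0, because the boolean mask blanks every non-missing cell before counting; use de_train.isna().sum().sum() to count NaNs.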

In [12]:
# Load the id / cell_type / sm_name table; its rows are the ones predicted later in this notebook.
id_map = pd.read_csv("../data/id_map.csv")


In [13]:
# Display the id_map frame loaded in the previous cell.
id_map

Unnamed: 0,id,cell_type,sm_name
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,1,B cells,ABT-199 (GDC-0199)
2,2,B cells,ABT737
3,3,B cells,AMD-070 (hydrochloride)
4,4,B cells,AT 7867
...,...,...,...
250,250,Myeloid cells,Vandetanib
251,251,Myeloid cells,Vanoxerine
252,252,Myeloid cells,Vardenafil
253,253,Myeloid cells,Vorinostat


In [15]:
# Columns from position 5 onward: skips the five leading metadata columns
# (cell_type, sm_name, sm_lincs_id, SMILES, control) and shows the gene columns.
de_train.iloc[:,5:]

Unnamed: 0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,0.104720,-0.077524,-1.625596,-0.144545,0.143555,0.073229,-0.016823,0.101717,-0.005153,1.043629,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,0.915953,-0.884380,0.371834,-0.081677,-0.498266,0.203559,0.604656,0.498592,-0.317184,0.375550,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,-0.387721,-0.305378,0.567777,0.303895,-0.022653,-0.480681,0.467144,-0.293205,-0.005098,0.214918,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,0.232893,0.129029,0.336897,0.486946,0.767661,0.718590,-0.162145,0.157206,-3.654218,-0.212402,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,4.290652,-0.063864,-0.017443,-0.541154,0.570982,2.022829,0.600011,1.231275,0.236739,0.338703,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,-0.544709,0.282458,-0.431359,-0.364961,0.043123,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,-0.455549,0.188181,0.595734,-0.100299,0.786192,0.090954,0.169523,0.428297,0.106553,0.435088,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,0.338168,-0.109079,0.270182,-0.436586,-0.069476,-0.061539,0.002818,-0.027167,-0.383696,0.226289,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,-0.706087,-0.620919,-1.485381,0.059303,-0.032584,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [33]:
# One-hot encode the two categorical inputs, separately for the training rows
# (de_train) and for the rows to be predicted (id_map).
X = pd.get_dummies(de_train[["cell_type", "sm_name"]], columns=["cell_type", "sm_name"])
X_test = pd.get_dummies(id_map[["cell_type", "sm_name"]], columns=["cell_type", "sm_name"])

# Keep only the dummy columns present in BOTH frames, in one shared order.
# Pruning X alone is not enough: a category that occurs only in id_map would
# leave an extra column in X_test, and the model fitted on X would then be
# given a different feature set at predict time.
shared_columns = [col for col in X.columns if col in X_test.columns]
X = X[shared_columns]
X_test = X_test[shared_columns]
X.shape

(614, 131)

In [46]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import r2_score


# Regression targets: de_train without its five metadata columns.
y = de_train.drop(
    columns=["cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"]
)
y



Unnamed: 0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,0.104720,-0.077524,-1.625596,-0.144545,0.143555,0.073229,-0.016823,0.101717,-0.005153,1.043629,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,0.915953,-0.884380,0.371834,-0.081677,-0.498266,0.203559,0.604656,0.498592,-0.317184,0.375550,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,-0.387721,-0.305378,0.567777,0.303895,-0.022653,-0.480681,0.467144,-0.293205,-0.005098,0.214918,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,0.232893,0.129029,0.336897,0.486946,0.767661,0.718590,-0.162145,0.157206,-3.654218,-0.212402,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,4.290652,-0.063864,-0.017443,-0.541154,0.570982,2.022829,0.600011,1.231275,0.236739,0.338703,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,-0.544709,0.282458,-0.431359,-0.364961,0.043123,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,-0.455549,0.188181,0.595734,-0.100299,0.786192,0.090954,0.169523,0.428297,0.106553,0.435088,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,0.338168,-0.109079,0.270182,-0.436586,-0.069476,-0.061539,0.002818,-0.027167,-0.383696,0.226289,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,-0.706087,-0.620919,-1.485381,0.059303,-0.032584,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


In [47]:
# Fit one LinearSVR per target column of y (MultiOutputRegressor wraps a
# single-output estimator and fits it once per target).
# random_state is pinned so that repeated runs of the notebook give the same
# fitted model; LinearSVR uses pseudo-random numbers (see its `random_state`
# parameter), so leaving it unset makes the submission non-reproducible.
model = MultiOutputRegressor(
    estimator=LinearSVR(max_iter=1000, epsilon=0.1, random_state=42)
)
model.fit(X, y)

MultiOutputRegressor(estimator=LinearSVR(epsilon=0.1))

In [48]:
# Predict every target column for the id_map rows; X_test has to carry the
# same feature columns as the X that the model was fitted on.
y_pred = model.predict(X_test)


In [55]:
# Wrap the raw predictions in a frame whose columns are the target (gene)
# columns of y. The row labels are taken from id_map["id"] rather than from a
# positional 0..n-1 index, so each prediction stays attached to the id of the
# id_map row it was computed for even if those ids are not a plain range.
y_pred = pd.DataFrame(y_pred, columns=y.columns, index=id_map["id"].to_numpy())
y_pred.index.name = 'id'
y_pred

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.355189,0.220469,0.088446,0.544013,0.553126,0.249853,-0.011560,0.260391,0.091142,0.215530,...,-0.603900,-0.258775,-0.210734,0.848886,0.376037,0.371943,0.250302,0.572740,-0.349686,0.618835
1,0.185722,0.014152,0.078208,0.250060,0.549358,0.234991,-0.011560,0.147287,0.724308,0.135988,...,0.073854,0.069064,-0.094287,0.131207,0.274178,-0.139858,0.048281,-0.039772,-0.331083,0.040287
2,0.417098,0.220469,0.013850,0.314780,0.733105,0.847751,0.183300,0.253705,0.154690,0.238303,...,-0.107717,-0.121787,-0.044427,0.620088,0.175750,0.153427,0.199606,0.065535,-0.243128,-0.066197
3,0.094097,0.174741,0.058587,0.364719,0.445544,0.221968,-0.245106,0.260391,0.348565,0.349800,...,-0.150880,0.037996,0.062875,0.347832,0.089398,0.153427,0.179235,0.222085,-0.207473,0.224725
4,0.414870,-0.030957,0.144085,0.364719,0.477374,0.243302,-0.323718,0.260391,0.293310,0.366617,...,-0.647122,-0.042693,-0.054920,0.123561,0.114638,0.238210,-0.047380,0.512520,-0.163860,0.054209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.407330,-0.093537,-0.160269,-0.390564,1.245345,0.242089,0.186852,0.137276,0.356397,0.733185,...,0.097091,-0.010932,-0.235728,-0.043733,0.312129,-0.002624,-0.082250,-0.153025,-0.408372,0.040547
251,0.530511,-0.378273,0.135925,0.131040,1.591201,0.820110,-0.030905,-0.044211,0.165144,0.324692,...,0.030805,-0.010932,0.156236,0.074423,0.505206,-0.059609,-0.307180,-0.056433,-0.455711,-0.142204
252,0.452210,0.574761,-0.080612,0.009547,1.378137,0.396256,0.078540,-0.403259,0.460308,0.580828,...,0.087118,-0.095732,0.856929,0.077832,0.505206,-0.396341,-0.045857,-0.153025,-0.000431,-0.394028
253,0.148125,-0.114624,0.119461,0.105054,1.031677,0.755604,0.432216,0.466481,1.165594,0.936327,...,0.133203,-0.010932,0.077157,-0.106241,0.104198,0.048701,-0.351337,0.116828,-0.336613,0.254230


In [56]:
# Write the predictions to submission.csv in the working directory; the index
# (named 'id') is written as the first column.
y_pred.to_csv("submission.csv")

In [57]:
# Display the predictions frame once more for a visual check.
y_pred

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.355189,0.220469,0.088446,0.544013,0.553126,0.249853,-0.011560,0.260391,0.091142,0.215530,...,-0.603900,-0.258775,-0.210734,0.848886,0.376037,0.371943,0.250302,0.572740,-0.349686,0.618835
1,0.185722,0.014152,0.078208,0.250060,0.549358,0.234991,-0.011560,0.147287,0.724308,0.135988,...,0.073854,0.069064,-0.094287,0.131207,0.274178,-0.139858,0.048281,-0.039772,-0.331083,0.040287
2,0.417098,0.220469,0.013850,0.314780,0.733105,0.847751,0.183300,0.253705,0.154690,0.238303,...,-0.107717,-0.121787,-0.044427,0.620088,0.175750,0.153427,0.199606,0.065535,-0.243128,-0.066197
3,0.094097,0.174741,0.058587,0.364719,0.445544,0.221968,-0.245106,0.260391,0.348565,0.349800,...,-0.150880,0.037996,0.062875,0.347832,0.089398,0.153427,0.179235,0.222085,-0.207473,0.224725
4,0.414870,-0.030957,0.144085,0.364719,0.477374,0.243302,-0.323718,0.260391,0.293310,0.366617,...,-0.647122,-0.042693,-0.054920,0.123561,0.114638,0.238210,-0.047380,0.512520,-0.163860,0.054209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.407330,-0.093537,-0.160269,-0.390564,1.245345,0.242089,0.186852,0.137276,0.356397,0.733185,...,0.097091,-0.010932,-0.235728,-0.043733,0.312129,-0.002624,-0.082250,-0.153025,-0.408372,0.040547
251,0.530511,-0.378273,0.135925,0.131040,1.591201,0.820110,-0.030905,-0.044211,0.165144,0.324692,...,0.030805,-0.010932,0.156236,0.074423,0.505206,-0.059609,-0.307180,-0.056433,-0.455711,-0.142204
252,0.452210,0.574761,-0.080612,0.009547,1.378137,0.396256,0.078540,-0.403259,0.460308,0.580828,...,0.087118,-0.095732,0.856929,0.077832,0.505206,-0.396341,-0.045857,-0.153025,-0.000431,-0.394028
253,0.148125,-0.114624,0.119461,0.105054,1.031677,0.755604,0.432216,0.466481,1.165594,0.936327,...,0.133203,-0.010932,0.077157,-0.106241,0.104198,0.048701,-0.351337,0.116828,-0.336613,0.254230
