In [1]:
#!pip install scispacy
import numpy as np
import re
import time
import spacy
import gensim
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
adverse_effects = pd.read_csv('data/concepts/side_effects.txt', sep='\t')
adverse_effects['id'] = range(len(adverse_effects))

old_CUIs = pd.read_csv('data/concepts_2/concepts.tsv', sep='\t')
adverse_effects

Unnamed: 0,id,Text
0,0,"extreme weight gain, short-term memory loss, h..."
1,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,2,Just TWO tablets of Lexapro 10mg completely de...
3,3,It's called PSSD: post-SSRI sexual dysfunction.
4,4,And there is a chance that it will give you PS...
...,...,...
2145,2145,Exercising and dieting don't seem to get the w...
2146,2146,"Stomach problems early on: bloating, nausea, c..."
2147,2147,No side effects now accept for yawning.
2148,2148,The only side effects I experienced were mild ...


In [35]:
old_CUIs = old_CUIs.drop(columns='SNOMED_CODE')
old_CUIs.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,"Excessive body weight gain,Excessive weight gain"
1,C0701811,"Poor short-term memory,Poor short-term memory"
2,C0002170,"Alopecia,Loss of hair"
3,C0549622,"Sexual Dysfunction,Sexual disorder"
4,C0027497,"Nausea,Nausea"


In [36]:
# Функция для разделения строк и создания нового DataFrame
def split_concept(row):
    concepts = row['CONCEPT'].split(',')
    return pd.DataFrame({
        'CUI': [row['CUI']] * len(concepts),
        'CONCEPT': concepts
    })

In [37]:
CUIs = pd.concat([split_concept(row) for _, row in old_CUIs.iterrows()], ignore_index=True)
CUIs = CUIs.drop_duplicates()
CUIs.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,Excessive body weight gain
1,C0000765,Excessive weight gain
2,C0701811,Poor short-term memory
4,C0002170,Alopecia
5,C0002170,Loss of hair


In [38]:
CUIs.shape

(730, 2)

In [39]:
# скачать, если не установлено
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz

In [49]:
%%time
# Загрузка модели для английского языка
nlp = spacy.load("en_core_sci_lg")
spacy_stop_words = nlp.Defaults.stop_words

CPU times: total: 8.59 s
Wall time: 9.07 s


In [50]:
# Предобработка текста
custom_stop_words = ['noc', 'nos', '[d]', 'unknown_unit', '|', 'see comment', 'due', 'nec', 'unspecified', '[v]', '(see comments)']
stop_words = spacy_stop_words.union(set(custom_stop_words))

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r"\s+", " ", text.encode('ascii', 'ignore').decode())
        text = re.sub("[,.!;?)%(\'\":-]", '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if word not in stop_words])
    else:
        text = ''
    return text

In [51]:
# Применение предварительной обработки
adverse_effects['adv_name_processed'] = adverse_effects['Text'].apply(preprocess_text)
CUIs['concept_name_processed'] = CUIs['CONCEPT'].apply(preprocess_text)

In [52]:
adverse_effects['adv_name_processed'].head()

0    extreme weight gain shortterm memory loss hair...
1            completely destroyed sexually functioning
2    tablets lexapro 10mg completely destroyed sexu...
3              called pssd postssri sexual dysfunction
4    chance pssd suggests persists stop taking drug...
Name: adv_name_processed, dtype: object

In [53]:
CUIs['concept_name_processed'].head()

0    excessive body weight gain
1         excessive weight gain
2         poor shortterm memory
4                      alopecia
5                     loss hair
Name: concept_name_processed, dtype: object

In [54]:
# Токенизация
def tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    return tokens

In [55]:
%%time

adverse_effects['adv_name_processed'] = adverse_effects['adv_name_processed'].apply(tokenize)
CUIs['concept_name_processed'] = CUIs['concept_name_processed'].apply(tokenize)

CPU times: total: 15.2 s
Wall time: 15.3 s


In [56]:
# Лематизация
def lemmatize(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return lemmatized_tokens

In [57]:
%%time

adverse_effects['adv_name_processed_tokens'] = adverse_effects['adv_name_processed'].apply(lambda x: lemmatize(' '.join(x)))
CUIs['concept_name_processed_tokens'] = CUIs['concept_name_processed'].apply(lambda x: lemmatize(' '.join(x)))

CPU times: total: 14.2 s
Wall time: 14.4 s


In [58]:
adverse_effects.head(10)

Unnamed: 0,id,Text,adv_name_processed,adv_name_processed_tokens
0,0,"extreme weight gain, short-term memory loss, h...","[extreme, weight, gain, shortterm, memory, los...","[extreme, weight, gain, shortterm, memory, los..."
1,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,"[completely, destroyed, sexually, functioning]","[completely, destroy, sexually, function]"
2,2,Just TWO tablets of Lexapro 10mg completely de...,"[tablets, lexapro, 10, mg, completely, destroy...","[tablet, lexapro, 10, mg, completely, destroy,..."
3,3,It's called PSSD: post-SSRI sexual dysfunction.,"[called, pssd, postssri, sexual, dysfunction]","[call, pssd, postssri, sexual, dysfunction]"
4,4,And there is a chance that it will give you PS...,"[chance, pssd, suggests, persists, stop, takin...","[chance, pssd, suggest, persist, stop, take, d..."
5,5,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","[nausea, blurred, vision, 3, 5, hours, sleep, ...","[nausea, blurred, vision, 3, 5, hour, sleep, s..."
6,6,"I was unable to sleep, had blurred vision, and...","[unable, sleep, blurred, vision, felt, sick, s...","[unable, sleep, blurred, vision, feel, sick, s..."
7,7,Unable to eat anything significant for the 3 d...,"[unable, eat, significant, 3, days]","[unable, eat, significant, 3, day]"
8,8,While driving to a friends house crazy thought...,"[driving, friends, house, crazy, thoughts, kep...","[drive, friend, house, crazy, thought, keep, h..."
9,9,Would not have been able to work (software dev...,"[able, work, software, developer, attempting, ...","[able, work, software, developer, attempt, dru..."


In [59]:
CUIs.head(10)

Unnamed: 0,CUI,CONCEPT,concept_name_processed,concept_name_processed_tokens
0,C0000765,Excessive body weight gain,"[excessive, body, weight, gain]","[excessive, body, weight, gain]"
1,C0000765,Excessive weight gain,"[excessive, weight, gain]","[excessive, weight, gain]"
2,C0701811,Poor short-term memory,"[poor, shortterm, memory]","[poor, shortterm, memory]"
4,C0002170,Alopecia,[alopecia],[alopecia]
5,C0002170,Loss of hair,"[loss, hair]","[loss, hair]"
6,C0549622,Sexual Dysfunction,"[sexual, dysfunction]","[sexual, dysfunction]"
7,C0549622,Sexual disorder,"[sexual, disorder]","[sexual, disorder]"
8,C0027497,Nausea,[nausea],[nausea]
10,C0344232,Blurred vision,"[blurred, vision]","[blurred, vision]"
11,C0344232,Blurring of visual image,"[blurring, visual, image]","[blur, visual, image]"


### Векторизация токенов

In [60]:
combined_texts = pd.concat([adverse_effects['adv_name_processed'], CUIs['concept_name_processed']])
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
vectorizer.fit(combined_texts)
adv_effects_matrix = vectorizer.transform(adverse_effects['adv_name_processed'])
CUIs_matrix = vectorizer.transform(CUIs['concept_name_processed'])

In [61]:
# Поиск концептов с использованием косинусного сходства
similarities = cosine_similarity(adv_effects_matrix, CUIs_matrix)

In [62]:
# Получение наиболее похожих концептов для каждого побочного эффекта
adverse_effects['most_similar_concept_index'] = similarities.argmax(axis=1)
adverse_effects['similarity_score'] = similarities.max(axis=1)

In [63]:
# Получение текста наиболее похожих концептов
def get_concept_text(index):
    return CUIs.iloc[index]['CONCEPT']

In [64]:
adverse_effects['most_similar_concept_text'] = adverse_effects['most_similar_concept_index'].apply(get_concept_text)

In [65]:
# Добавление текста концептов в DataFrame
adverse_effects = adverse_effects.merge(CUIs[['CUI', 'CONCEPT']], left_on='most_similar_concept_index', right_index=True, suffixes=('', '_concept'))

# # Переименование столбцов для удобства
# adverse_effects = adverse_effects.rename(columns={'CONCEPT': 'most_similar_concept_text'})

In [66]:
# Вывод результатов
adverse_effects[['Text', 'adv_name_processed_tokens', 'most_similar_concept_text', 'similarity_score']].head(10)

Unnamed: 0,Text,adv_name_processed_tokens,most_similar_concept_text,similarity_score
0,"extreme weight gain, short-term memory loss, h...","[extreme, weight, gain, shortterm, memory, los...",Loss of hair,0.612739
1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,"[completely, destroy, sexually, function]",Cannot be aroused sexually,0.36211
2,Just TWO tablets of Lexapro 10mg completely de...,"[tablet, lexapro, 10, mg, completely, destroy,...",Sexual disorder,0.143115
3,It's called PSSD: post-SSRI sexual dysfunction.,"[call, pssd, postssri, sexual, dysfunction]",Sexual Dysfunction,0.495548
5,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","[nausea, blurred, vision, 3, 5, hour, sleep, s...",Blurred vision,0.53316
6,"I was unable to sleep, had blurred vision, and...","[unable, sleep, blurred, vision, feel, sick, s...",Blurred vision,0.591695
7,Unable to eat anything significant for the 3 d...,"[unable, eat, significant, 3, day]",Unable to move,0.414744
9,Would not have been able to work (software dev...,"[able, work, software, developer, attempt, dru...",Problems at work,0.382048
10,"First 10 days were HORRIBLE, like a looong pan...","[10, day, horrible, like, looong, panic, attac...",Panic attack,0.496849
13,the last three weeks I haven't been able to wa...,"[week, have, not, able, watch, tv, make, feel,...",Dizziness,0.182931


### Предобученная модель

In [67]:
%%time

model_path = 'data/BioWordVec_PubMed_MIMICIII_d200.vec.bin'  # 12.5G
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

CPU times: total: 1min 53s
Wall time: 2min 33s


In [68]:
# Функция для векторизации текста с использованием BioWordVec
def vectorize_text_with_gensim_embeddings(tokens, model):
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)  # Усреднение векторов
    else:
        return np.zeros(model.vector_size)  # Возвращает нулевой вектор, если нет известных слов

In [69]:
# Применение функции к медицинским терминам в обоих наборах данных
adverse_effects['adv_eff_vector'] = adverse_effects['adv_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))
CUIs['CUI_vector'] = CUIs['concept_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))

# Удаление строк с отсутствующими векторами
adverse_effects = adverse_effects[adverse_effects['adv_eff_vector'].apply(lambda x: x is not None)]
CUIs = CUIs[CUIs['CUI_vector'].apply(lambda x: x is not None)]

In [70]:
# Применение функции к медицинским терминам в обоих наборах данных
adverse_effects['adv_eff_vector'] = adverse_effects['adv_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))
CUIs['CUI_vector'] = CUIs['concept_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))

# Удаление строк с отсутствующими векторами
adverse_effects = adverse_effects[adverse_effects['adv_eff_vector'].apply(lambda x: x is not None)]
CUIs = CUIs[CUIs['CUI_vector'].apply(lambda x: x is not None)]

In [71]:
v1 = np.vstack(adverse_effects['adv_eff_vector'])
v2 = np.vstack(CUIs['CUI_vector'])

In [74]:
# Вычисление косинусного сходства
similarity = cosine_similarity(v1, v2)

# Получение топ N похожих концептов
num_top_matches = 2
tops = (-similarity).argsort(axis=1)[:, :num_top_matches]

# Создание таблицы с результатами
results = []
for i in range(len(adverse_effects)):
    for t in tops[i]:
        results.append({
            'Text': adverse_effects.iloc[i]['Text'],
            'Most Similar Concept': CUIs.iloc[t]['CONCEPT'],
            'Similarity': similarity[i][t]
        })

# Создание DataFrame из списка результатов
joined_table = pd.DataFrame(results)

# Удаление дублирующихся строк
joined_table = joined_table.drop_duplicates().reset_index(drop=True)

joined_table[10:20]

Unnamed: 0,Text,Most Similar Concept,Similarity
10,"I was unable to sleep, had blurred vision, and...",Feels dreams are real,0.823651
11,"I was unable to sleep, had blurred vision, and...",Sleep Talking,0.815879
12,Unable to eat anything significant for the 3 d...,Unable to think clearly,0.780996
13,Unable to eat anything significant for the 3 d...,Unable to drink,0.775648
14,Would not have been able to work (software dev...,Unable to perform creative activity,0.826414
15,Would not have been able to work (software dev...,Stopped work,0.825809
16,"First 10 days were HORRIBLE, like a looong pan...",Anxiety attack,0.82543
17,"First 10 days were HORRIBLE, like a looong pan...",Panic attack,0.814723
18,the last three weeks I haven't been able to wa...,Unable to think clearly,0.805907
19,the last three weeks I haven't been able to wa...,Feels dreams are real,0.76818
