In [1]:
#!pip install scispacy
import numpy as np
import re
import time
import spacy
import gensim
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [115]:
adverse_effects = pd.read_csv('data/concepts/side_effects.txt', sep='\t')
adverse_effects['id'] = range(len(adverse_effects))

old_CUIs = pd.read_csv('data/concepts_2/concepts.tsv', sep='\t')
adverse_effects

Unnamed: 0,id,Text
0,0,"extreme weight gain, short-term memory loss, h..."
1,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .
2,2,Just TWO tablets of Lexapro 10mg completely de...
3,3,It's called PSSD: post-SSRI sexual dysfunction.
4,4,And there is a chance that it will give you PS...
...,...,...
2145,2145,Exercising and dieting don't seem to get the w...
2146,2146,"Stomach problems early on: bloating, nausea, c..."
2147,2147,No side effects now accept for yawning.
2148,2148,The only side effects I experienced were mild ...


In [116]:
old_CUIs = old_CUIs.drop(columns='SNOMED_CODE')
old_CUIs.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,"Excessive body weight gain,Excessive weight gain"
1,C0701811,"Poor short-term memory,Poor short-term memory"
2,C0002170,"Alopecia,Loss of hair"
3,C0549622,"Sexual Dysfunction,Sexual disorder"
4,C0027497,"Nausea,Nausea"


In [117]:
# Функция для разделения строк и создания нового DataFrame
def split_concept(row):
    concepts = row['CONCEPT'].split(',')
    return pd.DataFrame({
        'CUI': [row['CUI']] * len(concepts),
        'CONCEPT': concepts
    })

In [118]:
CUIs = pd.concat([split_concept(row) for _, row in old_CUIs.iterrows()], ignore_index=True)
CUIs = CUIs.drop_duplicates()
CUIs.head()

Unnamed: 0,CUI,CONCEPT
0,C0000765,Excessive body weight gain
1,C0000765,Excessive weight gain
2,C0701811,Poor short-term memory
4,C0002170,Alopecia
5,C0002170,Loss of hair


In [119]:
CUIs.shape

(730, 2)

In [120]:
# скачать, если не установлено
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz

In [121]:
%%time
# Загрузка модели для английского языка
nlp = spacy.load("en_core_sci_lg")
spacy_stop_words = nlp.Defaults.stop_words

CPU times: total: 10.5 s
Wall time: 12.3 s


In [122]:
# Предобработка текста
custom_stop_words = ['noc', 'nos', '[d]', 'unknown_unit', '|', 'see comment', 'due', 'nec', 'unspecified', '[v]', '(see comments)']
stop_words = spacy_stop_words.union(set(custom_stop_words))

def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r"\s+", " ", text.encode('ascii', 'ignore').decode())
        text = re.sub("[,.!;?)%(\'\":-]", '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if word not in stop_words])
    else:
        text = ''
    return text

In [123]:
# Применение предварительной обработки
adverse_effects['adv_name_processed'] = adverse_effects['Text'].apply(preprocess_text)
CUIs['concept_name_processed'] = CUIs['CONCEPT'].apply(preprocess_text)

In [124]:
adverse_effects['adv_name_processed'].head()

0    extreme weight gain shortterm memory loss hair...
1            completely destroyed sexually functioning
2    tablets lexapro 10mg completely destroyed sexu...
3              called pssd postssri sexual dysfunction
4    chance pssd suggests persists stop taking drug...
Name: adv_name_processed, dtype: object

In [125]:
CUIs['concept_name_processed'].head()

0    excessive body weight gain
1         excessive weight gain
2         poor shortterm memory
4                      alopecia
5                     loss hair
Name: concept_name_processed, dtype: object

In [126]:
# Токенизация
def tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    return tokens

In [127]:
%%time

adverse_effects['adv_name_processed'] = adverse_effects['adv_name_processed'].apply(tokenize)
CUIs['concept_name_processed'] = CUIs['concept_name_processed'].apply(tokenize)

CPU times: total: 15.8 s
Wall time: 16.2 s


In [128]:
# Лематизация
def lemmatize(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return lemmatized_tokens

In [129]:
%%time

adverse_effects['adv_name_processed_tokens'] = adverse_effects['adv_name_processed'].apply(lambda x: lemmatize(' '.join(x)))
CUIs['concept_name_processed_tokens'] = CUIs['concept_name_processed'].apply(lambda x: lemmatize(' '.join(x)))

CPU times: total: 14.4 s
Wall time: 14.4 s


In [130]:
adverse_effects.head(10)

Unnamed: 0,id,Text,adv_name_processed,adv_name_processed_tokens
0,0,"extreme weight gain, short-term memory loss, h...","[extreme, weight, gain, shortterm, memory, los...","[extreme, weight, gain, shortterm, memory, los..."
1,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,"[completely, destroyed, sexually, functioning]","[completely, destroy, sexually, function]"
2,2,Just TWO tablets of Lexapro 10mg completely de...,"[tablets, lexapro, 10, mg, completely, destroy...","[tablet, lexapro, 10, mg, completely, destroy,..."
3,3,It's called PSSD: post-SSRI sexual dysfunction.,"[called, pssd, postssri, sexual, dysfunction]","[call, pssd, postssri, sexual, dysfunction]"
4,4,And there is a chance that it will give you PS...,"[chance, pssd, suggests, persists, stop, takin...","[chance, pssd, suggest, persist, stop, take, d..."
5,5,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...","[nausea, blurred, vision, 3, 5, hours, sleep, ...","[nausea, blurred, vision, 3, 5, hour, sleep, s..."
6,6,"I was unable to sleep, had blurred vision, and...","[unable, sleep, blurred, vision, felt, sick, s...","[unable, sleep, blurred, vision, feel, sick, s..."
7,7,Unable to eat anything significant for the 3 d...,"[unable, eat, significant, 3, days]","[unable, eat, significant, 3, day]"
8,8,While driving to a friends house crazy thought...,"[driving, friends, house, crazy, thoughts, kep...","[drive, friend, house, crazy, thought, keep, h..."
9,9,Would not have been able to work (software dev...,"[able, work, software, developer, attempting, ...","[able, work, software, developer, attempt, dru..."


In [59]:
CUIs.head(10)

Unnamed: 0,CUI,CONCEPT,concept_name_processed,concept_name_processed_tokens
0,C0000765,Excessive body weight gain,"[excessive, body, weight, gain]","[excessive, body, weight, gain]"
1,C0000765,Excessive weight gain,"[excessive, weight, gain]","[excessive, weight, gain]"
2,C0701811,Poor short-term memory,"[poor, shortterm, memory]","[poor, shortterm, memory]"
4,C0002170,Alopecia,[alopecia],[alopecia]
5,C0002170,Loss of hair,"[loss, hair]","[loss, hair]"
6,C0549622,Sexual Dysfunction,"[sexual, dysfunction]","[sexual, dysfunction]"
7,C0549622,Sexual disorder,"[sexual, disorder]","[sexual, disorder]"
8,C0027497,Nausea,[nausea],[nausea]
10,C0344232,Blurred vision,"[blurred, vision]","[blurred, vision]"
11,C0344232,Blurring of visual image,"[blurring, visual, image]","[blur, visual, image]"


### Векторизация токенов

In [131]:
combined_texts = pd.concat([adverse_effects['adv_name_processed'], CUIs['concept_name_processed']])
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
vectorizer.fit(combined_texts)
adv_effects_matrix = vectorizer.transform(adverse_effects['adv_name_processed'])
CUIs_matrix = vectorizer.transform(CUIs['concept_name_processed'])

In [132]:
# Поиск концептов с использованием косинусного сходства
similarities = cosine_similarity(adv_effects_matrix, CUIs_matrix)

In [133]:
# Получение наиболее похожих концептов для каждого побочного эффекта
adverse_effects['most_similar_concept_index'] = similarities.argmax(axis=1)
adverse_effects['Similarity'] = similarities.max(axis=1)

In [134]:
# Получение текста наиболее похожих концептов
def get_concept_text(index):
    return CUIs.iloc[index]['CONCEPT']

# Получение CUI
def get_concept_cui(index):
    return CUIs.iloc[index]['CUI']

In [135]:
adverse_effects['most_similar_concept_text'] = adverse_effects['most_similar_concept_index'].apply(get_concept_text)
adverse_effects['concept_cui'] = adverse_effects['most_similar_concept_index'].apply(get_concept_cui)

In [136]:
# Добавление текста концептов в DataFrame
adverse_effects = adverse_effects.merge(CUIs[['CUI', 'CONCEPT']], left_on='most_similar_concept_index', right_index=True, suffixes=('', '_concept'))

In [146]:
adverse_effects['level_4'] = (adverse_effects['Similarity'] > 0.4).astype(int)

In [147]:
# Вывод результатов
adverse_effects[['id', 'Text', 'most_similar_concept_text', 'Similarity', 'concept_cui', 'level_4']].head()

Unnamed: 0,id,Text,most_similar_concept_text,Similarity,concept_cui,level_4
0,0,"extreme weight gain, short-term memory loss, h...",Loss of hair,0.612739,C0002170,1
1,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,Cannot be aroused sexually,0.36211,C0425736,0
2,2,Just TWO tablets of Lexapro 10mg completely de...,Sexual disorder,0.143115,C0549622,0
3,3,It's called PSSD: post-SSRI sexual dysfunction.,Sexual Dysfunction,0.495548,C0549622,1
5,5,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...",Blurred vision,0.53316,C0344232,1


### Предобученная модель

In [67]:
%%time

model_path = 'data/BioWordVec_PubMed_MIMICIII_d200.vec.bin'  # 12.5G
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

CPU times: total: 1min 53s
Wall time: 2min 33s


In [76]:
# Функция для векторизации текста с использованием BioWordVec
def vectorize_text_with_gensim_embeddings(tokens, model):
    vectors = [model[token] for token in tokens if token in model]
    if vectors:
        return np.mean(vectors, axis=0)  # Усреднение векторов
    else:
        return np.zeros(model.vector_size)  # Возвращает нулевой вектор, если нет известных слов

In [77]:
# Применение функции к медицинским терминам в обоих наборах данных
adverse_effects['adv_eff_vector'] = adverse_effects['adv_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))
CUIs['CUI_vector'] = CUIs['concept_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))

# Удаление строк с отсутствующими векторами
adverse_effects = adverse_effects[adverse_effects['adv_eff_vector'].apply(lambda x: x is not None)]
CUIs = CUIs[CUIs['CUI_vector'].apply(lambda x: x is not None)]

In [78]:
# Применение функции к медицинским терминам в обоих наборах данных
adverse_effects['adv_eff_vector'] = adverse_effects['adv_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))
CUIs['CUI_vector'] = CUIs['concept_name_processed'].apply(lambda x: vectorize_text_with_gensim_embeddings(x, word2vec_model))

# Удаление строк с отсутствующими векторами
adverse_effects = adverse_effects[adverse_effects['adv_eff_vector'].apply(lambda x: x is not None)]
CUIs = CUIs[CUIs['CUI_vector'].apply(lambda x: x is not None)]

In [79]:
v1 = np.vstack(adverse_effects['adv_eff_vector'])
v2 = np.vstack(CUIs['CUI_vector'])

In [109]:
# Вычисление косинусного сходства
similarity = cosine_similarity(v1, v2)

# Получение топ N похожих концептов
num_top_matches = 2
tops = (-similarity).argsort(axis=1)[:, :num_top_matches]

# Создание таблицы с результатами
results = []
for i in range(len(adverse_effects)):
    for t in tops[i]:
        results.append({
            'id': adverse_effects.iloc[i]['id'],
            'Text': adverse_effects.iloc[i]['Text'],
            'Most Similar Concept': CUIs.iloc[t]['CONCEPT'],
            'concept_cui': CUIs.iloc[t]['CUI'],
            'Similarity': similarity[i][t]
        })

# Создание DataFrame из списка результатов
joined_table = pd.DataFrame(results)
df_max_similarity = joined_table.loc[joined_table.groupby(['Text', 'id'])['Similarity'].idxmax()]
df_max_similarity['level_5'] = (df_max_similarity['Similarity'] > 0.5).astype(int)

# Удаление дублирующихся строк
df_max_similarity = df_max_similarity.drop_duplicates().reset_index(drop=True)

df_sorted_single = df_max_similarity.sort_values(by='id', ascending=True)
df_sorted_single 

Unnamed: 0,id,Text,Most Similar Concept,concept_cui,Similarity,level_5
1260,0,"extreme weight gain, short-term memory loss, h...",Excessive weight loss,C0586746,0.895789,1
130,1,COMPLETELY DESTROYED SEXUALLY FUNCTIONING .,Unable to control behavior,C0582140,0.720222,1
737,2,Just TWO tablets of Lexapro 10mg completely de...,Loss of capacity to feel emotions,C0456820,0.726751,1
724,3,It's called PSSD: post-SSRI sexual dysfunction.,Sexual Dysfunction,C0549622,0.860351,1
835,5,"Nausea, Blurred Vision, 3 to 5 hours sleep, Su...",Night Terrors,C0037320,0.809945,1
...,...,...,...,...,...,...
307,2144,Have experienced weight gain.,Weight gain,C0043094,0.940722,1
237,2145,Exercising and dieting don't seem to get the w...,Failure to lose weight,C0231247,0.813654,1
997,2146,"Stomach problems early on: bloating, nausea, c...",Abdominal bloating,C1291077,0.859831,1
867,2147,No side effects now accept for yawning.,Yawning,C0043387,0.779574,1


### Result

In [154]:
df1_filtered = adverse_effects[['id', 'concept_cui', 'level_4']]
df2_filtered = df_sorted_single[['id', 'concept_cui', 'level_5']]
result = pd.merge(df1_filtered, df2_filtered, on=['id', 'concept_cui'], how='outer')
result = result.fillna(0)
result['level_4'] = result['level_4'].astype(int)
result['level_5'] = result['level_5'].astype(int)

result= result.rename(columns={
    'id': 'Sentence ID',
    'concept_cui': 'Concept ID',
    'level_4': 'Level_4',
    'level_5': 'Level_5'
})

result.head(10)

Unnamed: 0,Sentence ID,Concept ID,Level_4,Level_5
0,0,C0002170,1,0
1,0,C0586746,0,1
2,1,C0425736,0,0
3,1,C0582140,0,1
4,2,C0456820,0,1
5,2,C0549622,0,0
6,3,C0549622,1,1
7,5,C0037320,0,1
8,5,C0344232,1,0
9,6,C0344232,1,0


### Data save

In [157]:
result.to_csv('Karelin_4.csv', index=False, sep=',', encoding='utf-8')