In [296]:
with open('data/concepts/side_effects.txt') as file:
    documents = file.readlines()
    
    print(*documents[:5], sep='\n')

id	Text

1	extreme weight gain, short-term memory loss, hair loss.

2	COMPLETELY DESTROYED SEXUALLY FUNCTIONING .

3	Just TWO tablets of Lexapro 10mg completely destroyed my sexual functioning, probably for life.

4	It's called PSSD: post-SSRI sexual dysfunction.


In [297]:
cuinames = []
concepts = []

with open('data/concepts/concepts.tsv') as cuifile:
    print(next(cuifile))

    for i, line in enumerate(cuifile):
        fields = line.split('\t')
        cui = fields[0].strip()
        texts = set(f.strip().lower() for f in fields[1].split(','))
        for t in texts:
            cuinames.append(cui)
            concepts.append(t.strip())

CUI	CONCEPT	SNOMED_CODE


In [298]:
print(len(concepts), len(cuinames))
for i, item in enumerate(concepts):
    print(cuinames[i], item)

705 705
C0000765 excessive body weight gain
C0000765 excessive weight gain
C0701811 poor short-term memory
C0002170 loss of hair
C0002170 alopecia
C0549622 sexual dysfunction
C0549622 sexual disorder
C0027497 nausea
C0344232 blurring of visual image
C0344232 blurred vision
C0037316 sleep deprivation
C0424000 feeling suicidal
C0424565 cannot sleep at all
C1971624 loss of appetite
C0558066 intrusive thoughts
C0557386 restricted work performance
C0086769 panic attacks
C0086769 panic attack
C0003467 anxiety
C0231403 severe anxiety
C0233754 derealization
C0917801 sleeplessness
C0917801 insomnia
C0012833 dizziness
C0010201 chronic cough
C0043094 weight gain
C2981158 lack of libido
C0583237 emergency room admission
C0039231 tachycardia
C0522365 severe vertigo
C0424573 always sleepy
C0560765 unable to maintain a position
C0042571 vertigo
C0043352 xerostomia
C0425083 loss of job
C0751908 epidemic vertigo
C0751908 vestibular neuronitis
C0037763 spasm
C0231530 muscle twitch
C0231528 muscle pain
C

### Level 1

In [299]:
import nltk
from string import punctuation as punct
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from IPython.display import clear_output

porter_stemmer = PorterStemmer()

def text_preprocessing_1(sentence):
    s = sentence.translate(sentence.maketrans(punct, ' '*len(punct)))
    
    toks = s.split()
    
    lowered = [w.lower() for w in toks]
    stemmed = [porter_stemmer.stem(w) for w in lowered]
    
    return stemmed

In [300]:
concepts_1 = [text_preprocessing_1(concept) for concept in concepts]
sentences_1 = [text_preprocessing_1(sentence) for sentence in documents]

In [301]:
def check_concept_1(c_lst, s_lst):
    it = iter(s_lst)
    return all(word in it for word in c_lst)

In [302]:
def check_all_concepts(concepts, sentences, checker):
    """Check all concepts against all sentences."""
    
    results = []

    for i, concept in enumerate(concepts):
        clear_output()
        print(f'Step: {i+1}/{len(concepts)}')
        concept_results = [(ind, cuinames[i], 1) for ind, sentence in enumerate(sentences) if checker(concept, sentence)]
        results.append(concept_results)
        
    final = sorted([item for sublist in results for item in sublist], key=lambda x: x[0])
    
    return final

In [303]:
 level_1 = list(set(check_all_concepts(concepts_1, sentences_1, check_concept_1)))

Step: 705/705


In [304]:
len(level_1)

2651

In [305]:
level_1[:10]

[(106, 'C0018681', 1),
 (1177, 'C0019080', 1),
 (892, 'C0028084', 1),
 (1564, 'C0015672', 1),
 (1335, 'C0038990', 1),
 (1041, 'C0557875', 1),
 (1432, 'C0240327В', 1),
 (1864, 'C0231303', 1),
 (1570, 'C0917801', 1),
 (1855, 'C0028084', 1)]

In [306]:
# пример уровеня 1
concept1_1 = 'Blurred vision' # то же CUI как 'Blurring of visual image'
sentence1_1 = 'I was unable to sleep, had blurred vision, and felt sick to my stomach.'

check_concept_1(text_preprocessing_1(concept1_1), text_preprocessing_1(sentence1_1))

True

### Level 2

In [307]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grayni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [308]:
def text_preprocessing_2(sentence):
    s = sentence.translate(str.maketrans(punct, ' '*len(punct)))
    
    toks = s.split()
    lowered = [w.lower() for w in toks]
    filtered = [w for w in lowered if w not in stop_words]
    stemmed = [porter_stemmer.stem(w) for w in filtered]
    return stemmed

In [309]:
def check_concept_2(c_lst, s_lst):
    return all(word in s_lst for word in c_lst)

In [310]:
concepts_2 = [text_preprocessing_2(concept) for concept in concepts]
sentences_2 = [text_preprocessing_2(sentence) for sentence in documents]
concepts_2

[['excess', 'bodi', 'weight', 'gain'],
 ['excess', 'weight', 'gain'],
 ['poor', 'short', 'term', 'memori'],
 ['loss', 'hair'],
 ['alopecia'],
 ['sexual', 'dysfunct'],
 ['sexual', 'disord'],
 ['nausea'],
 ['blur', 'visual', 'imag'],
 ['blur', 'vision'],
 ['sleep', 'depriv'],
 ['feel', 'suicid'],
 ['cannot', 'sleep'],
 ['loss', 'appetit'],
 ['intrus', 'thought'],
 ['restrict', 'work', 'perform'],
 ['panic', 'attack'],
 ['panic', 'attack'],
 ['anxieti'],
 ['sever', 'anxieti'],
 ['dereal'],
 ['sleepless'],
 ['insomnia'],
 ['dizzi'],
 ['chronic', 'cough'],
 ['weight', 'gain'],
 ['lack', 'libido'],
 ['emerg', 'room', 'admiss'],
 ['tachycardia'],
 ['sever', 'vertigo'],
 ['alway', 'sleepi'],
 ['unabl', 'maintain', 'posit'],
 ['vertigo'],
 ['xerostomia'],
 ['loss', 'job'],
 ['epidem', 'vertigo'],
 ['vestibular', 'neuron'],
 ['spasm'],
 ['muscl', 'twitch'],
 ['muscl', 'pain'],
 ['myalgia'],
 ['confusion', 'state'],
 ['confus'],
 ['flush'],
 ['face', 'goe', 'red'],
 ['tension', 'headach'],
 ['ten

In [311]:
level_2 = list(set(check_all_concepts(concepts_2, sentences_2, check_concept_2)))

Step: 705/705


In [312]:
len(level_2)

2941

In [313]:
level_2[:20]

[(575, 'C0232462', 1),
 (106, 'C0018681', 1),
 (1177, 'C0019080', 1),
 (892, 'C0028084', 1),
 (1564, 'C0015672', 1),
 (2067, 'C0011124', 1),
 (1335, 'C0038990', 1),
 (1041, 'C0557875', 1),
 (1398, 'C0232462', 1),
 (1432, 'C0240327В', 1),
 (1864, 'C0231303', 1),
 (1570, 'C0917801', 1),
 (1855, 'C0028084', 1),
 (1377, 'C0028643', 1),
 (1466, 'C0028081', 1),
 (1312, 'C4075722', 1),
 (386, 'C0557875', 1),
 (1732, 'C0015672', 1),
 (973, 'C2129214', 1),
 (1827, 'C0009676', 1)]

In [314]:
for i, item in enumerate(concepts):
    if cuinames[i] == 'C0549622':
        print(concepts[i])
        
concept2_1 = 'Sexual Dysfunction' # C0549622
concept2_2 = 'Sexual disorder' # C0549622
sentence2_2 = "It's called PSSD: post-SSRI sexual dysfunction."

check_con_2_1 = text_preprocessing_2(concept2_1)
check_con_2_2 = text_preprocessing_2(concept2_2)
check_sen_2 = text_preprocessing_2(sentence2_2)

print(f'\n{sentence2_2}')
print(f'{check_con_2_1}:', check_concept_2(check_con_2_1, check_sen_2))
print(f'{check_con_2_2}:', check_concept_2(check_con_2_2, check_sen_2))

sexual dysfunction
sexual disorder

It's called PSSD: post-SSRI sexual dysfunction.
['sexual', 'dysfunct']: True
['sexual', 'disord']: False


### Level 3

In [315]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz
import string
from nltk import download

download('wordnet')
download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grayni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\grayni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [316]:
# Проверка по синонимам
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

In [317]:
def text_preprocessing_3(text):
    """Предварительная обработка текста: удаление пунктуации и приведение к нижнему регистру."""
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text.lower())
    return tokens

In [318]:
def find_best_synonym_match(word, sentence_words):
    """Находит лучшее совпадение слова среди синонимов в предложении."""
    best_score = 0
    best_match = None
    synonyms = get_synonyms(word)
    for sentence_word in sentence_words:
        for synonym in synonyms:
            score = fuzz.ratio(synonym, sentence_word)
            if score > best_score:
                best_score = score
                best_match = sentence_word
    return best_match, best_score

In [319]:
def check_concept_3(concept, sentence, threshold=80):
    """Проверка наличия концепта в предложении с учетом синонимов и сходства."""
    for word in concept:
        best_match, best_score = find_best_synonym_match(word, sentence)
        if best_score < threshold:
            return False  # Если хотя бы одно слово концепта не найдено, возвращаем False
    return True  # Все слова концепта найдены

In [320]:
def check_all_concepts(concepts, sentences, checker):
    """Проверка всех концептов для всех предложений."""
    
    results = []

    for i, concept in enumerate(concepts):
        clear_output(wait=True)
        print(f'Step: {i+1}/{len(concepts)}')
        concept_results = [(ind, cuinames[i], 1) for ind, sentence in enumerate(sentences) if checker(concept, sentence)]
        results.extend(concept_results)
        
    final = sorted(results, key=lambda x: x[0])
    return final

In [321]:
level_3 = list(set(check_all_concepts(concepts_2, sentences_2, check_concept_3)))

Step: 705/705


In [322]:
len(level_3)

7165

In [323]:
level_3[:10]

[(1108, 'C0344232', 1),
 (2124, 'C0038999', 1),
 (885, 'C0220870', 1),
 (198, 'C0424092', 1),
 (1403, 'C0522165', 1),
 (624, 'C0030318', 1),
 (1432, 'C0240327В', 1),
 (1570, 'C0917801', 1),
 (253, 'C0392674', 1),
 (1827, 'C0009676', 1)]

### Join results

In [324]:
def merge_results(level_1, level_2, level_3):
    combined_dict = {}

    # Функция для добавления данных в словарь
    def add_to_dict(data, level_index):
        for (sentence_id, concept_id, _) in data:
            if (sentence_id, concept_id) not in combined_dict:
                combined_dict[(sentence_id, concept_id)] = [0, 0, 0]
            combined_dict[(sentence_id, concept_id)][level_index] = 1

    # Добавляем данные из каждого уровня
    add_to_dict(level_1, 0)
    add_to_dict(level_2, 1)
    add_to_dict(level_3, 2)

    # Преобразуем словарь в нужный формат
    combined_results = [(sentence_id, concept_id, *levels) for (sentence_id, concept_id), levels in combined_dict.items()]

    # Сортировка по id предложения и концепта
    combined_results.sort(key=lambda x: (x[0], x[1]))

    return combined_results

# Объединение результатов
combined_results = merge_results(level_1, level_2, level_3)

# Вывод результатов
for result in combined_results:
    print(result)

(1, 'C0002170', 0, 1, 1)
(1, 'C0043094', 1, 1, 1)
(1, 'C0521008', 0, 0, 1)
(1, 'C1262477', 1, 1, 1)
(2, 'C0036104', 1, 1, 1)
(3, 'C0036104', 1, 1, 1)
(4, 'C0424092', 0, 0, 1)
(4, 'C0521008', 0, 0, 1)
(4, 'C0549622', 1, 1, 0)
(5, 'C0015967', 0, 0, 1)
(5, 'C0016382', 0, 0, 1)
(5, 'C0019112', 0, 0, 1)
(5, 'C0233660', 0, 0, 1)
(5, 'C0424092', 0, 0, 1)
(6, 'C0027497', 1, 1, 1)
(6, 'C0028643', 0, 0, 1)
(6, 'C0038661', 1, 1, 0)
(6, 'C0042963', 0, 0, 1)
(6, 'C0085633', 0, 0, 1)
(6, 'C0344232', 1, 1, 1)
(6, 'C0438696', 1, 1, 0)
(7, 'C0028643', 0, 0, 1)
(7, 'C0042963', 0, 0, 1)
(7, 'C0344232', 1, 1, 1)
(8, 'C0038999', 0, 0, 1)
(8, 'C0392674', 0, 0, 1)
(9, 'C0011206', 0, 0, 1)
(10, 'C0001241', 0, 0, 1)
(10, 'C0036104', 0, 0, 1)
(11, 'C0003467', 1, 1, 0)
(11, 'C0027497', 1, 1, 1)
(11, 'C0030318', 1, 1, 1)
(11, 'C0042963', 0, 0, 1)
(11, 'C0086769', 1, 1, 1)
(11, 'C0700031', 0, 1, 0)
(12, 'C0003467', 1, 1, 0)
(12, 'C0027497', 1, 1, 1)
(12, 'C0028643', 0, 0, 1)
(12, 'C0036104', 0, 0, 1)
(12, 'C004296

### Data save

In [326]:
import csv

def save_to_csv(data, filename):
    """Сохраняет данные в CSV-файл с разделителем запятая."""
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Записываем заголовки
        writer.writerow(['Sentence ID', 'Concept ID', 'Level 1', 'Level 2', 'Level 3'])
        # Записываем данные
        writer.writerows(data)

In [327]:
save_to_csv(combined_results, 'Karelin_3.csv')