In [1]:
import ast
import csv
import json
import re
from difflib import get_close_matches

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm


# Training rows (the cells below read their 'feature_num' and 'annotation'
# columns) and the feature table; every column is loaded as a string.
train = pd.read_csv('data/train.csv', dtype=str)
features = pd.read_csv('data/features.csv', dtype=str)
# Patient notes (the cells below read their 'pn_history' column).
patient_notes = pd.read_csv('data/patient_notes.csv', dtype=str)



In [2]:
def strtypeArrToArr(strtypeArr):
    """Parse a stringified list of strings into stripped, upper-cased items.

    Args:
        strtypeArr: Text holding a Python list literal of strings,
            e.g. "['felt dizzy', ' weakness ']".

    Returns:
        list[str]: The items of the list, each stripped and upper-cased.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    # literal_eval only accepts literals, so text coming from a CSV file can
    # never execute code the way eval() could. Parsing happens before the
    # upper-casing so escape sequences inside the strings are not mangled.
    items = ast.literal_eval(strtypeArr)
    return [item.strip().upper() for item in items]

def get_words_only(text):
    """Return the upper-cased alphabetic words found in ``text``.

    Every run of characters other than a-z / A-Z acts as a separator, so
    digits and punctuation never appear in the result.
    """
    collapsed = ' '.join(text.split())
    letters_only = re.sub('[^a-zA-Z]+', ' ', collapsed.upper())
    unquoted = letters_only.strip('"').strip("'")
    return unquoted.split()

def get_words_only_str(text):
    """Return the words of ``text`` (as split by get_words_only) joined by single spaces."""
    words = get_words_only(text)
    return ' '.join(words)

def get_words_from_notes(patient_notes, train, save_file=True, words_file='data/words_list.txt'):
    """Build the sorted vocabulary of words used in the notes and annotations.

    Args:
        patient_notes: DataFrame with a 'pn_history' text column.
        train: DataFrame with an 'annotation' text column.
        save_file: When true, also write the vocabulary to ``words_file``,
            one word per line.
        words_file: Destination path used when ``save_file`` is true.

    Returns:
        list[str]: The unique words (as produced by get_words_only), sorted.
    """
    vocabulary = set()
    # Iterate over the column values instead of looking rows up with
    # ``column[i]``: that lookup is label-based and raises KeyError as soon as
    # the frame no longer carries a default 0..n-1 index (e.g. after filtering).
    for column in (patient_notes['pn_history'], train['annotation']):
        for text in column:
            # Missing cells are read as NaN (a float) and carry no words.
            if isinstance(text, str):
                vocabulary.update(get_words_only(text))

    words_list = sorted(vocabulary)

    if save_file:
        with open(words_file, 'w') as f:
            f.writelines("%s\n" % word for word in words_list)

    return words_list
        
def encode_to_arr(text, words_list, return_long=True):
    """Encode ``text`` as a bag of words over the vocabulary ``words_list``.

    Args:
        text: Raw text; it is split into words with get_words_only.
        words_list: Vocabulary; position i of the encoding refers to
            words_list[i].
        return_long: If true, return the full 0/1 numpy vector; otherwise
            return the list of vocabulary positions that are set.

    Returns:
        A numpy array with len(words_list) entries of 0/1, or a list of the
        positions holding 1.
    """
    flags = [0] * len(words_list)
    for token in get_words_only(text):
        try:
            position = words_list.index(token)
        except ValueError:
            # Token is not in the vocabulary: it contributes nothing.
            continue
        flags[position] = 1

    vector = np.array(flags)
    if return_long:
        return vector
    (hit_positions,) = np.where(vector == 1)
    return hit_positions.tolist()

def encode_labels(label, labels_list, return_long=True, normalize=False):
    """Encode ``label`` by its position in ``labels_list``.

    Args:
        label: The label to encode; must be an element of ``labels_list``.
        labels_list: Ordered list of all known labels.
        return_long: When ``normalize`` is false, selects between a one-hot
            numpy vector (true) and a one-element list ``[index]`` (false).
        normalize: If true, return ``[index / len(labels_list)]``, i.e. the
            label position scaled into [0, 1).

    Returns:
        A one-hot numpy array, ``[index]``, or ``[index / len(labels_list)]``.

    Raises:
        ValueError: If ``label`` is not in ``labels_list``.
    """
    index = labels_list.index(label)

    if normalize:
        # Always scale the label's position. With return_long=True the first
        # element of the one-hot vector used to be scaled instead, which said
        # nothing about the label beyond "is it the first one".
        return [index / len(labels_list)]

    if not return_long:
        return [index]

    r = [0] * len(labels_list)
    r[index] = 1
    return np.array(r)

def save_test_labels_to_file(label_text_pairs, file_name='test_anotations_labels.jsons', size=1000):
    """Write a random sample of (label, text) pairs as JSON lines.

    Args:
        label_text_pairs: Sequence whose items hold the label at position 0
            and the text at position 1.
        file_name: Output path; one JSON object is written per line.
        size: Number of lines to write. Positions are drawn independently
            with np.random.randint, so the same pair may appear more than once.
    """
    rNums = np.random.randint(len(label_text_pairs), size=size)
    with open(file_name, 'w') as f:
        for i in tqdm(rNums):
            label, text = label_text_pairs[i][0], label_text_pairs[i][1]
            # json.dumps adds the surrounding quotes and escapes quotes,
            # backslashes and control characters, so every line is valid JSON
            # whatever the text contains. Values are stringified first, as the
            # previous "%s" formatting did.
            f.write('{ "label": %s, "text": %s }\n' % (json.dumps(str(label)), json.dumps(str(text))))

def save_labels_to_file(label_text_pairs, file_name='anotations_labels.jsons'):
    """Write every (label, text) pair as one JSON object per line.

    Args:
        label_text_pairs: Iterable of two-item (label, text) entries.
        file_name: Output path.
    """
    with open(file_name, 'w') as f:
        for label, text in tqdm(label_text_pairs):
            # json.dumps adds the surrounding quotes and escapes quotes,
            # backslashes and control characters, so every line is valid JSON
            # whatever the text contains. Values are stringified first, as the
            # previous "%s" formatting did.
            f.write('{ "label": %s, "text": %s }\n' % (json.dumps(str(label)), json.dumps(str(text))))

# Start with an empty list of annotation snippets for every known feature number.
feature_to_notes = {str(feature_num): [] for feature_num in features['feature_num']}

# File each training row's annotations under the feature it was labelled with.
for feature_num, annotation in zip(train['feature_num'], train['annotation']):
    feature_to_notes[str(feature_num)].extend(strtypeArrToArr(annotation))

# Flatten the mapping into [label, cleaned text] pairs; cleaning turns e.g.
# 'felt dizzy, last meal was "2" days ago' into 'FELT DIZZY LAST MEAL WAS DAYS AGO'.
label_text_pairs = [
    [key, get_words_only_str(note)]
    for key, notes in feature_to_notes.items()
    for note in notes
]

labels_list = list(feature_to_notes)
words_list = get_words_from_notes(patient_notes, train, save_file=False)
# NOTE(review): not read anywhere in the visible cells; looks like a
# misspelling of feature_to_notes - confirm before removing.
features_to_notes = {}

# Smoke-test the encoder: three words taken from the vocabulary itself, then a
# free-text phrase in both the long (0/1 vector) and short (index list) forms.
known_words = ' '.join([words_list[1], words_list[2], words_list[-1]])
sample_phrase = 'felt as if he going to pass out'
print(encode_to_arr(known_words, words_list))
print(encode_to_arr(sample_phrase, words_list))
print(encode_to_arr(sample_phrase, words_list, False))

[0 1 1 ... 0 0 1]
[0 0 0 ... 0 0 0]
[3285, 14547, 15929, 16824, 18895, 28154, 29012, 39042]


In [3]:
# x_trainA = [encode_to_arr(label_text[1], words_list )   for label_text in tqdm(label_text_pairs)]
# y_trainA = [encode_labels(label_text[0], labels_list)   for label_text in tqdm(label_text_pairs)]

# label_text_pairs = [label, text]

In [4]:

# Show the normalized encoding of one sample pair's label as text.
sample_label = label_text_pairs[12][0]
normalized_label = encode_labels(sample_label, labels_list, return_long=False, normalize=True)
str(normalized_label[0])

'0.0'

In [5]:
from happytransformer import HappyTextClassification
# The classification head needs one output per label. The library default is
# num_labels=2, which only accepts the label ids 0 and 1, while write_to_csv
# emits ids in range(len(labels_list)).
happy_tc_roberta = HappyTextClassification("ROBERTA", "roberta-base", num_labels=len(labels_list))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:

# cases= [("Wow I love using BERT for text classification", 0), ("I hate NLP", 1)]
def write_to_csv(cases, filename="train_classifier.csv"):
    """Write (label, text) cases to a CSV file with 'text' and 'label' columns.

    The label column holds the position of each case's label in the
    module-level ``labels_list``. Returns ``filename`` so the call can be
    passed straight to a consumer of the file.
    """
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["text", "label"])
        rows = (
            [case[1], encode_labels(case[0], labels_list, return_long=False)[0]]
            for case in cases
        )
        writer.writerows(rows)
    return filename

# Export the pairs to CSV, then train the classifier from that file.
train_csv_path = write_to_csv(cases=label_text_pairs)
happy_tc_roberta.train(train_csv_path)

# comments

Replacing every label with an integer (only 0 and 1 were tried) at least lets the training start without any errors.

Reference used:

https://www.vennify.ai/train-text-classification-transformers/


**Label smoothing?**


https://towardsdatascience.com/what-is-label-smoothing-108debd7ef06

https://pyimagesearch.com/2019/12/30/label-smoothing-with-keras-tensorflow-and-deep-learning/
