In [None]:
# Print the NVIDIA driver/GPU status of the current runtime.
!nvidia-smi

In [None]:
import os
# Presumably switches off Weights & Biases logging for libraries that read this
# variable (TODO confirm); it is set before those libraries are imported below.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Mount Google Drive at /content/drive so the dataset and label files
# referenced by the path cells below can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder that holds the CSV files to load; change it to your own dataset location.
# Keep the trailing slash if file paths are built from it by string concatenation.
ROOT_PATH = "/content/drive/MyDrive/colab notebook/data_exp/german_med_termss/"
# ROOT_PATH= "updated_german_med_terms/"

In [None]:
# Install all libraries needed by this notebook (versions are not pinned).
# NOTE(review): `transformers` is installed twice (here and on the next-but-one line).
!pip install transformers
!pip install pytorch-lightning
!pip install transformers datasets --quiet
!pip install -U imbalanced-learn
!pip install scikit-multilearn

# spaCy plus the two German pipelines; only de_core_news_lg is loaded by the
# preprocessing class defined further down.
!pip install spacy
!pip install spacy-transformers
!python3 -m spacy download de_dep_news_trf
!python3 -m spacy download de_core_news_lg

# !python -m spacy download de_dep_news_trf

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification, BertTokenizer, BertForMultipleChoice
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import ast
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
import datasets
from datasets import Dataset

In [None]:
# Collect every CSV file under ROOT_PATH and stack them into a single frame.
# The directory is listed once and sorted so that the row order (and with it any
# seeded shuffling/splitting done afterwards) does not depend on the order in
# which the filesystem happens to return the entries.
files = sorted(os.listdir(ROOT_PATH))
filepaths = [os.path.join(ROOT_PATH, f) for f in files if f.endswith('.csv')]
if not filepaths:
    raise FileNotFoundError(f"No .csv files found in {ROOT_PATH!r}")

# ignore_index=True gives every row a unique label. Without it each file starts
# again at 0, so any label-based selection or drop would hit several rows at once.
df = pd.concat(map(pd.read_csv, filepaths), ignore_index=True)
df.shape

In [None]:
def tx(x):
  """Parse a stringified list of labels into normalised label strings.

  :param x: cell value, normally the text of a Python list literal such as
            "['Kardiologie', 'Neurologie ']"; an actual list/tuple/set is
            accepted as well.
  :return: list of stripped, lower-cased strings, or None when the cell is
           missing (NaN/None/other non-text) or blank, so that callers can
           filter such rows with ``pd.notna``.
  :raises ValueError, SyntaxError: if the text is not a valid Python literal.
  """
  if isinstance(x, (list, tuple, set)):
    values = x
  elif isinstance(x, str) and x.strip():
    values = ast.literal_eval(x)
  else:
    # NaN (a float), None or an empty string: there is nothing to parse.
    return None

  return [str(v).strip().lower() for v in values]


# Parse the raw 'expertise' cells into label lists and keep only the rows for
# which a parsed value is present.
df['labels_array'] = df['expertise'].apply(tx)
df = df.loc[df['labels_array'].notna()]
df.shape

In [None]:
def get_categories_with_index(labels_classfified: dict):
    """Group the terms of the category sheet under their main category.

    :param labels_classfified: ``DataFrame.to_dict()`` of the sheet, i.e.
        ``{column name: {row key: cell}}``. The column ``'Unnamed: 1'`` holds the
        main category names (text cells); every column from the third one on is
        scanned for terms belonging to those categories.
    :return: list of dicts, one per main category in sheet order, with keys
        ``category`` (lower-cased name), ``start``/``end`` (exclusive row bounds
        used by ``found``) and ``values`` (the lower-cased category name followed
        by the lower-cased terms found for it). Empty list if the sheet has no
        main category.
    """
    main = labels_classfified['Unnamed: 1']

    # Header rows are the rows whose main-column cell is text; empty cells are NaN.
    # Using the row key itself (instead of searching the name again) keeps repeated
    # category names apart and matches the keys that `found` compares against.
    headers = [(row, name.lower()) for row, name in main.items() if type(name) == str]
    if not headers:
        return []

    # One past the last row, so the final category also collects its terms.
    sheet_end = max(main.keys()) + 1

    categories_with_index = []
    for position, (row, name) in enumerate(headers):
        if position + 1 < len(headers):
            # NOTE(review): the row directly above the next header is excluded
            # (presumably a blank separator row in the sheet) - confirm.
            end = headers[position + 1][0] - 1
        else:
            end = sheet_end
        categories_with_index.append(
            {"category": name, "start": row, "end": end, "values": [name]}
        )

    columns = list(labels_classfified.keys())[2:]
    for v in columns:
        for ci in categories_with_index:
            ci['values'] += found(labels_classfified[v], ci)

    return categories_with_index

def found(dict_under, ci):
    """Return the distinct lower-cased text cells of one column that lie
    strictly between ``ci['start']`` and ``ci['end']``.

    :param dict_under: ``{row key: cell}`` mapping of a single sheet column.
    :param ci: category dict providing the ``start`` and ``end`` row bounds.
    :return: list of unique strings (order not defined).
    """
    matches = {
        str(cell).lower()
        for row, cell in dict_under.items()
        if ci['start'] < row < ci['end'] and type(cell) == str
    }
    return list(matches)

# Spreadsheet that lists the main categories and their terms. The path is absolute
# (Drive is mounted at /content/drive) so the cell no longer depends on the
# current working directory being /content.
LABELS_XLSX_PATH = '/content/drive/MyDrive/colab notebook/Praxisprojekt_MMC.xlsx'
# LABELS_XLSX_PATH = 'Praxisprojekt_MMC.xlsx'  # when the sheet sits next to the notebook
categories_with_index = get_categories_with_index(pd.read_excel(LABELS_XLSX_PATH).to_dict())
target_names = [c['category'] for c in categories_with_index]
print(target_names)

In [None]:
def change_label_to_main_category(x, categories=None):
    """Map raw labels to the main categories they belong to.

    :param x: iterable of (lower-cased) label strings of one row.
    :param categories: list of category dicts with the keys ``category`` and
        ``values``; defaults to the notebook-level ``categories_with_index``.
    :return: alphabetically sorted list of the distinct main categories whose
        ``values`` contain at least one of the labels. Sorting makes equal label
        combinations compare equal as lists, which counting and stratifying on
        the combination rely on. Labels that match no category are ignored.
    """
    if categories is None:
        categories = categories_with_index

    main_categories = {
        ci['category']
        for value_from_x in x
        for ci in categories
        if value_from_x in ci['values']
    }
    return sorted(main_categories)

# Replace every row's raw labels by the main categories they fall under.
df['labels_array'] = df['labels_array'].apply(change_label_to_main_category)

In [None]:
# Show how often each label combination occurs after mapping to main categories.
print(df['labels_array'].value_counts())

In [None]:

def get_number_of_sample_per_label(df, number_of_sample_per_label):
    """Collect the ``labels_array`` values that occur at most
    ``number_of_sample_per_label`` times in ``df``.

    Prints how many such values were found and returns them in the order
    produced by ``value_counts()``.
    """
    occurrences = df['labels_array'].value_counts()
    rare = [
        label
        for label, count in occurrences.items()
        if count <= number_of_sample_per_label
    ]
    print("count -> ", len(rare))
    return rare

# Label combinations that occur only once cannot be split with stratification,
# so the rows carrying them are removed.
found_labels = get_number_of_sample_per_label(df, 1)
print(found_labels)

if len(found_labels) > 0:
    # Select the rows with a boolean mask instead of collecting index labels and
    # dropping by label: a label-based drop removes every row that shares a label,
    # which deletes unrelated rows whenever the index contains duplicates.
    is_rare = df['labels_array'].apply(
        lambda labels: any(labels == rare for rare in found_labels)
    ).astype(bool)

    ri = df.index[is_rare].tolist()
    print("ri", ri)
    df = df[~is_rare]

In [None]:
# Label-combination counts again, to check the effect of the preceding cell.
print(df['labels_array'].value_counts())

In [None]:
# Hold out 20% of the rows for validation with a fixed seed; the split is
# stratified on the label combination of each row.
df_train, df_val = train_test_split(
    df,
    stratify=df['labels_array'],
    test_size=0.2,
    random_state=42,
)
print(df_train.shape, df_val.shape)

In [None]:
def explode_doc(df):
    """Return one row per available text of every input row.

    The texts of the columns ``all_page`` and ``wiki_content`` are put into a new
    column ``doc`` (converted with ``str``), one row per text; all other columns
    are repeated. Missing texts - which ``str`` turns into ``'nan'`` - are dropped.
    The index of the source row is kept, so it repeats in the result.

    :param df: frame with the columns ``all_page`` and ``wiki_content``;
               it is left unmodified.
    :return: new DataFrame with the additional column ``doc``.
    """
    doc_pairs = [
        [str(page), str(wiki)]
        for page, wiki in zip(df['all_page'], df['wiki_content'])
    ]

    # Work on a copy so the caller's frame does not silently gain a 'doc' column.
    exploded = df.copy()
    exploded['doc'] = pd.Series(doc_pairs, index=exploded.index, dtype=object)
    exploded = exploded.explode("doc")

    # str(NaN) is 'nan': turn it back into a real missing value and drop the row.
    exploded['doc'] = exploded['doc'].replace('nan', np.nan)
    return exploded.dropna(subset=['doc'])

def explode_labels_array(df):
    """Return one row per single label and plot the label distribution.

    ``labels_array`` is exploded and renamed to ``label``; rows whose label is
    empty or missing are removed. As a side effect a seaborn count plot of the
    labels is drawn.

    :param df: frame with a list-valued column ``labels_array``.
    :return: new DataFrame with the column ``label`` instead of ``labels_array``.
    """
    df = df.explode("labels_array").rename(columns={'labels_array': 'label'})

    # Assign the result back instead of calling replace(..., inplace=True) on the
    # selected column: the in-place call on a column selection is deprecated in
    # pandas and does not modify the frame once Copy-on-Write is active.
    df['label'] = df['label'].replace('', np.nan)
    df = df.dropna(subset=['label'])

    sns.set(rc={'figure.figsize':(11.7,8.27)})
    sns.countplot(data=df, y="label", orient="v")

    return df

In [None]:
import re
from typing import List

import spacy
from spacy.tokens import Doc
from tqdm import tqdm


class SpacyPreprocessor:
    """Clean and tokenize text with a spaCy pipeline.

    The public ``preprocess_*`` methods return *lists of token strings* (not joined
    text), so ``preprocess_text`` can be used directly as a tokenizer callable.
    """

    def __init__(
        self,
        spacy_model=None,
        remove_numbers=False,
        remove_special=True,
        pos_to_remove=None,
        remove_stopwords=False,
        lemmatize=False,
        remove_duplicates=False,
        extra_abbreviations=None
    ):
        """
        Preprocesses text using spaCy
        :param spacy_model: Loaded spaCy pipeline; "de_core_news_lg" is loaded when omitted
        :param remove_numbers: Whether to remove number-like and currency tokens
        :param remove_special: Whether to replace every character other than
            a-z, A-Z, 0-9 and German umlauts/ß inside a token by a space
        :param pos_to_remove: list of PoS tags to remove
        :param remove_stopwords: Whether to remove stopwords from text
        :param lemmatize:  Whether to apply lemmatization
        :param remove_duplicates: Stored only; de-duplication is currently not applied
        :param extra_abbreviations: Substrings that are deleted from the raw text
            before it is passed to spaCy
        """

        self._remove_numbers = remove_numbers
        self._pos_to_remove = pos_to_remove
        self._remove_stopwords = remove_stopwords
        self._remove_special = remove_special
        self._lemmatize = lemmatize
        self.remove_duplicates = remove_duplicates
        # Copy into a fresh list: a shared mutable default would leak entries
        # from one instance into every other instance.
        self.extra_abbreviations = list(extra_abbreviations) if extra_abbreviations else []

        if not spacy_model:
            self.model = spacy.load("de_core_news_lg")
        else:
            self.model = spacy_model

    @staticmethod
    def download_spacy_model(model="de_core_news_lg"):
        """Download the named spaCy model via ``spacy.cli.download``."""
        print(f"Downloading spaCy model {model}")
        spacy.cli.download(model)
        print("Finished downloading model")

    @staticmethod
    def load_model(model="de_core_news_lg"):
        """Load the named spaCy model with the "ner" and "parser" components disabled."""
        return spacy.load(model, disable=["ner", "parser"])

    def tokenize(self, text) -> List[str]:
        """
        Tokenize text using a spaCy pipeline
        :param text: Text to tokenize
        :return: list of str
        """
        doc = self.model(text)
        return [token.text for token in doc]

    def _strip_abbreviations(self, text):
        """Delete every configured extra abbreviation from the raw text."""
        for abbreviation in self.extra_abbreviations:
            text = text.replace(abbreviation, "")
        return text

    def preprocess_text(self, text) -> List[str]:
        """
        Runs a spaCy pipeline and removes unwanted parts from text
        :param text: text string to clean
        :return: list of str, the cleaned tokens
        """
        # The previous check was inverted (it only ran for an empty list), so the
        # configured abbreviations were never removed.
        doc = self.model(self._strip_abbreviations(text))
        return self.__clean(doc)

    def preprocess_text_list(self, texts: List[str] = None) -> List[List[str]]:
        """
        Runs a spaCy pipeline and removes unwanted parts from a list of text.
        Leverages spaCy's `pipe` for faster batch processing.
        :param texts: List of texts to clean
        :return: List with one list of cleaned tokens per text (empty when
            no texts are given)
        """
        if texts is None:
            return []

        # Same abbreviation removal as in preprocess_text, applied lazily per text.
        stripped_texts = (self._strip_abbreviations(text) for text in texts)

        clean_texts = []
        for doc in tqdm(self.model.pipe(stripped_texts)):
            clean_texts.append(self.__clean(doc))

        return clean_texts

    def __clean(self, doc: "Doc") -> List[str]:
        """Apply the configured filters to the tokens of ``doc``.

        Order: PoS filter, numbers, stopwords, punctuation/space/quote/bracket
        tokens, empty tokens; then the token text (or lemma) is taken and, if
        enabled, special characters are replaced by spaces.
        :return: list of cleaned token strings
        """
        # POS Tags removal
        if self._pos_to_remove:
            tokens = [token for token in doc if token.pos_ not in self._pos_to_remove]
        else:
            tokens = doc

        # Remove Numbers
        if self._remove_numbers:
            tokens = [
                token for token in tokens if not (token.like_num or token.is_currency)
            ]

        # Remove Stopwords
        if self._remove_stopwords:
            tokens = [token for token in tokens if not token.is_stop]

        # remove unwanted tokens
        tokens = [
            token
            for token in tokens
            if not (
                token.is_punct or token.is_space or token.is_quote or token.is_bracket
            )
        ]

        # Remove empty tokens
        tokens = [token for token in tokens if token.text.strip() != ""]

        # Lemmatize
        if self._lemmatize:
            words = [token.lemma_ for token in tokens]
        else:
            words = [token.text for token in tokens]

        if self._remove_special:
            # A special character inside a token becomes a space, so one token may
            # contain blanks afterwards; tokens that end up blank are dropped.
            words = [re.sub(r"[^a-zA-Z0-9äöüÄÖÜß]", " ", str(word)) for word in words]
            words = [word for word in words if len(word.strip()) > 0 ]

        return words


# One shared pipeline (loaded without "ner"/"parser") and the preprocessor used
# as tokenizer: lemmas, stopwords removed, special characters stripped, numbers kept.
spacy_model = SpacyPreprocessor.load_model()
preprocessor = SpacyPreprocessor(
    spacy_model=spacy_model,
    remove_numbers=False,
    remove_special=True,
    remove_stopwords=True,
    lemmatize=True,
)

In [None]:
# print("---------------------------LABELS ARRAYS--------------------------------")
# print(df_train['labels_array'].value_counts())
# print("-----------------------------------------------------------------")
# print(df_val['labels_array'].value_counts())


# Expand both partitions to one row per text (done after the split, so texts of
# the same source row never end up in different partitions).
df_train, df_val = (explode_doc(part) for part in (df_train, df_val))

print(df_train.shape, df_val.shape, sep="\n")

# print("---------------------------LABELS--------------------------------")
# print(df_train['label'].value_counts())
# print("-----------------------------------------------------------------")
# print(df_val['label'].value_counts())

In [None]:
###################################################################
# df_train = df_train.head(200)
# df_val = df_val.head(3)
####################################################################

In [None]:
# Model inputs (texts) and targets (label lists) for both partitions.
train_documents, train_categories = df_train['doc'], df_train['labels_array']
test_documents, test_categories = df_val['doc'], df_val['labels_array']

print(train_documents.shape, train_categories.shape)
print(test_documents.shape, test_categories.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# TF-IDF features over the tokens returned by the spaCy preprocessor. Casing is
# left untouched (lowercase=False); the vocabulary is learned on the training
# texts only and then applied to the validation texts.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=preprocessor.preprocess_text,
    lowercase=False,
    use_idf=True,
    # max_features=512
)

vectorised_train_documents = vectorizer.fit_transform(train_documents)
vectorised_test_documents = vectorizer.transform(test_documents)

In [None]:
# Raw label lists under the names used for the targets further down.
train_labels, test_labels = train_categories, test_categories

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarize the label lists: one indicator column per class seen in training.
label_encoder = MultiLabelBinarizer()
train_labels = label_encoder.fit(train_categories).transform(train_categories)
test_labels = label_encoder.transform(test_categories)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One logistic-regression model per label column (one-vs-rest).
# NOTE(review): `multi_class` looks deprecated in recent scikit-learn releases -
# confirm against the installed version before upgrading.
model = LogisticRegression(
    multi_class='multinomial',
    random_state=2,
    n_jobs=-1,
)
classifier = OneVsRestClassifier(model).fit(vectorised_train_documents, train_labels)
classifier

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validation (shuffled, fixed seed) on the training features,
# scored with the classifier's default score.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=classifier,
    X=vectorised_train_documents,
    y=train_labels,
    cv=kf,
)

In [None]:
# Per-fold scores, then their mean with a +/- two-standard-deviation band.
print('Cross-validation scores:', scores)
cv_mean, cv_band = scores.mean(), scores.std() * 2
print('Cross-validation accuracy: {:.4f} (+/- {:.4f})'.format(cv_mean, cv_band))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predict the label indicator matrix for the validation texts.
predictions = classifier.predict(vectorised_test_documents)

# On indicator matrices accuracy_score is the exact-match (subset) accuracy.
accuracy = accuracy_score(test_labels, predictions)

macro_precision = precision_score(test_labels, predictions, average='macro')
macro_recall = recall_score(test_labels, predictions, average='macro')
macro_f1 = f1_score(test_labels, predictions, average='macro')

micro_precision = precision_score(test_labels, predictions, average='micro')
micro_recall = recall_score(test_labels, predictions, average='micro')
micro_f1 = f1_score(test_labels, predictions, average='micro')

# The confusion matrix needs one class per sample, so each row is reduced to its
# first set label with argmax. This is only an approximation for multilabel data:
# additional labels are ignored and rows without any predicted label count as class 0.
cm =  confusion_matrix(test_labels.argmax(axis = 1), predictions.argmax(axis = 1))

# The report is computed on the full indicator matrices, so every label of a
# sample is evaluated and the rows carry the class names of the binarizer.
cr = classification_report(test_labels, predictions, target_names=label_encoder.classes_)

In [None]:
# Summary of the validation metrics, in the order expected by the template.
metric_values = (
    accuracy,
    macro_precision, micro_precision,
    macro_recall, micro_recall,
    macro_f1, micro_f1,
)
print("Accuracy: {:.4f}\nPrecision:\n- Macro: {:.4f}\n- Micro: {:.4f}\nRecall:\n- Macro: {:.4f}\n- Micro: {:.4f}\nF1-measure:\n- Macro: {:.4f}\n- Micro: {:.4f}".format(*metric_values))

In [None]:
# Text report computed in the evaluation cell above.
print(cr)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

# Heatmap of (at most the first 73 rows of) the confusion matrix, with the
# x-axis ticks placed on top.
cm_plt = pd.DataFrame(cm[:73])

plt.figure(figsize = (25, 25))
ax = plt.axes()
sb.heatmap(cm_plt, annot=True, ax=ax)
ax.xaxis.set_ticks_position('top')

plt.show()

In [None]:
def get_input_prediction(input: str):
    """Predict the labels of a single text.

    The text is vectorised with the fitted ``vectorizer``, classified with
    ``classifier`` and the predicted indicator row is converted back to label
    names with ``label_encoder``.
    """
    features = vectorizer.transform([input])
    predicted_rows = classifier.predict(features)
    return label_encoder.inverse_transform(predicted_rows)

In [None]:
# Sample German text used to try out the prediction helper.
# NOTE(review): the name `input` shadows the built-in input() for the rest of the notebook.
input = """Die Gerontologie reflektiert den Wandel des Altersbildes in der Gesellschaft. Zielgruppe sind hierbei die allgemeine Öffentlichkeit, die Senioren selbst, beruflich mit Senioren befasste Gruppen und die Politik. Als Medium zwischen Universitäten und Allgemeinheit dienen Seniorentage und Kongresse. Zur gerontologischen Forschung zählen die Untersuchung der biologischen Grundlagen des Älterwerdens ebenso wie die Veränderung der sozialen Systeme. Sozialwissenschaften und Demographie bilden Nachbarwissenschaften der Gerontologie. Ziel der Gerontologie ist die Verknüpfung unterschiedlicher Fachbereiche wie Geriatrie, Gerontopsychiatrie, Altenpflege und Sozialarbeit zu einer eigenständigen wissenschaftlichen Disziplin. Es ist eine verstärkte Zuwendung zu pragmatischen Fragestellungen zu beobachten. Auch Disziplinen der Volkswirtschaftslehre bedienen etwa die Frage nach einer optimalen Ausgestaltung des Rentensystems. Wirtschaftswissenschaftliche Kenntnisse werden aufgrund der steigenden Managementorientierung des Bereiches in Zukunft zunehmen. Die Deutsche Bundesregierung hat bislang sieben Altenberichte veröffentlicht, welche die Situation alter Menschen untersuchen (1991 – 2016). """

In [None]:
# Smoke test: predict the labels of the sample text defined above.
get_input_prediction(input)

In [None]:
# Map every class name to its column index in the binarized label matrix.
# MultiLabelBinarizer.transform expects an iterable of label *collections*; passing
# the flat array of class names made it treat each name as a collection of single
# characters, so the previous mapping did not contain class indices.
le_name_mapping = {name: column for column, name in enumerate(label_encoder.classes_)}
print(le_name_mapping)