In [19]:
import re
import pandas as pd
# classic ml
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# transformer
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

2023-05-29 12:23:00.044093: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Preprocessing

In [16]:
# Load the scored training texts from CSV and preview the first rows.
# Cleaning happens in the next cell; stripping punctuation etc. is not necessarily a good idea
# here because the information in the texts is fairly dense - to be revisited later.
scores_csv_path = '/Users/maximilianlindholz/Desktop/Promotion:Arbeit/Scoring/ScoresTrain_filtered.csv'
df = pd.read_csv(scores_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,IMS,Text
0,1,0,passive Atemtherapie (reflektorisch)c-Krankeng...
1,2,3,passive Atemtherapie (reflektorisch)c-Krankeng...
2,3,3,"aktive Atemtherapie, c-Krankengymnastik neurop..."
3,4,1,"c-Krankengymnastik neurophysiologisch -aktiv,S..."
4,5,3,passive Atemtherapie (reflektorisch)Atemtherap...


In [17]:
def clean_text(text):
    """Normalise one text entry: lowercase it and trim surrounding whitespace.

    Parameters
    ----------
    text : str, or a missing / non-string value
        The raw entry. Missing values (None, NaN, pd.NA) are accepted because
        pandas represents empty CSV cells as NaN rather than as strings.

    Returns
    -------
    str
        The lowercased, stripped text; an empty string for missing values.
    """
    if not isinstance(text, str):
        # A missing cell is a float NaN, which has no .lower(); return an empty
        # document instead of failing the whole .apply() with AttributeError.
        if pd.isna(text):
            return ""
        # Any other scalar (e.g. a purely numeric entry) is treated as its text form.
        text = str(text)
    # Lowercase the text, then remove leading/trailing whitespace
    return text.lower().strip()

# Store a normalised copy of the 'Text' column in a new 'cleaned_text' column
df['cleaned_text'] = df['Text'].apply(clean_text)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
features = df['cleaned_text']
labels = df['IMS']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


# Most basic approach: TF-IDF features with classic ML classifiers

In [18]:
# Set up pipelines
# Every entry pairs a TF-IDF -> classifier pipeline ("pipeline") with the hyper-parameter
# grid to search for it ("parameters"); "name" is only used for printing.

# TF-IDF search space shared by all models. It is defined once so that the four grids
# cannot drift apart when one of them is edited.
tfidf_parameters = {
    'tfidf__max_df': (0.5, 0.75, 1.0),
    'tfidf__max_features': (None, 5000, 10000, 50000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
}

pipelines = [
    {
        "name": "Naive Bayes",
        "pipeline": Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB())
        ]),
        "parameters": {
            **tfidf_parameters,
            'clf__alpha': (0.1, 1, 10),
        }
    },
    {
        "name": "Random Forest",
        "pipeline": Pipeline([
            ('tfidf', TfidfVectorizer()),
            # Seeded so repeated runs give the same forest; 42 matches the seed used
            # for the train/test split.
            ('clf', RandomForestClassifier(random_state=42))
        ]),
        "parameters": {
            **tfidf_parameters,
            'clf__n_estimators': [50, 100, 200],
            'clf__max_depth': [None, 30, 50, 100],
        }
    },
    {
        "name": "Logistic Regression",
        "pipeline": Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', LogisticRegression())
        ]),
        "parameters": {
            **tfidf_parameters,
            'clf__C': (0.01, 0.1, 1, 10),
        }
    },
    {
        "name": "SVM",
        "pipeline": Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', SVC())
        ]),
        "parameters": {
            **tfidf_parameters,
            'clf__C': (0.01, 0.1, 1, 10),
        }
    }
]

# Loop through pipelines, performing a Grid Search with Cross Validation for each.
# Every fitted search is kept in `grid_searches` (keyed by model name) so the tuned
# models stay available afterwards, e.g. for scoring on the held-out X_test / y_test.
grid_searches = {}
# The loop variable is not called `model` because that name is used for the BERT
# network further down in the notebook.
for candidate in pipelines:
    grid_search = GridSearchCV(candidate["pipeline"], candidate["parameters"], cv=5)
    print("\nPerforming grid search for model:", candidate["name"])
    grid_search.fit(X_train, y_train)
    grid_searches[candidate["name"]] = grid_search
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # best_params_ contains exactly the searched parameters with their winning values,
    # so there is no need to filter the estimator's full get_params() dictionary.
    best_parameters = grid_search.best_params_
    for param_name in sorted(best_parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search for model: Naive Bayes




Best score: 0.828
Best parameters set:
	clf__alpha: 0.1
	tfidf__max_df: 0.5
	tfidf__max_features: 5000
	tfidf__ngram_range: (1, 2)

Performing grid search for model: Random Forest




Best score: 0.870
Best parameters set:
	clf__max_depth: None
	clf__n_estimators: 200
	tfidf__max_df: 0.75
	tfidf__max_features: 50000
	tfidf__ngram_range: (1, 1)

Performing grid search for model: Logistic Regression




Best score: 0.879
Best parameters set:
	clf__C: 10
	tfidf__max_df: 0.75
	tfidf__max_features: None
	tfidf__ngram_range: (1, 2)

Performing grid search for model: SVM




Best score: 0.872
Best parameters set:
	clf__C: 10
	tfidf__max_df: 0.5
	tfidf__max_features: None
	tfidf__ngram_range: (1, 1)


# Fine-tuning Transformer models

In [None]:
# Load the tokenizer that belongs to the pretrained BERT checkpoint used for fine-tuning.
# NOTE(review): the rows previewed by df.head() look German, whereas 'bert-base-uncased'
# is presumably an English checkpoint - consider a German or multilingual one. The same
# name would then also have to be used where the model itself is loaded.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)


In [21]:
# Define a function to encode the text
def encode_text(text, max_length=512):
    """Tokenise one document with the notebook-level `tokenizer`.

    Parameters
    ----------
    text : str
        The document to encode.
    max_length : int, default 512
        Length every sequence is padded or truncated to. The default keeps the
        previous behaviour; a smaller value gives smaller tensors for short documents.

    Returns
    -------
    The tokenizer's encoding for `text`, with TensorFlow tensors
    (return_tensors='tf') and the attention mask included.
    """
    # Calling the tokenizer directly replaces encode_plus, which the transformers
    # documentation marks as deprecated in favour of __call__.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='tf',
        padding='max_length',   # pad every document up to max_length
        truncation=True         # cut documents that are longer than max_length
    )

def _stack_encodings(encodings):
    """Combine per-document encodings into one dict of stacked tensors.

    For every key of the first encoding, element [0] of each document's tensor is
    taken (presumably dropping a leading batch axis of size 1 - TODO confirm) and
    the resulting rows are stacked so that the tensors are 2D.
    """
    first_encoding = encodings[0]
    return {
        key: tf.convert_to_tensor([encoding[key][0] for encoding in encodings])
        for key in first_encoding
    }


# Tokenise the cleaned text, one document at a time
X_train_encoded = list(map(encode_text, X_train))
X_test_encoded = list(map(encode_text, X_test))

# Stack the per-document encodings into model inputs
X_train_tensors = _stack_encodings(X_train_encoded)
X_test_tensors = _stack_encodings(X_test_encoded)

# Map the IMS labels to consecutive integer class ids; the mapping is learned on the
# training labels only and then reused for the test labels
le = LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Labels as tensors
y_train_tensors, y_test_tensors = (
    tf.convert_to_tensor(encoded_labels)
    for encoded_labels in (y_train_encoded, y_test_encoded)
)

# Pretrained BERT with one output per label class seen in training
num_classes = len(le.classes_)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Loss (configured to interpret the model outputs as logits) and accuracy metric
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.metrics.SparseCategoricalAccuracy('accuracy')

# Compile with Adam at a small fine-tuning learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

# Fine-tune for three epochs.
# NOTE(review): the held-out test split is passed as validation data here, so the
# per-epoch validation figures and the final test accuracy come from the same rows.
validation_pair = (X_test_tensors, y_test_tensors)
history = model.fit(X_train_tensors, y_train_tensors, epochs=3, validation_data=validation_pair)

# Final evaluation on the test split
test_loss, test_acc = model.evaluate(X_test_tensors, y_test_tensors, verbose=2)
print('\nTest accuracy:', test_acc)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
 3/31 [=>............................] - ETA: 41:47 - loss: 2.5361 - accuracy: 0.1667 

In [None]:
# TODO: also try med-BERT, report additional evaluation measures, and display confusion matrices etc.
# Maybe also add a random forest or another ensemble algorithm, and possibly display the loss as well.