## Text Preprocessing

## Token classification on n2c2 track2 using Logistic regression

In [None]:
def train_logisticRegression(X_train, y_train):
    """Fit a logistic-regression classifier, choosing the penalty by grid search.

    Parameters
    ----------
    X_train : array-like or sparse matrix, one row per sample
        Vectorized training features.
    y_train : array-like, one entry per sample
        Training labels.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object. ``best_params_`` holds the selected penalty
        and ``predict`` delegates to the estimator refit with it.
    """
    # Imported locally so the function also works when this cell is run on a
    # fresh kernel, before the cells that import scikit-learn.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    param_grid = {'penalty': ['l1','l2']}

    # The default 'lbfgs' solver rejects the 'l1' penalty; 'saga' accepts both
    # penalties, so every point of the grid can actually be fitted.
    estimator = LogisticRegression(solver='saga', max_iter=1000)
    search = GridSearchCV(estimator, param_grid)
    search.fit(X_train, y_train)
    return search

<div class="alert alert-warning">
    <strong>Warning:</strong> Run it on a subset of the data because of time limitations!
</div>


In [None]:
import glob
import pandas as pd
import warnings
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from hmmlearn import hmm

# Disable warning messages
# NOTE: this silences every warning for the rest of the kernel session,
# including the ones raised by the libraries used further down.
warnings.filterwarnings("ignore")

# Directory holding the token CSV files (machine-specific path)
directory_path = "/Users/sinaabdous/SinaDocuments/UniStudies/nlp/exercise/4/n2c2/2/data/test2"

# Get all CSV files in the directory. Sorted so that the file order, and with
# it the "Test Set N" numbering printed during evaluation, is the same on
# every run (glob returns files in arbitrary order).
csv_files = sorted(glob.glob(directory_path + "/*.csv"))

# Fail early with a clear message instead of letting pd.concat raise
# "No objects to concatenate" further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {directory_path!r}")

# Initialize empty lists to store train and test data (one frame per file)
train_data = []
test_data = []

# Loop through each CSV file
for file in csv_files:
    # Load the CSV file (the files have no header row)
    df = pd.read_csv(file, header=None)

    # Rename the columns for clarity
    df.columns = ['DocID', 'SentenceID', 'Word', 'Label']

    # Replace missing values with an empty string
    df['Word'] = df['Word'].fillna('')

    # Split the data into train and test sets WITHOUT shuffling: the model
    # trained on this data is a sequence model, so the token order inside each
    # file has to be preserved. The first 80% of the rows go to training, the
    # last 20% to testing.
    train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)

    # Append train data to the train_data list
    train_data.append(train_df)

    # Append test data to the test_data list
    test_data.append(test_df)

# Combine all train data
train_data_combined = pd.concat(train_data, ignore_index=True)

# Create the feature matrix X_train and target variable y_train
X_train = train_data_combined['Word']
y_train = train_data_combined['Label']

# Create a CountVectorizer to convert words into numerical features.
# X_train holds one 'Word' cell per row, so each output row is the count
# vector of a single token.
# NOTE(review): CountVectorizer's default settings presumably lowercase the
# text and drop one-character / punctuation-only tokens, which would leave
# all-zero rows for such words -- confirm this is acceptable for the model.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Convert sparse matrix to dense numpy array
# (one row per training token, one column per vocabulary entry).
X_train_vectorized = X_train_vectorized.toarray()

# Initialize the HMM model
model = hmm.MultinomialHMM(n_components=2)  # 2 states for simplicity

# Set the initial state probability distribution
startprob_prior = np.array([0.5, 0.5])  # Example: equal initial probabilities for two states
model.startprob_ = startprob_prior

# Set the emission matrix: one row per state, one column per vocabulary entry.
# Every entry is 1, so each row sums to the vocabulary size rather than to 1;
# as written this is NOT a normalized probability distribution.
emissionprob_prior = np.ones((2, X_train_vectorized.shape[1]))  # Example: all-ones (unnormalized) emission weights
model.emissionprob_ = emissionprob_prior

# Set the transition probability distribution (each row sums to 1)
transition_prior = np.array([[0.5, 0.5], [0.5, 0.5]])  # Example: equal transition probabilities
model.transmat_ = transition_prior

# Fit the HMM model. Only the feature matrix is passed (y_train is not used),
# so the two states are learned without supervision, and all rows are treated
# as one sequence because no per-sequence lengths are given.
# NOTE(review): hmmlearn's fit() presumably re-initialises startprob_,
# transmat_ and emissionprob_ under its default init_params, in which case the
# three assignments above have no effect -- confirm against the hmmlearn docs.
model.fit(X_train_vectorized)

# Loop through each test data and perform evaluation.
# test_data holds one held-out frame per CSV file, so every file is scored
# and reported separately.
for i, test_df in enumerate(test_data):
    # Create the feature matrix X_test and target variable y_test
    X_test = test_df['Word']
    y_test = test_df['Label']

    # Convert the words into numerical features, reusing the vocabulary that
    # was fitted on the training words (transform, not fit_transform)
    X_test_vectorized = vectorizer.transform(X_test)

    # Convert sparse matrix to dense numpy array
    X_test_vectorized = X_test_vectorized.toarray()

    # Decode the test rows; the first returned value is discarded and the
    # second is used as the prediction.
    # NOTE(review): the model was fitted without labels and has 2 hidden
    # states, so y_pred presumably holds state indices (0/1) rather than
    # values from the 'Label' column. Nothing here maps states to labels --
    # confirm that the scores below are meaningful.
    _, y_pred = model.decode(X_test_vectorized)

    # Compute accuracy and the per-class report of y_pred against y_test
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Report the results for this file (numbering starts at 1)
    print("Test Set", i+1)
    print("Accuracy:", accuracy)
    print(report)


ModuleNotFoundError: ignored

## Token classification on ncbi (using logistic regression)

In [None]:
!pip install datasets
!pip install sklearn

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/486.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.3 MB/s[0m eta [36m0:

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset

# Fetch the NCBI disease corpus (downloaded on first use, then served from the local cache)
dataset = load_dataset("ncbi_disease")

# Pull the three predefined splits out of the dataset in one step
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Preprocess the data and convert it into feature and target vectors
def preprocess_data(data):
    """Flatten sentence-level examples into parallel token and label lists.

    Parameters
    ----------
    data : iterable of mappings
        Each item provides a ``"tokens"`` sequence and a ``"ner_tags"``
        sequence. The two are paired position by position; if their lengths
        differ, the surplus entries of the longer one are ignored.

    Returns
    -------
    tuple of (list, list)
        ``X`` with every token and ``y`` with the matching label, both in
        the order the examples were visited.
    """
    token_label_pairs = [
        pair
        for example in data
        for pair in zip(example["tokens"], example["ner_tags"])
    ]
    X = [token for token, _ in token_label_pairs]
    y = [label for _, label in token_label_pairs]
    return X, y

# Flatten every split into one token list and one label list
X_train, y_train = preprocess_data(train_data)
X_valid, y_valid = preprocess_data(valid_data)
X_test, y_test = preprocess_data(test_data)

# Create a CountVectorizer to convert the tokens into numerical features.
# The vocabulary is learned from the training tokens only; validation and
# test tokens are mapped onto that same vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_valid_vectorized = vectorizer.transform(X_valid)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize and train the logistic regression model.
# With the default max_iter (100) the solver stopped with
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on this data, i.e. before it had
# converged, so the iteration budget is raised.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Predict the labels for the validation and test sets
y_valid_pred = model.predict(X_valid_vectorized)
y_test_pred = model.predict(X_test_vectorized)

# Compute accuracy and per-class precision / recall / F1 for both sets
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
valid_report = classification_report(y_valid, y_valid_pred)
test_report = classification_report(y_test, y_test_pred)

print("Validation Accuracy:", valid_accuracy)
print("Validation Report:")
print(valid_report)

print("\nTest Accuracy:", test_accuracy)
print("Test Report:")
print(test_report)


Downloading builder script:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.70k [00:00<?, ?B/s]

Downloading and preparing dataset ncbi_disease/ncbi_disease to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5433 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/924 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/941 [00:00<?, ? examples/s]

Dataset ncbi_disease downloaded and prepared to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.9511869498101715
Validation Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     22092
           1       0.68      0.46      0.55       787
           2       0.76      0.43      0.55      1090

    accuracy                           0.95     23969
   macro avg       0.80      0.63      0.69     23969
weighted avg       0.94      0.95      0.94     23969


Test Accuracy: 0.9476670612728089
Test Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     22450
           1       0.77      0.49      0.60       960
           2       0.73      0.41      0.53      1087

    accuracy                           0.95     24497
   macro avg       0.82      0.63      0.70     24497
weighted avg       0.94      0.95      0.94     24497



## Token Classification on n2c2 track2 data using LSTM

In [None]:
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Directory holding the token CSV files (machine-specific path)
directory_path = "/Users/sinaabdous/SinaDocuments/UniStudies/nlp/exercise/4/n2c2/2/data/test"

# Sorted so that the files are always read, and therefore concatenated, in
# the same order (glob returns files in arbitrary order).
csv_files = sorted(glob.glob(directory_path + "/*.csv"))

# Fail early with a clear message instead of letting pd.concat raise
# "No objects to concatenate" further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {directory_path!r}")

# One train frame and one test frame per CSV file
train_data = []
test_data = []

for file in csv_files:
    # The files have no header row; name the four columns explicitly
    df = pd.read_csv(file, header=None)
    df.columns = ['SentenceID', 'WordID', 'Word', 'Label']
    # Missing words become empty strings so the tokenizer never sees NaN
    df['Word'] = df['Word'].fillna('')
    # Row-level 80/20 split with a fixed seed for reproducibility
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_data.append(train_df)
    test_data.append(test_df)

# Stack the training parts of all files into one frame
train_data_combined = pd.concat(train_data, ignore_index=True)
X_train = train_data_combined['Word']
y_train = train_data_combined['Label']


# Build the vocabulary from the training words only and turn every training
# row into a sequence of integer ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)

# Pad every row to the length of the longest training sequence
max_sequence_length = max(map(len, X_train_sequences))
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)

# Map the labels to consecutive integer ids; the number of distinct labels
# determines the width of the output layer
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
num_classes = len(label_encoder.classes_)

embedding_dim = 100

# Embedding -> LSTM -> softmax over the label ids.
# The embedding needs one extra row because the tokenizer's ids start at 1
# and 0 is used for padding.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    LSTM(units=100, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=num_classes, activation='softmax'),
])

# Sparse loss: the targets are integer ids, not one-hot vectors
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(X_train_padded, y_train_encoded, epochs=2, batch_size=32)


# Stack the held-out parts of all files into one frame
test_data_combined = pd.concat(test_data, ignore_index=True)
X_test = test_data_combined['Word']
y_test = test_data_combined['Label']

# Encode the test rows with the tokenizer, padding length and label encoder
# that were fitted on the training data
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)
y_test_encoded = label_encoder.transform(y_test)

# Sequential.predict_classes() no longer exists in current TensorFlow/Keras
# releases; the class id is the arg-max over the softmax output of predict().
y_test_pred = np.argmax(model.predict(X_test_padded), axis=-1)

accuracy = accuracy_score(y_test_encoded, y_test_pred)
report = classification_report(y_test_encoded, y_test_pred)


print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

## Token classification on ncbi using LSTM

In [None]:
import numpy as np
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense

# Load dataset
dataset = load_dataset("ncbi_disease")
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Tokenize words and tags.
# filters='' and lower=False keep every token exactly as it appears in the
# corpus; words not seen during fitting are mapped to the '<UNK>' entry.
word_tokenizer = Tokenizer(filters='', lower=False, oov_token='<UNK>')
tag_tokenizer = Tokenizer(filters='', lower=False)

# Both vocabularies are built from the training split only
word_tokenizer.fit_on_texts(train_data["tokens"])
tag_tokenizer.fit_on_texts(train_data["ner_tags"])

# Convert words and tags to sequences of integer ids
X_train = word_tokenizer.texts_to_sequences(train_data["tokens"])
y_train = tag_tokenizer.texts_to_sequences(train_data["ner_tags"])
X_val = word_tokenizer.texts_to_sequences(val_data["tokens"])
y_val = tag_tokenizer.texts_to_sequences(val_data["ner_tags"])
X_test = word_tokenizer.texts_to_sequences(test_data["tokens"])
y_test = tag_tokenizer.texts_to_sequences(test_data["ner_tags"])

# Pad sequences at the end with 0 up to the longest training sentence.
# Words and tags are padded identically so they stay aligned.
max_seq_len = max([len(seq) for seq in X_train])  # You can also set an arbitrary number
X_train = pad_sequences(X_train, maxlen=max_seq_len, padding='post')
y_train = pad_sequences(y_train, maxlen=max_seq_len, padding='post')
X_val = pad_sequences(X_val, maxlen=max_seq_len, padding='post')
y_val = pad_sequences(y_val, maxlen=max_seq_len, padding='post')
X_test = pad_sequences(X_test, maxlen=max_seq_len, padding='post')
y_test = pad_sequences(y_test, maxlen=max_seq_len, padding='post')

# One-hot encode labels with a fixed width.
# Without num_classes, to_categorical infers the width from the largest id
# present in each array, so a split that happens to lack the highest tag id
# would get fewer columns than the model's output layer. The width is the
# number of tags plus one for the padding id 0.
num_tags = len(tag_tokenizer.word_index) + 1
y_train = to_categorical(y_train, num_classes=num_tags)
y_val = to_categorical(y_val, num_classes=num_tags)
y_test = to_categorical(y_test, num_classes=num_tags)

# Build LSTM model.
# Both sizes include one extra slot because the tokenizers' ids start at 1
# and id 0 is used for padding.
vocab_size = len(word_tokenizer.word_index) + 1
num_tags = len(tag_tokenizer.word_index) + 1

# Embedding -> LSTM that emits one vector per position -> per-position
# softmax over the tag ids
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len),
    LSTM(units=256, return_sequences=True),
    TimeDistributed(Dense(units=num_tags, activation='softmax'))
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=10)

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=32)
print(f"Test accuracy: {test_acc}")

# The accuracy above is averaged over every position of the padded sequences.
# Padded positions (word id 0, tag id 0) are trivially easy to predict, so
# that figure overstates the tagging quality. Also report the accuracy over
# real tokens only.
test_pred_ids = np.argmax(model.predict(X_test, batch_size=32), axis=-1)
test_true_ids = np.argmax(y_test, axis=-1)
real_token_mask = X_test != 0
token_acc = float(np.mean(test_pred_ids[real_token_mask] == test_true_ids[real_token_mask]))
print(f"Test accuracy (padding excluded): {token_acc}")

Found cached dataset ncbi_disease (/Users/sinaabdous/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03)


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.990841805934906
