In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
df = pd.read_csv("./data/text_label.csv",  encoding='latin-1')

In [8]:
# Split into labeled and unlabeled data
labeled_data = df[df['LABEL'].notna()]
unlabeled_data = df[df['LABEL'].isna()]

# Split labeled data into train and validation sets
train_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

# Preprocess the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_data['FOI_TEXT'])
X_val = vectorizer.transform(val_data['FOI_TEXT'])
X_unlabeled = vectorizer.transform(unlabeled_data['FOI_TEXT'])

y_train = train_data['LABEL']
y_val = val_data['LABEL']

In [46]:
labeled_data['LABEL'].unique()

array([1., 4., 5., 2., 3.])

In [44]:
labeled_data[labeled_data['LABEL'] == 1]

Unnamed: 0,MDR_REPORT_KEY,MDR_TEXT_KEY,TEXT_TYPE_CODE,PATIENT_SEQUENCE_NUMBER,FOI_TEXT,DATE_RECEIVED,LABEL
0,6383024,106903842,N,1,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,6/3/2017,1.0
1,6383024,106903843,D,1,BASED ON ADDITIONAL INFORMATION RECEIVED THIS ...,6/3/2017,1.0
2,6383024,109652829,N,1,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...",6/3/2017,1.0
3,6383024,69202956,N,1,MANUFACTURER REFERENCE NUMBER: (B)(4). INCIDEN...,6/3/2017,1.0
5,6343125,107384375,N,1,"IF INFORMATION IS PROVIDED IN THE FUTURE, A SU...",20/2/2017,1.0
...,...,...,...,...,...,...,...
3588,10722253,212603921,N,1,"BASED ON THE INVESTIGATION, THE DEVICE WAS NOT...",22/10/2020,1.0
3590,10722253,320662340,N,0,THIS REPORT IS A DUPLICATE REPORT TO 2183959-2...,22/10/2020,1.0
3592,10722574,212684512,D,1,IT WAS REPORTED THAT THE PATIENT UNDERWENT A R...,22/10/2020,1.0
3594,10732217,212963377,N,1,THE EXACT EVENT DATE IS UNKNOWN.,25/10/2020,1.0


In [39]:
unlabeled_data['LABEL']

(9202,)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize the classifier
clf = LogisticRegression(max_iter=1000)

# Train the classifier on labeled data
clf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = clf.predict(X_val)
initial_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}')


Validation Accuracy: 0.5185


In [29]:
data = pd.read_csv("./data/text_label.csv",  encoding='latin-1')
X = data['FOI_TEXT']  # Text data
y = data['LABEL']  # Labels

# Split into labeled and unlabeled
X_labeled = X[y.notnull()]
y_labeled = y[y.notnull()]
X_unlabeled = X[y.isnull()]

In [32]:
from sklearn.model_selection import train_test_split

# Split the labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42, stratify=y_labeled)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features for efficiency
X_labeled_vec = vectorizer.fit_transform(X_labeled)
X_unlabeled_vec = vectorizer.transform(X_unlabeled)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [34]:
# Initialize the model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)

# Self-training loop
while True:
    # Fit the model on the training data
    model.fit(X_train_vec, y_train)

    # Get probability predictions on the unlabeled data
    y_pred_proba = model.predict_proba(X_unlabeled_vec)

    # Get indices of samples with high confidence predictions
    high_confidence_indices = np.where(np.max(y_pred_proba, axis=1) > 0.90)[0]

    if len(high_confidence_indices) == 0:
        break  # Exit if no high-confidence predictions

    # Get the corresponding text and predicted labels
    X_high_conf = X_unlabeled.iloc[high_confidence_indices]
    y_high_conf = np.argmax(y_pred_proba[high_confidence_indices], axis=1)

    # Append high-confidence samples to the labeled data
    X_train = pd.concat([X_train, X_high_conf])
    y_train = pd.concat([y_train, pd.Series(y_high_conf)])

    # Remove high-confidence samples from the unlabeled data
    X_unlabeled = X_unlabeled.drop(X_high_conf.index)

    # Re-vectorize the updated training data
    X_train_vec = vectorizer.fit_transform(X_train)
    X_unlabeled_vec = vectorizer.transform(X_unlabeled)


In [35]:
# Evaluate on the test set
y_pred = model.predict(X_test_vec)

# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90        44
         1.0       0.50      0.40      0.44        25
         2.0       0.47      0.47      0.47        15
         3.0       0.31      0.31      0.31        16
         4.0       0.83      0.38      0.53        13
         5.0       0.71      0.83      0.77        12

    accuracy                           0.65       125
   macro avg       0.61      0.57      0.57       125
weighted avg       0.64      0.65      0.63       125



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=np.unique(y_test), 
            yticklabels=np.unique(y_test))
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [48]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.65
