In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Read both embedding files from disk, then stack them row-wise
# (first file's rows on top) into a single float32 array.
embeddings_1, embeddings_2 = (
    np.load(path) for path in ('embeddings_1.npy', 'embeddings_2.npy')
)
embeddings = np.vstack((embeddings_1, embeddings_2), dtype='float32')

In [3]:
# Read the ICD code lists: every line of the two label files is split on ';'
# into a list of codes, and the second file's rows follow the first file's.
#
# Empty fragments are discarded because ''.split(';') returns [''], so a blank
# line (or a stray leading/trailing ';') would otherwise register the empty
# string as a real class and could later produce label strings like ';A00'.
# A line without any code still yields an (empty) list, so the number of rows
# is unchanged.
# NOTE(review): rows are presumably aligned with embeddings_1 / embeddings_2 — confirm.
with open('icd_codes_1.txt') as f1, open('icd_codes_2.txt') as f2:
    labels_1 = [[code for code in line.strip().split(';') if code] for line in f1]
    labels_2 = [[code for code in line.strip().split(';') if code] for line in f2]
labels = labels_1 + labels_2

In [4]:
# Label vocabulary: collect every distinct code seen in `labels`, sort it,
# and assign each code its position in that sorted order as a column index.
unique_codes = sorted({code for codes in labels for code in codes})
num_classes = len(unique_codes)
code_to_index = dict(zip(unique_codes, range(num_classes)))

In [5]:
# Multi-hot target matrix: one row per sample, one column per code;
# a cell is 1 when that sample's label list contains the code, else 0.
y = np.zeros((len(labels), num_classes), dtype=int)
for row, codes in enumerate(labels):
    # Set all of this sample's code columns in one indexed assignment.
    y[row, [code_to_index[code] for code in codes]] = 1

In [6]:
# Hold out 15% of the samples for validation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    y,
    random_state=101,
    test_size=0.15,
)

In [21]:
# Base estimator: class-weight-balanced logistic regression (liblinear solver),
# wrapped in OneVsRestClassifier to handle the multi-label target.
log_reg = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=4000,
)
multi_target_clf = OneVsRestClassifier(log_reg, n_jobs=9)

In [None]:
# Fit the one-vs-rest classifier on the training split.
multi_target_clf.fit(X=X_train, y=y_train)

In [None]:
# Predicted label matrix for the held-out validation samples.
y_pred = multi_target_clf.predict(X=X_val)

In [None]:
# Evaluate on the validation split with the micro-averaged F2 score.
# beta=2 is what makes this F2 (recall weighted more heavily than precision);
# f1_score would report F1 under an "F2" name and message.
micro_f2_score = fbeta_score(y_val, y_pred, beta=2, average='micro')

print(f"Average Micro F2 Score on Validation Set: {micro_f2_score:.4f}")

In [None]:
# Load the test-set embeddings and predict their label matrix with the
# fitted classifier; the result feeds the submission file.
test_embeddings = np.load('test_data.npy')
test_predictions = multi_target_clf.predict(X=test_embeddings)



In [None]:
# Build the submission records.
# Invert the code -> column lookup so predicted columns can be turned back into codes.
index_to_code = {position: icd for icd, position in code_to_index.items()}

# One record per test row, ids counted from 1. The codes whose prediction
# equals 1 are sorted lexicographically and joined with ';'; a row with no
# predicted code gets an empty string (joining nothing yields '').
submission_data = [
    {
        'id': row_id,
        'labels': ';'.join(
            sorted(index_to_code[col] for col, flag in enumerate(row) if flag == 1)
        ),
    }
    for row_id, row in enumerate(test_predictions, start=1)
]

In [None]:
# Turn the list of records into a table (one column per record key) and
# display it as the cell output.
submission_df = pd.DataFrame(data=submission_data)
submission_df

In [None]:
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")