### Import Packages

In [None]:
# Third-party libraries used throughout the notebook.
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Colab-only: mount Google Drive so files under /content/drive are reachable.
from google.colab import drive

drive.mount('/content/drive')

### Import Data

In [None]:
# Folder on the mounted Google Drive (under /content/drive) from which the
# relation train/test parquet files are read.
DATA_PATH = "/content/drive/Shareddrives/CIS522-Project/data"

In [None]:
# Locations of the baseline train/test splits inside DATA_PATH.
train_path = f"{DATA_PATH}/rel_train_baseline.parquet"
test_path = f"{DATA_PATH}/rel_test_baseline.parquet"

# Read each split into its own DataFrame.
rel_train = pd.read_parquet(train_path)
rel_test = pd.read_parquet(test_path)

In [None]:
# Preview the first rows of the training frame to check the loaded columns.
rel_train.head()

Unnamed: 0,text,arg1,arg2,label,sid,uid,sdp
0,He|also|may|have|recurrent|seizures|which|shou...,4:6,11:12,Reason-Drug,0,100035,recurrent seizures treated with ativan
0,He|also|may|have|recurrent|seizures|which|shou...,14:15,11:12,Route-Drug,1,100035,IM IV ativan
0,-patient|will|be|on|Topiramate|25mg|PO|BID|unt...,5:6,4:5,Strength-Drug,2,100035,-patient will be on Topiramate 25mg
0,-patient|will|be|on|Topiramate|25mg|PO|BID|unt...,6:7,4:5,Route-Drug,3,100035,PO mg be on Topiramate
0,-patient|will|be|on|Topiramate|25mg|PO|BID|unt...,7:8,4:5,Frequency-Drug,4,100035,BID Topiramate


### Preprocess Data

In [None]:
# Text inputs come from the 'sdp' column of each split.
train_docs = rel_train['sdp'].tolist()
test_docs = rel_test['sdp'].tolist()

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# the fitted vectorizer to encode the test texts.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_docs)
X_test = tfidf.transform(test_docs)

# Targets are the relation labels of each split.
y_train = rel_train['label']
y_test = rel_test['label']

print(f"TFIDF vocab size: {tfidf.idf_.shape[0]}")

TFIDF vocab size: 5699


In [None]:
# Share of each relation label in the training split (normalize=True yields
# fractions rather than raw counts).
rel_train['label'].value_counts(normalize=True)

Strength-Drug     0.184394
Form-Drug         0.183074
Frequency-Drug    0.173609
Route-Drug        0.152369
Reason-Drug       0.142161
Dosage-Drug       0.116244
ADE-Drug          0.030457
Duration-Drug     0.017691
Name: label, dtype: Float64

In [None]:
# Share of each relation label in the test split, for comparison with the
# training distribution.
rel_test['label'].value_counts(normalize=True)

Form-Drug         0.186429
Strength-Drug     0.180888
Frequency-Drug    0.171938
Route-Drug        0.151138
Reason-Drug       0.145341
Dosage-Drug       0.114867
ADE-Drug          0.031242
Duration-Drug     0.018157
Name: label, dtype: Float64

### Train Model

In [None]:
# Unregularized logistic regression on the TF-IDF features.
# class_weight='balanced' reweights classes to offset the uneven label
# frequencies; max_iter is raised to 1000 to give the solver room to converge.
logreg = LogisticRegression(
    penalty=None,
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
)
logreg.fit(X_train, y_train)

### Evaluate Model

In [None]:
# Predict labels for the held-out split, then print per-class
# precision/recall/F1 alongside the overall averages.
y_pred = logreg.predict(X_test)
report = metrics.classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

      ADE-Drug       0.62      0.53      0.57       733
   Dosage-Drug       0.75      0.83      0.79      2695
 Duration-Drug       0.48      0.54      0.51       426
     Form-Drug       0.92      0.90      0.91      4374
Frequency-Drug       0.87      0.87      0.87      4034
   Reason-Drug       0.84      0.77      0.80      3410
    Route-Drug       0.90      0.89      0.90      3546
 Strength-Drug       0.85      0.89      0.87      4244

      accuracy                           0.85     23462
     macro avg       0.78      0.78      0.78     23462
  weighted avg       0.85      0.85      0.85     23462

