In [62]:
import os
import pandas as pd
import datasets
from utils import save_df_as_csv, stratified_split

data_folder = "./data"
dataset_id = "drug_review_raw"
df_train = pd.read_csv(f"{data_folder}/{dataset_id}/{dataset_id}_train.csv")
df_test = pd.read_csv(f"{data_folder}/{dataset_id}/{dataset_id}_test.csv")
df_train

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37
...,...,...,...,...,...,...,...
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10,31-May-15,125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1,1-Nov-11,34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2,15-Mar-14,35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,19-Sep-15,79


In [50]:
# remove low volume class
dropped = df_train.groupby("condition").filter(lambda x: len(x) < 4000)
df_train = df_train.drop(dropped.index).reset_index(drop=True)
df_train

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
1,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
2,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10,14-Mar-15,32
3,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,8-Dec-16,1
4,75612,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,9-Mar-17,54
...,...,...,...,...,...,...,...
60612,176146,Lorazepam,Anxiety,"""About 4 years ago I started having early-morn...",8,21-Nov-17,0
60613,228492,Geodon,Bipolar Disorde,"""I was in a very bad place at the time I start...",3,25-Jul-16,5
60614,93069,Vortioxetine,Depression,"""This is the third med I&#039;ve tried for anx...",2,17-Jul-16,33
60615,132177,Ativan,Anxiety,"""I was super against taking medication. I&#039...",9,16-Aug-16,61


In [51]:
# stratified split to create new train and test set
df_train, df_test = stratified_split(df=df_train, splitby=["drugName","condition"], frac=0.9)
df_train

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,94765,Abilify,Bipolar Disorde,"""I feel the Abilify working on my depression. ...",10,17-Nov-13,39
1,94810,Abilify,Bipolar Disorde,"""This medicine made me nervous all the time, I...",5,27-Dec-13,19
2,94558,Abilify,Bipolar Disorde,"""I absolutely hated this drug. My thoughts cyc...",1,11-Oct-16,8
3,94601,Abilify,Bipolar Disorde,"""I love this medication, changed my life. I ta...",9,6-Nov-15,28
4,94592,Abilify,Bipolar Disorde,"""I have taken abilify for almost a year now on...",5,13-Dec-15,9
...,...,...,...,...,...,...,...
53766,59106,Zyprexa Zydis,Bipolar Disorde,"""I haven&#039;t been taking this for very long...",8,27-Aug-08,17
53767,59092,Zyprexa Zydis,Bipolar Disorde,"""I was switched to Zyprexa Zydis about two wee...",9,6-Oct-15,10
53768,59100,Zyprexa Zydis,Bipolar Disorde,"""I recently started on Zyprexa Zydis due to so...",9,7-Jul-10,14
53769,112446,depo-subQ provera 104,Birth Control,"""Not only did I gain ten pounds in just a few ...",1,19-Dec-16,0


In [52]:
df_train.groupby(["condition"]).size()

condition
Acne                5035
Anxiety             5318
Bipolar Disorde     3803
Birth Control      25916
Depression          8161
Pain                5538
dtype: int64

In [53]:
df_test.groupby(["condition"]).size()

condition
Acne                553
Anxiety             586
Bipolar Disorde     421
Birth Control      2872
Depression          908
Pain                607
dtype: int64

In [54]:
# remove unecessary columns
df_train = df_train[["review","condition","drugName"]]
df_test = df_test[["review","condition","drugName"]]

In [55]:
# save as csv

dataset_id = "drug_review_stratified"
os.makedirs(f"{data_folder}/{dataset_id}/", exist_ok=True)
save_df_as_csv(df_train, f"{data_folder}/{dataset_id}/{dataset_id}_train.csv")
save_df_as_csv(df_test, f"{data_folder}/{dataset_id}/{dataset_id}_test.csv")

'/home/lucien/projects/Text-Classification-Transformers/data/drug_review_stratified/drug_review_stratified_test.csv'

In [75]:
dataset_id = "banking77"
raw_dataset = datasets.load_from_disk(f"{data_folder}/banking77").rename_column("label", "labels")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3080
    })
})

In [76]:
raw_dataset["train"].features["labels"]

ClassLabel(names=['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'getting_spare_card', 'gett

In [65]:
dataset_id = "drug_review_raw"
dataset_train = datasets.load_dataset("csv", data_files=f"{data_folder}/{dataset_id}/{dataset_id}_train.csv", features=)
dataset_test = datasets.load_dataset("csv", data_files=f"{data_folder}/{dataset_id}/{dataset_id}_test.csv")
dataset_test

DatasetDict({
    train: Dataset({
        features: ['review', 'condition', 'drugName'],
        num_rows: 6846
    })
})

In [68]:
dataset_test = dataset_test.rename_column("condition", "labels").rename_column("review", "text")
dataset_test

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'drugName'],
        num_rows: 6846
    })
})