In [None]:
import pandas as pd

## Combining 2 datasets - original "ade_classification_dataset" & "PsyTAR dataset"

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the ADE classification CSV from the mounted Drive into `ade`.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index written out by an earlier to_csv call -- confirm before relying on it.
file_path = '/content/drive/MyDrive/NLP Project/NLP Project - Datasets/ade_classification_dataset.csv'
ade = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
ade.head(n=5)

Unnamed: 0,text,label
0,Intravenous azithromycin-induced ototoxicity.,1
1,"Immobilization, while Paget's bone disease was...",1
2,Unaccountable severe hypercalcemia in a patien...,1
3,METHODS: We report two cases of pseudoporphyri...,1
4,METHODS: We report two cases of pseudoporphyri...,1


In [None]:
# Read the PsyTAR CSV from the mounted Drive into `psy`.
# `file_path` is rebound here, so after this cell it points at the PsyTAR file.
file_path = '/content/drive/MyDrive/NLP Project/NLP Project - Datasets/PsyTAR_dataset.csv'
psy = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
psy.head(n=5)

Unnamed: 0,id,comment_id,drug_id,sentence_index,sentences,ADR,WD,EF,INF,SSI,DI,Findings,others,rating,category
0,1.0,1.0,lexapro.1,1.0,"extreme weight gain, short-term memory loss, h...",1.0,,,,,,,0,1.0,ssri
1,2.0,1.0,lexapro.1,2.0,I am detoxing from Lexapro now.,,,,,,,,0,1.0,ssri
2,3.0,1.0,lexapro.1,3.0,I slowly cut my dosage over several months and...,,,,,,,,1,1.0,ssri
3,4.0,1.0,lexapro.1,4.0,I am now 10 days completely off and OMG is it ...,,,,,,,,1,1.0,ssri
4,5.0,1.0,lexapro.1,5.0,"I have flu-like symptoms, dizziness, major moo...",,1.0,,,,,,0,1.0,ssri


In [None]:
# Build the PsyTAR training examples: text = "<drug name>: <sentence>", label = ADR flag.

# Drug name is the part of drug_id before the first dot (e.g. "lexapro.1" -> "lexapro").
# astype(str) would turn a missing drug_id into the literal string "nan", so those
# positions are masked back to NaN instead of becoming a bogus "nan" drug name.
drug_id = psy['drug_id']
psy['drug_name'] = drug_id.astype(str).str.split('.').str[0].where(drug_id.notna())

# Prefix each sentence with its drug name. A missing drug name or a missing
# sentence propagates to a missing `text`, which is filtered out below.
psy['text'] = psy['drug_name'] + ": " + psy['sentences']

# Rows without usable text must not reach the corpus: their ADR is typically NaN
# too, so they would otherwise be kept as label-0 examples with empty text.
has_text = psy['text'].notna()
n_missing_text = int((~has_text).sum())
if n_missing_text:
    print(f"Dropping {n_missing_text} PsyTAR rows with missing drug_id or sentence")

# Keep only rows where ADR is either NaN (label 0) or exactly 1.0 (label 1)
adr_is_binary = psy['ADR'].isna() | (psy['ADR'] == 1.0)
psytar_filtered = psy[has_text & adr_is_binary].copy()

# Assign labels: NaN -> 0, 1.0 -> 1 (vectorised; after the filter, any
# non-missing ADR is exactly 1.0)
psytar_filtered['label'] = psytar_filtered['ADR'].notna().astype(int)

# Guard the invariants the combined dataset relies on.
assert psytar_filtered['text'].notna().all(), "PsyTAR rows with missing text slipped through"
assert psytar_filtered['label'].isin([0, 1]).all(), "unexpected PsyTAR label value"

# Re-check the label distribution to verify correctness
label_counts = psytar_filtered['label'].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,3841
1,2168


In [None]:
# Tag every row with its corpus of origin so the source survives the merge.
# The column is added to the existing frames themselves, not to copies.
for frame, source_name in (
    (ade, 'ade_classification_dataset'),
    (psytar_filtered, 'PsyTAR_dataset'),
):
    frame['dataset'] = source_name

# Both frames are cut down to the same three columns, in the same order.
shared_columns = ['text', 'label', 'dataset']
ade_final = ade[shared_columns]
psytar_final = psytar_filtered[shared_columns]

# Stack the ADE rows first, then the PsyTAR rows, renumbering the index from 0.
combined_df = pd.concat(objs=[ade_final, psytar_final], ignore_index=True)

In [None]:
# Show every row except the final 50 (the notebook display truncates the middle).
# NOTE(review): if the goal was to peek at the end of the frame, tail(50) would
# be the call -- confirm the intent.
combined_df.iloc[:-50]

Unnamed: 0,text,label,dataset
0,Intravenous azithromycin-induced ototoxicity.,1,ade_classification_dataset
1,"Immobilization, while Paget's bone disease was...",1,ade_classification_dataset
2,Unaccountable severe hypercalcemia in a patien...,1,ade_classification_dataset
3,METHODS: We report two cases of pseudoporphyri...,1,ade_classification_dataset
4,METHODS: We report two cases of pseudoporphyri...,1,ade_classification_dataset
...,...,...,...
29470,effexorXR: I feel SO much better.,0,PsyTAR_dataset
29471,effexorXR: I can face the world again.,0,PsyTAR_dataset
29472,effexorXR: Trouble Staying asleep for more tha...,1,PsyTAR_dataset
29473,effexorXR: Loss of appetite (good side effect)...,1,PsyTAR_dataset


In [None]:
# Destination on Drive for the merged corpus (same folder the inputs were read from).
save_path = '/content/drive/MyDrive/NLP Project/NLP Project - Datasets/combined_dataset.csv'

# Write the frame without its row index, so the CSV holds only the data columns.
combined_df.to_csv(path_or_buf=save_path, index=False)

# Confirm where the file went.
print(f"Combined dataset saved to: {save_path}")

Combined dataset saved to: /content/drive/MyDrive/NLP Project/NLP Project - Datasets/combined_dataset.csv
