# Loading and Preprocessing Data:

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive. The dataset
# paths used later in this notebook all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import html
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on: tokenizer models
# (punkt / punkt_tab), the WordNet data behind the lemmatizer (wordnet /
# omw-1.4) and the stop-word lists.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(nltk_resource)

# Single source of truth for the shared Drive folder that holds the raw CSVs,
# so the location only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/UCI ML Drug Review dataset (originals) and Synthetic Data'

# File paths for Train, Test, and Synthetic datasets
train_file = f'{DATA_DIR}/drugsComTrain_raw.csv'
test_file = f'{DATA_DIR}/drugsComTest_raw.csv'
synthetic_file = f'{DATA_DIR}/Synthetic Data.csv'

# Load datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
synthetic_data = pd.read_csv(synthetic_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Remove Noise

In [2]:
# Metadata columns are discarded because they say nothing about the content of
# a review and do not help the ML model determine side effects. Rows containing
# any null value are removed in the same step.
# The original UCI files and the synthetic file name their columns differently,
# and the synthetic file carries an extra 'source' column.
uci_noise_columns = ['uniqueID', 'date', 'usefulCount', 'rating']
synthetic_noise_columns = ['unique_id', 'date', 'useful_count', 'rating', 'source']

train_data = train_data.drop(columns=uci_noise_columns).dropna()
test_data = test_data.drop(columns=uci_noise_columns).dropna()
synthetic_data = synthetic_data.drop(columns=synthetic_noise_columns).dropna()

# Lookup sets consumed by clean_text. They were referenced by the function but
# never defined anywhere, so any call to clean_text raised NameError.
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)


# Clean text function using stopwords and punctuation removal
def clean_text(text):
    """Return `text` lower-cased with stop words and punctuation tokens removed.

    The text is lower-cased, tokenized with ``nltk.word_tokenize``, filtered
    against the module-level ``stop_words`` and ``punctuations`` sets, and the
    surviving tokens are re-joined with single spaces.

    Args:
        text: The raw review string to clean.

    Returns:
        A single space-separated string of the remaining tokens (empty string
        if nothing survives the filter).
    """
    tokens = nltk.word_tokenize(text.lower())
    cleaned_tokens = [
        word for word in tokens
        if word not in stop_words and word not in punctuations
    ]
    return ' '.join(cleaned_tokens)

# Decode HTML entities into the characters they stand for (example: "&amp;"
# becomes "&", "&#039;" becomes "'").
# The earlier pattern r'&#\d+;' only deleted numeric entities; named entities
# such as "&amp;" or "&quot;" were left in place and later degraded into stray
# tokens ("amp", "quot") once non-alphanumeric characters were stripped.
# html.unescape handles both numeric and named entities.
train_data['review'] = train_data['review'].map(html.unescape)
test_data['review'] = test_data['review'].map(html.unescape)
synthetic_data['review'] = synthetic_data['review'].map(html.unescape)

### Normalize, Standardize, and Lemmatize the data

In [3]:
# Shared WordNet lemmatizer instance used by lemmatize_text
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text`.

    The input is split with ``str.split()``, each piece is passed through
    ``lemmatizer.lemmatize``, and the results are joined back together with
    single spaces.

    NOTE(review): this helper is not called in the cells visible here; confirm
    whether lemmatization is meant to be applied to the review columns.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Anything that is not an ASCII letter, a digit or whitespace is stripped from
# 'review'; the lower-cased result is then stored in 'cleaned_review'.
non_alphanumeric = r'[^a-zA-Z0-9\s]'

for frame in (train_data, test_data, synthetic_data):
    # Remove special characters (retain only alphanumeric characters and spaces)
    frame['review'] = frame['review'].str.replace(non_alphanumeric, '', regex=True)
    # Convert all text to lowercase
    frame['cleaned_review'] = frame['review'].str.lower()

In [4]:
# Sort the datasets alphabetically by drug name. The original UCI files call
# the column 'drugName', while the synthetic file calls it 'drug_name'.
train_data = train_data.sort_values(by='drugName')
test_data = test_data.sort_values(by='drugName')
synthetic_data = synthetic_data.sort_values(by='drug_name')

# Drop reviews that ended up with no content after cleaning. Comparing the
# stripped text also removes whitespace-only reviews, which a plain `!= ''`
# check would let through even though they carry no text.
train_data = train_data[train_data['cleaned_review'].str.strip() != '']
test_data = test_data[test_data['cleaned_review'].str.strip() != '']
synthetic_data = synthetic_data[synthetic_data['cleaned_review'].str.strip() != '']

In [5]:
# Take independent copies of the processed frames under their "cleaned" names;
# the following cells inspect these copies.
train_data_cleaned, test_data_cleaned, synthetic_data_cleaned = (
    frame.copy() for frame in (train_data, test_data, synthetic_data)
)

# Write each cleaned dataset to the shared project folder (row index omitted).
cleaned_outputs = [
    (train_data_cleaned, '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/cleaned_train_data.csv'),
    (test_data_cleaned, '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/cleaned_test_data.csv'),
    (synthetic_data_cleaned, '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/cleaned_synthetic_data.csv'),
]
for cleaned_frame, destination in cleaned_outputs:
    cleaned_frame.to_csv(destination, index=False)

In [6]:
# Preview the first rows of every cleaned dataset, each under its own heading.
previews = (
    ('\nCleaned Train Data:', train_data_cleaned),
    ('\nCleaned Test Data:', test_data_cleaned),
    ('\nCleaned Synthetic Data:', synthetic_data_cleaned),
)
for heading, cleaned_frame in previews:
    print(heading)
    print(cleaned_frame.head())


Cleaned Train Data:
                                   drugName                 condition  \
9892              A + D Cracked Skin Relief  Bacterial Skin Infection   
18402                            A / B Otic              Otitis Media   
77682  Abacavir / dolutegravir / lamivudine             HIV Infection   
59740  Abacavir / dolutegravir / lamivudine             HIV Infection   
90400  Abacavir / dolutegravir / lamivudine             HIV Infection   

                                                  review  \
9892   I have severe cracked skin on my hands  Ive tr...   
18402  It numbs the pain It makes my ear feel heavier...   
77682  Update on prior review  after being on triumeq...   
59740  I started taking Triumeq almost two years afte...   
90400  After taking complera for 5 years switched to ...   

                                          cleaned_review  
9892   i have severe cracked skin on my hands  ive tr...  
18402  it numbs the pain it makes my ear feel heavier...  
77

In [7]:
# Count null entries in 'cleaned_review' for the train and test sets.
for cleaned_frame in (train_data_cleaned, test_data_cleaned):
    missing_count = cleaned_frame['cleaned_review'].isnull().sum()
    print(missing_count)

0
0


In [8]:
# Tally the Python types found in 'cleaned_review' for the train and test sets.
for cleaned_frame in (train_data_cleaned, test_data_cleaned):
    type_counts = cleaned_frame['cleaned_review'].apply(type).value_counts()
    print(type_counts)

cleaned_review
<class 'str'>    160396
Name: count, dtype: int64
cleaned_review
<class 'str'>    53471
Name: count, dtype: int64


In [9]:
# Show any train/test rows whose 'cleaned_review' is exactly the empty string.
for cleaned_frame in (train_data_cleaned, test_data_cleaned):
    is_blank = cleaned_frame['cleaned_review'] == ''
    print(cleaned_frame[is_blank])

Empty DataFrame
Columns: [drugName, condition, review, cleaned_review]
Index: []
Empty DataFrame
Columns: [drugName, condition, review, cleaned_review]
Index: []
