<a href="https://colab.research.google.com/github/M-Sravya/drug-review/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used later in the notebook:
# the stop-word list and the WordNet data behind the lemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# A single raw review used to demonstrate each cleaning step.
# Note that it still contains an HTML entity ("&#039;") and mixed case.
reviews = [
    "I smoked for 50+ years.  Took it for one week and that was it.  I didn&#039;t think it was possible for me to quit.  It has been 6 years now.  Great product."
]

# One-column frame; every later step rewrites the 'review' column.
df = pd.DataFrame({'review': reviews})


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import html

# Decode HTML entities before any other cleaning step. The raw text carries
# entities such as "&#039;" (see the sample review). The later punctuation
# pass strips only the "&", "#" and ";" characters, so a named entity like
# "&quot;" or "&amp;" would otherwise survive as a junk word ("quot", "amp").
# na_action='ignore' leaves missing reviews untouched, matching .str.lower().
df['review'] = df['review'].map(html.unescape, na_action='ignore').str.lower()
print(df.head())


                                              review
0  i smoked for 50+ years.  took it for one week ...


In [None]:
import string

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters listed in ``string.punctuation`` are dropped outright; nothing
    is inserted in their place, so surrounding characters become adjacent.
    """
    # Mapping a code point to None tells str.translate to delete it.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)

# Strip punctuation from every review, then preview the result.
reviews_without_punctuation = df['review'].map(remove_punctuation)
df['review'] = reviews_without_punctuation
print(df.head())


                                              review
0  i smoked for 50 years  took it for one week an...


In [None]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Nothing is inserted where the digits were, so any whitespace around a
    removed number is left as-is.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Drop digits from every review, then preview the result.
reviews_without_numbers = df['review'].map(remove_numbers)
df['review'] = reviews_without_numbers
print(df.head())


                                              review
0  i smoked for  years  took it for one week and ...


In [None]:
# Tokenise: split each review on runs of whitespace, so the column now
# holds a list of words per row instead of a single string.
df['review'] = df['review'].map(str.split)
print(df.head())


                                              review
0  [i, smoked, for, years, took, it, for, one, we...


In [None]:
# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words('english'))

# NOTE(review): punctuation was already stripped from the tokens, so a
# contraction such as "didnt" is kept (it appears in the printed output).
# Presumably the stop list stores it with its apostrophe; confirm whether
# keeping these negations is intended.
df['review'] = df['review'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
print(df.head())


                                              review
0  [smoked, years, took, one, week, didnt, think,...


In [None]:
# Lemmatise every token with WordNet. In the sample output "years" becomes
# "year", while "smoked" and "took" come through unchanged.
lemmatizer = WordNetLemmatizer()
df['review'] = df['review'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
print(df.head())


                                              review
0  [smoked, year, took, one, week, didnt, think, ...


In [None]:
# Re-assemble each token list into one space-separated string.
df['review'] = df['review'].map(' '.join)
print(df.head())


                                              review
0  smoked year took one week didnt think possible...


In [None]:
import pandas as pd
import numpy as np

# Toy drug/condition table with one genuinely missing value (np.nan) in each
# column; replace with the actual data.
drug_names = ['valsartan', 'cialis', 'cialis', np.nan, 'levora']
conditions = ['Acne', 'Depression', np.nan, 'Acne', 'Birth control']
data = {'drugname': drug_names, 'condition': conditions}

# Note: this rebinds `df`, replacing the review frame built earlier.
df = pd.DataFrame(data)

print(df)


    drugname      condition
0  valsartan           Acne
1     cialis     Depression
2     cialis            NaN
3        NaN           Acne
4     levora  Birth control


In [None]:
# Replace every missing cell with a readable placeholder. The frame is
# modified in place, so `df` keeps referring to the same object.
placeholder = 'data not available'
df.fillna(value=placeholder, inplace=True)

print(df)

             drugname           condition
0           valsartan                Acne
1              cialis          Depression
2              cialis  data not available
3  data not available                Acne
4              levora       Birth control


In [None]:
import pandas as pd

# Sample data containing one repeated (drugname, condition) pair.
# The 'NaN' entry is a literal string, not a missing value like np.nan.
drug_names = ['valsartan', 'cialis', 'cialis', 'NaN', 'levora']
conditions = ['Acne', 'Depression', 'Depression', 'Acne', 'Birth control']
data = {'drugname': drug_names, 'condition': conditions}
df = pd.DataFrame(data)
print(df)

    drugname      condition
0  valsartan           Acne
1     cialis     Depression
2     cialis     Depression
3        NaN           Acne
4     levora  Birth control


In [None]:
import pandas as pd

# Sample data containing one repeated (drugname, condition) pair.
drug_names = ['valsartan', 'cialis', 'cialis', 'NaN', 'levora']
conditions = ['Acne', 'Depression', 'Depression', 'Acne', 'Birth control']
data = {'drugname': drug_names, 'condition': conditions}
df = pd.DataFrame(data)

# Keep only the first occurrence of each (drugname, condition) pair.
# Original index labels are preserved, so the output skips label 2.
is_repeat = df.duplicated(subset=['drugname', 'condition'])
df_without_duplicates = df[~is_repeat]

print(df_without_duplicates)


    drugname      condition
0  valsartan           Acne
1     cialis     Depression
3        NaN           Acne
4     levora  Birth control
