<a href="https://colab.research.google.com/github/M-Sravya/drug-review/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Demo input: the same raw review twice (to exercise de-duplication)
# plus one missing entry.
raw_review = "I bought this@# drug! It's AMAZING!! <html>But not sure about side-effects...</html> Highly recommend!"
data = {'reviews': [raw_review, raw_review, None]}

# Build the frame and drop exact duplicate rows in a single chain. The index
# is not reset, so surviving rows keep their original labels.
df = pd.DataFrame(data).drop_duplicates()
print("After removing duplicates:", df)


After removing duplicates:                                              reviews
0  I bought this@# drug! It's AMAZING!! <html>But...
2                                               None


In [None]:
# Fill missing reviews with the placeholder text 'Data not available'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['reviews']: that form mutates an
# intermediate Series, raises a chained-assignment FutureWarning in
# pandas 2.x and leaves df untouched once Copy-on-Write is the default.
df['reviews'] = df['reviews'].fillna('Data not available')
print("After handling missing values:", df)


After handling missing values:                                              reviews
0  I bought this@# drug! It's AMAZING!! <html>But...
2                                 Data not available


In [None]:
import re
from bs4 import BeautifulSoup

# Sample review: stray symbols, shouting caps and inline HTML in one string
review = (
    "I bought this@# drug! It's AMAZING!! "
    "<html>But not sure about side-effects...</html> "
    "Highly recommend!"
)

# Step 1: Remove HTML tags
def remove_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The string is parsed with BeautifulSoup's built-in ``html.parser`` and
    the parsed document's text is returned via ``get_text()``.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Strip the markup from the standalone sample review (the DataFrame is not
# touched by this statement).
review_no_html = remove_html_tags(review)

# Step 2: Insert a new line after a given phrase
def insert_newline(text, phrase):
    """Return ``text`` with a newline placed after every occurrence of ``phrase``.

    Matching is case-sensitive (plain ``str.replace``).

    The operation is idempotent: an occurrence that is already followed by a
    newline does not receive a second one, so the function can be applied to
    the same text repeatedly without stacking blank lines.

    Args:
        text: The string to process.
        phrase: The substring after which a newline is inserted. An empty
            phrase leaves ``text`` unchanged.

    Returns:
        The processed string.
    """
    if not phrase:
        # str.replace('', ...) would wedge a newline between every character.
        return text
    marker = phrase + '\n'
    # First drop the newline from occurrences that already carry one, then
    # add it back everywhere, so each occurrence gains at most one newline.
    return text.replace(marker, phrase).replace(phrase, marker)

# Specific phrase to move to a new line
phrase = "But not sure"

# Applying the newline insertion to the standalone sample
formatted_review = insert_newline(review_no_html, phrase)

print("Removal of HTML Tags:\n")
print(formatted_review)

# The demo above only cleans the standalone sample string. Strip the markup
# from the DataFrame column too; otherwise the later special-character step
# removes just the '<', '>' and '/' characters and leaves the literal text
# "html" fused into the review.
df['reviews'] = df['reviews'].apply(remove_html_tags)


Removal of HTML Tags:

I bought this@# drug! It's AMAZING!! But not sure
 about side-effects... Highly recommend!


In [None]:
# Newline insertion on the DataFrame column, run ahead of the sentence-case step.
# NOTE(review): the phrase is lowercase and str.replace is case-sensitive, so a
# review that still reads "But not sure" is left untouched by this pass.
phrase = "but not sure"
df['reviews'] = df['reviews'].apply(insert_newline, args=(phrase,))

def sentence_case(text):
    """Convert ``text`` to sentence case.

    A sentence boundary is any run of whitespace that follows ``.``, ``!`` or
    ``?``. Each sentence has its first character upper-cased and the rest
    lower-cased (``str.capitalize``). The whitespace between sentences,
    including newlines, is preserved exactly.

    Previously only the two-character sequence ``'. '`` counted as a boundary,
    so sentences following ``!`` or ``?`` were lower-cased but never
    capitalised.

    Args:
        text: The string to convert.

    Returns:
        The sentence-cased string.
    """
    # The capturing group keeps the separators in the result, so the list
    # alternates sentence, whitespace, sentence, ... and even indices are
    # always sentences.
    parts = re.split(r'(?<=[.!?])(\s+)', text)
    parts[::2] = [sentence.capitalize() for sentence in parts[::2]]
    return ''.join(parts)

# Sentence-case every review, store the result back on the frame and show
# the reviews one per line.
sentence_cased = df['reviews'].apply(sentence_case)
df['reviews'] = sentence_cased
print("\nAfter converting to sentence case:\n")
print("\n".join(sentence_cased))


After converting to sentence case:

I bought this drug! it's amazing!! but not sure
 about side-effects... Highly recommend!
Data not available


In [None]:
# Step 2: Insert a new line after a given phrase
def insert_newline(text, phrase):
    """Return ``text`` with a newline after every ``phrase``, then trimmed.

    Matching is case-sensitive (plain ``str.replace``). Leading and trailing
    whitespace is stripped from the result.

    The operation is idempotent: an occurrence that is already followed by a
    newline does not receive a second one, so re-applying the function to a
    column that was processed before does not stack blank lines.

    Args:
        text: The string to process.
        phrase: The substring after which a newline is inserted. An empty
            phrase only trims ``text``.

    Returns:
        The processed, stripped string.
    """
    if not phrase:
        # str.replace('', ...) would wedge a newline between every character.
        return text.strip()
    marker = phrase + '\n'
    # First drop the newline from occurrences that already carry one, then
    # add it back everywhere, so each occurrence gains at most one newline.
    return text.replace(marker, phrase).replace(phrase, marker).strip()

# Apply the newline insertion to every review, passing the phrase as an extra
# positional argument instead of wrapping the call in a lambda.
phrase = "but not sure"
df['reviews'] = df['reviews'].apply(insert_newline, args=(phrase,))

# Step 3: Remove special characters
def remove_special_characters(text):
    """Delete a fixed set of symbols from ``text`` and tidy its newlines.

    The characters ``& , * @ $ / | \\ : ; < > # ^`` are removed. Runs of
    consecutive newlines are then collapsed to a single newline, and leading
    and trailing whitespace is stripped.
    """
    without_symbols = re.sub(r'[&,*@$\/|\\:;<>,#^]', '', text)
    single_newlines = re.sub(r'\n+', '\n', without_symbols)
    return single_newlines.strip()

# Clean every review, store the result back on the frame and show the
# cleaned reviews one per line.
cleaned_reviews = df['reviews'].apply(remove_special_characters)
df['reviews'] = cleaned_reviews
print("\nAfter removing special characters:")
print("\n".join(cleaned_reviews))


After removing special characters:
I bought this drug! it's amazing!! but not sure
 about side-effects... Highly recommend!
Data not available


In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Ensure the NLTK tokenizer resources are downloaded.
# 'punkt' serves older NLTK releases; NLTK >= 3.8.2 loads the 'punkt_tab'
# resource in word_tokenize instead and raises LookupError without it.
# Requesting a package the installed downloader index does not know only
# reports an error and returns False, so asking for both is safe.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Step 2: Insert a new line after a given phrase
def insert_newline(text, phrase):
    """Return ``text`` with a newline after every ``phrase``, then trimmed.

    Matching is case-sensitive (plain ``str.replace``). Leading and trailing
    whitespace is stripped from the result.

    The operation is idempotent: an occurrence that is already followed by a
    newline does not receive a second one, so re-applying the function to a
    column that was processed before does not stack blank lines.

    Args:
        text: The string to process.
        phrase: The substring after which a newline is inserted. An empty
            phrase only trims ``text``.

    Returns:
        The processed, stripped string.
    """
    if not phrase:
        # str.replace('', ...) would wedge a newline between every character.
        return text.strip()
    marker = phrase + '\n'
    # First drop the newline from occurrences that already carry one, then
    # add it back everywhere, so each occurrence gains at most one newline.
    return text.replace(marker, phrase).replace(phrase, marker).strip()

# Apply the newline insertion to every review, passing the phrase as an extra
# positional argument instead of wrapping the call in a lambda.
phrase = "but not sure"
df['reviews'] = df['reviews'].apply(insert_newline, args=(phrase,))
# Step 5: Tokenization
def tokenize(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenise each review into a new 'tokens' column
df['tokens'] = df['reviews'].apply(tokenize)

# Show the cleaned reviews next to their tokens
shown_columns = ['reviews', 'tokens']
print("\nAfter removing special characters and tokenizing:\n")
print(df[shown_columns])


After removing special characters and tokenizing:

                                             reviews  \
0  I bought this drug! it's amazing!! but not sur...   
2                                 Data not available   

                                              tokens  
0  [I, bought, this, drug, !, it, 's, amazing, !,...  
2                             [Data, not, available]  


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Step 6: Remove stop words
nltk.download('stopwords')
nltk_stop_words = set(stopwords.words('english'))


def remove_stop_words(tokens):
    """Return the tokens from ``tokens`` that are not English stop words.

    The comparison is case-insensitive: each token is lower-cased before
    being looked up in ``nltk_stop_words``. The returned tokens keep their
    original casing and order.
    """
    return [word for word in tokens if word.lower() not in nltk_stop_words]


# The stop-word set was previously built but never applied, so the step
# named in this cell had no effect on df['tokens'].
df['tokens'] = df['tokens'].apply(remove_stop_words)
print("\nAfter removing stop words:\n")
print("\n".join([str(tokens) for tokens in df['tokens']]))


After removing stop words:

['I', 'bought', 'drug!', 'amazing!!', 'sure', 'side-effects...', 'Highly', 'recommend!']
['Data', 'available']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Step 7: Lemmatization
# Fetch the WordNet corpus through the NLTK downloader, then create the
# module-level lemmatizer instance that lemmatize_words calls.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Return a new list holding the lemmatized form of every token.

    Each token is passed to the module-level ``lemmatizer``'s ``lemmatize``
    method with its default arguments; order is preserved.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Lemmatize each token list and show one list per line
df['tokens'] = df['tokens'].apply(lemmatize_words)
print("\nAfter lemmatization:\n")
print("\n".join(map(str, df['tokens'])))


[nltk_data] Downloading package wordnet to /root/nltk_data...



After lemmatization:

['I', 'bought', 'drug!', 'amazing!!', 'sure', 'side-effects...', 'Highly', 'recommend!']
['Data', 'available']


In [None]:
# Step 8: Joining tokens back into a sentence
# Each token list becomes one space-separated string, row for row.
df['processed_review'] = [' '.join(tokens) for tokens in df['tokens']]
print("\nAfter joining words back:\n")
print("\n".join(df['processed_review']))


After joining words back:

I bought drug! amazing!! sure side-effects... Highly recommend!
Data available


In [None]:
# Step 9: Padding (assuming max length of 10)
# Step 2: Insert a new line after a specific phrase
def insert_newline(text, phrase):
    """Return ``text`` with a newline after every ``phrase``, trimmed and tidied.

    Three steps, in order: append a newline to each (case-sensitive)
    occurrence of ``phrase``; strip leading and trailing whitespace; collapse
    every run of consecutive newlines into a single newline.
    """
    marked = text.replace(phrase, phrase + '\n')
    trimmed = marked.strip()
    # Collapsing runs of newlines also removes the extra newline added when
    # the phrase was already followed by one.
    return re.sub(r'\n+', '\n', trimmed)

# Re-run the newline insertion on the reviews, then set the target token
# count used by the padding step.
phrase = "but not sure"
df['reviews'] = df['reviews'].apply(insert_newline, args=(phrase,))
max_length = 10

def pad_or_truncate(tokens, max_length, pad_token='<PAD>'):
    """Return ``tokens`` cut or padded to exactly ``max_length`` items.

    Args:
        tokens: List of tokens. It is not modified; a new list is returned.
        max_length: Target number of items.
        pad_token: Filler appended when ``tokens`` is shorter than
            ``max_length``. Defaults to ``'<PAD>'``.

    Returns:
        The first ``max_length`` tokens, followed by as many ``pad_token``
        entries as are needed to reach ``max_length``.
    """
    head = tokens[:max_length]
    missing = max_length - len(tokens)
    if missing > 0:
        return head + [pad_token] * missing
    return head

# Pad or truncate every token list to max_length and show one list per line
df['padded_review'] = df['tokens'].apply(pad_or_truncate, args=(max_length,))
print("\nAfter padding:\n")
print("\n".join(map(str, df['padded_review'])))


After padding:

['I', 'bought', 'this', 'drug', '!', 'it', "'s", 'amazing', '!', '!']
['Data', 'not', 'available', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hand-written, already preprocessed reviews paired with their sentiment label
preprocessed_reviews = [
    "I bought drug amazing drugscompany sure sideeffects highly recommend",
    "Sideeffects terrible recommended",
    "Loved result highly effective drug expensive",
    "Drug affordable sure safe",
    "Product review better"
]
sentiments = ['Positive', 'Negative', 'Positive', 'Neutral', 'Neutral']

# One row per review, holding the text and its sentiment label
df_preprocessed = pd.DataFrame({
    'processed_review': preprocessed_reviews,
    'review_sentiment': sentiments
})

# Label-encode the sentiment column into integer codes
label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(df_preprocessed['review_sentiment'])
df_preprocessed['encoded_sentiment'] = encoded

# Display the DataFrame with encoded sentiments
shown_columns = ['processed_review', 'review_sentiment', 'encoded_sentiment']
print("\nDataFrame with encoded sentiments:\n", df_preprocessed[shown_columns])

# Display the label -> code mapping. A fitted LabelEncoder assigns each class
# its position in classes_, so enumerating classes_ yields the same pairs as
# transforming the classes.
print("\nMapping of Sentiments:\n")
for code, sentiment in enumerate(label_encoder.classes_):
    print(f"{sentiment}: {code}")



DataFrame with encoded sentiments:
                                     processed_review review_sentiment  \
0  I bought drug amazing drugscompany sure sideef...         Positive   
1                   Sideeffects terrible recommended         Negative   
2       Loved result highly effective drug expensive         Positive   
3                          Drug affordable sure safe          Neutral   
4                              Product review better          Neutral   

   encoded_sentiment  
0                  2  
1                  0  
2                  2  
3                  1  
4                  1  

Mapping of Sentiments:

Negative: 0
Neutral: 1
Positive: 2
