In [12]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk


# Read the TSV file and convert it to CSV format
df = pd.read_csv('drugsComTrain_raw.tsv', sep='\\t', engine = 'python')
df.to_csv('drugsComTrain_raw.csv', index=False)

# Select the 'condition' and 'review' columns
df = df[['condition', 'review']]

# Select the first 500 rows
df = df.head(500)

# Filter the rows where 'review' column contains string values
df = df[df['review'].apply(lambda x: isinstance(x, str))]

# Function to apply stemming
def apply_stemming(text):
    if isinstance(text, str):  # Check if the input is a string
        ps = PorterStemmer()
        words = word_tokenize(text)
        stemmed_words = [ps.stem(word) for word in words]
        return ' '.join(stemmed_words)
    else:
        return ''  # Return an empty string for non-string values

# Function to apply lemmatization
def apply_lemmatization(text):
    if isinstance(text, str):  # Check if the input is a string
        lemmatizer = WordNetLemmatizer()
        words = word_tokenize(text)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)
    else:
        return ''  # Return an empty string for non-string values

# Function for sentiment analysis
def apply_sentiment_analysis(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    return 'Positive' if sentiment_scores['compound'] >= 0 else 'Negative'


In [2]:
# Display the result
df

Unnamed: 0,condition,review
206461,Left Ventricular Dysfunction,"""""""It has no side effect, I take it in combina..."
95260,ADHD,"""""""My son is halfway through his fourth week o..."
"We have tried many different medications and so far this is the most effective.""""""","April 27, 2010",192
92703,Birth Control,"""""""I used to take another oral contraceptive, ..."
"The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas.""""""","December 14, 2009",17
...,...,...
64773,Anxiety and Stress,"""""""Istarted taking Sertraline 11 months ago du..."
14638,GERD,"""""""Works every time."""""""
199374,Birth Control,"""""""I have had no negative side effects wit thi..."
261,Abnormal Uterine Bleeding,"""""""I have been using the Depo for 2 1/2 years ..."


In [22]:
#Run only once at the start
nltk.data.path.append(r'C:\nltk_data')  # Replace with the path to your nltk_data folder
nltk.download('punkt_tab', download_dir=r'C:\nltk_data')  # Download punkt to the specific folder
nltk.download('wordnet', download_dir=r'C:\nltk_data')  # Download wordnet to the specific folder


[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Apply stemming and lemmatization
#df['review_stemmed'] = df['review'].apply(apply_stemming)
df['review_lemmatized'] = df['review'].apply(apply_lemmatization)

# Display the result
print (df['review_lemmatized'])

206461                                                                                                                            `` `` '' It ha no side effect , I take it in c...
95260                                                                                                                             `` `` '' My son is halfway through his fourth ...
We have tried many different medications and so far this is the most effective."""                                                                                              192
92703                                                                                                                             `` `` '' I used to take another oral contracep...
The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."""                                                   17
                                                                                                    

In [4]:
# Display the result
print (df['review'])

206461                                                                                                                            """It has no side effect, I take it in combina...
95260                                                                                                                             """My son is halfway through his fourth week o...
We have tried many different medications and so far this is the most effective."""                                                                                              192
92703                                                                                                                             """I used to take another oral contraceptive, ...
The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."""                                                   17
                                                                                                    

In [5]:
# Apply sentiment analysis
df['sentiment_score'] = df['review_lemmatized'].apply(apply_sentiment_analysis)

# Display the result
df.head()

Unnamed: 0,condition,review,review_lemmatized,sentiment_score
206461,Left Ventricular Dysfunction,"""""""It has no side effect, I take it in combina...","`` `` '' It ha no side effect , I take it in c...",Positive
95260,ADHD,"""""""My son is halfway through his fourth week o...",`` `` '' My son is halfway through his fourth ...,Positive
"We have tried many different medications and so far this is the most effective.""""""","April 27, 2010",192,192,Positive
92703,Birth Control,"""""""I used to take another oral contraceptive, ...",`` `` '' I used to take another oral contracep...,Negative
"The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas.""""""","December 14, 2009",17,17,Positive


When lemmatization returns invalid or trimmed words, it typically happens because:

Context Missing: Lemmatization depends on knowing the part of speech (POS) of the word to find its correct lemma. Without the correct POS tagging, the lemmatizer might produce a lemma that is incorrect or overly trimmed.

For example, the word "better" could be lemmatized to "good" if it's understood as an adjective, but if no POS tag is provided, it might be returned incorrectly.

Ambiguity in Word Forms: The lemmatizer may not have enough context to choose the correct base form of a word, leading to incorrect or shortened versions.

Incorrect Input Data: If the input text has non-standard characters, incomplete words, or special formatting (like punctuation marks or numbers), the lemmatizer might return improperly trimmed words.

The WordNetLemmatizer from NLTK requires the correct part of speech (POS) to accurately lemmatize words. If you don't specify the POS tag, it defaults to "noun," which may cause issues. You can use NLTK’s pos_tag function to tag each word before lemmatization.

In [11]:
# Preprocess the text (remove non-alphabetic characters and convert to lowercase)
import re
def preprocess_text(text):
    # Remove non-alphabetic characters (punctuation, numbers, special characters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase for consistency
    text = text.lower()
    return text

In [15]:
df['review_cleaned'] = df['review'].apply(preprocess_text)

# Display the result
print (df['review_cleaned'])

206461                                                                                                                            it has no side effect i take it in combination...
95260                                                                                                                             my son is halfway through his fourth week of i...
We have tried many different medications and so far this is the most effective."""                                                                                                 
92703                                                                                                                             i used to take another oral contraceptive whic...
The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."""                                                     
                                                                                                    

In [16]:
#Apply Lemmatization to pre-processed text
df['review_lemmatized'] = df['review_cleaned'].apply(apply_lemmatization)

# Display the result
print (df['review_lemmatized'])

206461                                                                                                                            it ha no side effect i take it in combination ...
95260                                                                                                                             my son is halfway through his fourth week of i...
We have tried many different medications and so far this is the most effective."""                                                                                                 
92703                                                                                                                             i used to take another oral contraceptive whic...
The positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."""                                                     
                                                                                                    

In [5]:
#Try all in one and save the results to a CSV file
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('vader_lexicon)

# Load the dataset
file_path = 'drugsComTrain_raw.csv'
df = pd.read_csv(file_path)

# Step 1: Select the 'review' column
df = df[['review']].dropna()

# Select the first 500 rows
df = df.head(500)

# Step 1: Preprocess the text (cleaning)
def preprocess_text(text):
    # Remove non-alphabetic characters (punctuation, numbers, special characters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text


# Step 3: Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Apply the cleaning and lemmatization
df['cleaned_review'] = df['review'].apply(preprocess_text)
df['lemmatized_review'] = df['cleaned_review'].apply(lemmatize_text)

# Step 4: Sentiment Analysis
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    sentiment_score = sia.polarity_scores(text)
    return sentiment_score['compound']

# Apply sentiment analysis
df['sentiment_score'] = df['lemmatized_review'].apply(sentiment_analysis)

# Show the first 5 results
print(df[['review', 'lemmatized_review', 'sentiment_score']].head())

# Save the results to a new CSV file
df.to_csv('drug_reviews_lemmatized_sentiment.csv', index=False)

                                              review  \
0  """It has no side effect, I take it in combina...   
1  """My son is halfway through his fourth week o...   
2                                                192   
3  """I used to take another oral contraceptive, ...   
4                                                 17   

                                   lemmatized_review  sentiment_score  
0  it ha no side effect i take it in combination ...           0.0516  
1  my son is halfway through his fourth week of i...           0.8598  
2                                                              0.0000  
3  i used to take another oral contraceptive whic...          -0.5029  
4                                                              0.0000  


# Include POS tagging... 
The pos_tag function assigns a POS tag to each word, which helps determine whether the word is a noun, verb, adjective, etc.

In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import pos_tag

# Download necessary NLTK data
nltk.data.path.append(r'C:\nltk_data')  # Replace with the path to your nltk_data folder
#nltk.download('punkt')
#nltk.download('wordnet')
nltk.download('vader_lexicon', download_dir=r'C:\nltk_data')
nltk.download('averaged_perceptron_tagger_eng', download_dir=r'C:\nltk_data')

# Load the dataset
file_path = 'drugsComTrain_raw.csv'
df = pd.read_csv(file_path)

# Step 1: Select the 'review' column
df = df[['review']].dropna()

# Select the first 500 rows
df = df.head(500)

# Step 1: Preprocess the text (cleaning)
def preprocess_text(text):
    # Remove non-alphabetic characters (punctuation, numbers, special characters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

# Helper function to convert POS tag to format that WordNetLemmatizer can understand
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if no match

# Step 3: Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text_with_pos(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)  # Get POS tags for the text
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized_words)

# Apply the cleaning and lemmatization
df['cleaned_review'] = df['review'].apply(preprocess_text)
df['lemmatized_review'] = df['cleaned_review'].apply(lemmatize_text_with_pos)

# Step 4: Sentiment Analysis
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    sentiment_score = sia.polarity_scores(text)
    return sentiment_score['compound']

# Apply sentiment analysis
df['sentiment_score'] = df['lemmatized_review'].apply(sentiment_analysis)

# Show the first 5 results
print(df[['review', 'lemmatized_review', 'sentiment_score']].head())

# Save the results to a new CSV file
df.to_csv('drug_reviews_lemmatized_sentiment.csv', index=False)

[nltk_data] Downloading package vader_lexicon to C:\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


                                              review  \
0  """It has no side effect, I take it in combina...   
1  """My son is halfway through his fourth week o...   
2                                                192   
3  """I used to take another oral contraceptive, ...   
4                                                 17   

                                   lemmatized_review  sentiment_score  
0  it have no side effect i take it in combinatio...          -0.2960  
1  my son be halfway through his fourth week of i...           0.8281  
2                                                              0.0000  
3  i use to take another oral contraceptive which...          -0.5932  
4                                                              0.0000  
