In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Create a Scraper function
def scraper(base_url):
    # Web scraping - fetching the reviews from the webpage using BeautifulSoup

    # loop through a range of page numbers
    for i in range(1):

        # Creating an empty list to store the reviews of each page
        pagewise_reviews = []

        # Query parameter
        query_parameter = "?page=" + str(i)

        # Constructing the URL
        url = base_url + query_parameter

        # Send HTTP request to the URL
        response = requests.get(url)

        # Create a soup object and parse the HTML page
        soup = bs(response.content, 'html.parser')

        # Finding all the elements having reviews using class attribute
        rev_div = soup.findAll("div", attrs={"class", "media-body"})

        # loop through all the divs and append
        for j in range(len(rev_div)):
            # finding all the p tags to fetch only the review text
            pagewise_reviews.append(rev_div[j].text)

        # writing all the reviews into a list
        for k in range(len(pagewise_reviews)):
            all_pages_reviews.append(pagewise_reviews[k])

        # return the final list of reviews
    return all_pages_reviews

In [3]:
# Define the base URL
df_link = pd.read_csv(r"URL_Link.tab", delimiter='\t')

In [4]:
all_review_df = pd.DataFrame()
for index, link in df_link.iterrows():
    url = link["Link"]

    # Create an empty list to store all review
    all_pages_reviews = []
    # Driver code
    reviews = scraper(url)

    # Storing in a dataframe
    reviews_df = pd.DataFrame({'Name': link["Name"], 'Review': reviews})
    all_review_df = all_review_df.append(reviews_df)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [5]:
# all_review_df.to_csv("reviews.txt", index=False)

In [6]:
def replace_all(text, replaced_list, new):
    for word in replaced_list:
        text = text.replace(word, new)
    return text

In [7]:
all_review_df

Unnamed: 0,Name,Review
0,University of Malaya Medical Centre,5Kang N. 10 months ago Was admitted to the em...
1,University of Malaya Medical Centre,5M. 5 months ago Dr Rajwin frm public health ...
2,University of Malaya Medical Centre,5Chioma A. 1 year ago Am so happy I went to t...
3,University of Malaya Medical Centre,5Super Hunter Malaysia V. 1 year ago Fresh Sq...
4,University of Malaya Medical Centre,5Kev L. 1 year ago Very good hospital. Price ...
...,...,...
46,Columbia Asia Hospital - Petaling Jaya,4azrin z. 9 months ago Columbia Asia PJ is ve...
47,Columbia Asia Hospital - Petaling Jaya,"4Warren C. 9 months ago Dr Bheena is 5 stars,..."
48,Columbia Asia Hospital - Petaling Jaya,4Kannan R. 9 months ago Good service 🙂
49,Columbia Asia Hospital - Petaling Jaya,4Aishwarya R. 9 months ago I have been coming...


In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, SpaceTokenizer
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

def word_tokenize_remove_stopwords_lemmatize(example_sent):
    space_tokenizer = SpaceTokenizer()
    stop_words = set(stopwords.words('english'))

    word_tokens = space_tokenizer.tokenize(example_sent)

    
    lemmatized = []
    
    for w in word_tokens:
        lemmatized.append(lemmatizer.lemmatize(w))
    
    filtered_sentence = [w.lower() for w in lemmatized if not w.lower() in stop_words]
    
    filtered_sentence = ' '.join([str(f) for f in filtered_sentence])
    
    return filtered_sentence

word_tokenize_remove_stopwords_lemmatize("doctors is are bad i cb, cannot tahan can't")

"doctor bad cb, cannot tahan can't"

In [9]:
import emoji
def remove_emoji(string):
    return emoji.get_emoji_regexp().sub(u'', string)

In [10]:
df_all_sentence_list = pd.DataFrame()
for index, row in all_review_df.iterrows():
    sentence = str(row["Review"])
    sentence = remove_emoji(sentence)
    sentence = replace_all(sentence, ["Dr.", "dr."], "Dr")
    sentence = replace_all(sentence, ["months ago", "month ago", "years ago", "year ago", ".", "!", "?", "\n"], "#####")
    sentence = replace_all(sentence, ["(Translated by Google)"], "")
    sentence = sentence.split("(Original)")[0]
    sentence_list = sentence.split("#####")
    sentence_list = sentence_list[2:]
    sentence_list =  [i.strip() for i in sentence_list]
    sentence_list = [word_tokenize_remove_stopwords_lemmatize(i) for i in sentence_list]
    for sen in sentence_list:
        if sen != "":
            df_all_sentence_list = df_all_sentence_list.append(pd.DataFrame([{"Name": row["Name"], "Sentence": sen}]))
df_all_sentence_list = df_all_sentence_list.reset_index(drop=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
df_result = df_all_sentence_list.copy()

In [12]:
df_result["Sentence_Category"] = np.where(df_result["Sentence"].str.contains("Dr|Doctors|Doctor|dr|doctors|doctor"), "Doctor", "Hospital")

In [13]:
def sentiment_analysis(df_result, column):
    sentence_sentiment_compound_score_list = []
    sentence_sentiment_result_list = []
    sid = SentimentIntensityAnalyzer()

    for word in df_result[column]:
        compound_score = sid.polarity_scores(word)['compound']
        sentence_sentiment_compound_score_list.append(compound_score)
        if compound_score >= 0.3:
            sentence_sentiment_result_list.append("Positive")
        elif compound_score <= -0.3:
            sentence_sentiment_result_list.append("Negative")
        else:
            sentence_sentiment_result_list.append("Neutral")
    return sentence_sentiment_compound_score_list, sentence_sentiment_result_list
sentence_sentiment_compound_score_list, sentence_sentiment_result_list = sentiment_analysis(df_result, "Sentence")

In [14]:
df_result["Compound_Score"] = sentence_sentiment_compound_score_list
df_result["Sentiment_Result"] = sentence_sentiment_result_list

In [15]:
df_word_frequency = pd.DataFrame()
for index, row in df_result.iterrows():
    current_name = row["Name"]
    current_sentence = replace_all(row["Sentence"], [",", "(", ")", "*"], "").split(" ")
    for s in current_sentence:
        if s != "":
            df_word_frequency = df_word_frequency.append([{"Name": current_name, "Word": s}])
df_word_frequency = df_word_frequency.reset_index(drop=True)

In [16]:
df_word_frequency = pd.DataFrame(df_word_frequency.groupby(["Name", "Word"]).size().reset_index())
df_word_frequency.columns = ["Name", "Word", "Frequency"]

In [17]:
word_sentiment_score, word_sentiment_result = sentiment_analysis(df_word_frequency, "Word")

In [18]:
df_word_frequency["Word_Compound_Score"] = word_sentiment_score
df_word_frequency["Word_Sentiment_Result"] = word_sentiment_result

In [19]:
df_word_frequency

Unnamed: 0,Name,Word,Frequency,Word_Compound_Score,Word_Sentiment_Result
0,Columbia Asia Hospital - Petaling Jaya,&,4,0.0,Neutral
1,Columbia Asia Hospital - Petaling Jaya,-,2,0.0,Neutral
2,Columbia Asia Hospital - Petaling Jaya,10,2,0.0,Neutral
3,Columbia Asia Hospital - Petaling Jaya,1045am,1,0.0,Neutral
4,Columbia Asia Hospital - Petaling Jaya,11,2,0.0,Neutral
...,...,...,...,...,...
2664,University of Malaya Medical Centre,yg,1,0.0,Neutral
2665,University of Malaya Medical Centre,you,3,0.0,Neutral
2666,University of Malaya Medical Centre,young,3,0.0,Neutral
2667,University of Malaya Medical Centre,zero,2,0.0,Neutral


In [20]:
df_word_frequency = df_word_frequency.sort_values(by=["Name", "Frequency"], ascending=False)

In [21]:
df_word_frequency = df_word_frequency.reset_index(drop=True)

In [22]:
df_word_frequency

Unnamed: 0,Name,Word,Frequency,Word_Compound_Score,Word_Sentiment_Result
0,University of Malaya Medical Centre,patient,31,0.000,Neutral
1,University of Malaya Medical Centre,doctor,30,0.000,Neutral
2,University of Malaya Medical Centre,wa,26,0.000,Neutral
3,University of Malaya Medical Centre,hospital,18,0.000,Neutral
4,University of Malaya Medical Centre,time,15,0.000,Neutral
...,...,...,...,...,...
2664,Columbia Asia Hospital - Petaling Jaya,xrays,1,0.000,Neutral
2665,Columbia Asia Hospital - Petaling Jaya,yeah,1,0.296,Neutral
2666,Columbia Asia Hospital - Petaling Jaya,you’ll,1,0.000,Neutral
2667,Columbia Asia Hospital - Petaling Jaya,️,1,0.000,Neutral


In [23]:
df_result.to_csv("Sentiment_Analysis_Result.csv", index=False)

In [24]:
df_word_frequency.to_csv("Word_Frequency_Result.csv", index=False)