In [None]:
import pandas as pd
import numpy as np
import nltk
import re

In [None]:
# Load the raw hospital comments export.
# The encoding name previously carried a stray double quote ('utf-8"'); it is
# now the plain codec name. `encoding_errors='ignore'` silently drops any bytes
# that are not valid UTF-8 instead of raising.
df = pd.read_csv('sample_data/hospital-comments-report.csv',
                 encoding='utf-8', engine='python', encoding_errors='ignore')

In [None]:
# Show the first three rows of the freshly loaded frame.
df.head(3)

Unnamed: 0,Comment Title,Liked,Disliked,Sentiment
0,All the staff were exceptionally friendly and ...,,,Positive
1,Very professional but with a friendly approach...,,,Positive
2,"Friendly Staff at all times, they put me at ea...",,,Positive


In [None]:
# Number of rows in the loaded frame.
len(df)

13771

In [None]:
# Count the missing values in each column.
df.isnull().sum()

Comment Title      357
Liked             1549
Disliked          3174
Sentiment          346
dtype: int64

In [None]:
# Discard the 'Liked' column.
df = df.drop(columns='Liked')

In [None]:
# Discard the 'Disliked' column.
df = df.drop(columns='Disliked')

In [None]:
# Show the first rows of the frame after the column drops.
df.head()

Unnamed: 0,Comment Title,Sentiment
0,All the staff were exceptionally friendly and ...,Positive
1,Very professional but with a friendly approach...,Positive
2,"Friendly Staff at all times, they put me at ea...",Positive
3,The care and consideration I received from the...,Positive
4,"Everyone from the reception staff,pre-op nurse...",Positive


In [None]:
# Relabel the remaining columns, in order, with short lowercase names.
df.columns = pd.Index(['comment', 'sentiment'])

In [None]:
# Show up to the first 50 rows under the new column names.
df.head(50)

Unnamed: 0,comment,sentiment
0,All the staff were exceptionally friendly and ...,Positive
1,Very professional but with a friendly approach...,Positive
2,"Friendly Staff at all times, they put me at ea...",Positive
3,The care and consideration I received from the...,Positive
4,"Everyone from the reception staff,pre-op nurse...",Positive
5,Going into hospital is a daunting experience b...,Positive
6,"friendly staff, efficient sevice, very relaxed...",Positive
7,The beautiful surroundings and the hospital it...,Positive
8,"I enjoyed my stay,the staff were wonderful,the...",Positive
9,I attended the treatment centre for surgery an...,Positive


In [None]:
# Frequency of each sentiment label (value_counts leaves out missing labels by default).
df['sentiment'].value_counts()

Positive    12275
Negative     1146
Neutral         4
Name: sentiment, dtype: int64

In [None]:
# Drop rows labelled "I can't tell". `.copy()` makes the filtered result an
# independent frame, so later column assignments on `df` do not operate on a
# slice of the unfiltered frame (the SettingWithCopyWarning situation).
df = df[df.sentiment != "I can't tell"].copy()

In [None]:
# Features are the comment text; targets are the sentiment labels.
X, y = df['comment'], df['sentiment']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. A fixed `random_state` makes the
# shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

In [None]:
# Split 20% of the current training portion off as a validation set. A fixed
# `random_state` keeps this split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .2, random_state = 42)

In [None]:
# Report how many samples ended up in each split.
for split_name, split in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set size:", len(split))

Training set size: 8812
Validation set size: 2204
Test set size: 2755


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def ascii_only(text):
    """Return `text` with every non-ASCII character removed."""
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode()

In [None]:
def make_lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_links(text):
    """Replace every link-like substring of `text` with a single space.

    A match is an optional ``http...://`` scheme, an optional ``www.<word>``
    prefix, a ``<word>.<tld>`` part where the tld is one of
    com/co/ly/ch/org/net, and an optional single ``/<word>`` path segment.
    Note that any ``word.tld`` sequence matches, so text with no space after a
    full stop (e.g. "good.comfortable") is also affected.
    """
    # Raw string: the pattern contains regex escapes (\w, \/, \.) that are
    # invalid escape sequences in an ordinary string literal.
    p = r'((http\w*:\/\/)?(www\.\w+)?(\w+\.(com|co|ly|ch|org|net)+)(\/\w+)?)'
    return re.sub(p, ' ', text)

In [None]:
def remove_html(text):
    """Replace each "symbol + word" run in `text` with a single space.

    A run is one character that is not an ASCII letter, digit or space,
    followed by one or more word characters. This covers the start of HTML
    entities such as ``&amp`` but is not limited to HTML.

    NOTE(review): the same pattern also removes ordinary words that directly
    follow punctuation, e.g. "stay,the" -> "stay " and "pre-op" -> "pre ".
    Confirm that this is intended before relying on it as an HTML-only filter.
    """
    # Raw string: \w is an invalid escape sequence in an ordinary string literal.
    return re.sub(r"[^A-Za-z0-9 ]\w+", ' ', text)

In [None]:
def remove_mentions(text):
    """Replace every ``@`` and the word characters directly after it with a space."""
    # Raw string: \w is an invalid escape sequence in an ordinary string literal.
    return re.sub(r"(@\w*)", ' ', text)

In [None]:
# Remove unnecessary punctuations
import string
# Characters deleted by remove_punctuations: ASCII punctuation, the '�'
# character, and the decimal digits.
punctuations = string.punctuation + '�' + string.digits

def remove_punctuations(text, punctuations):
    """Delete from `text` every character that occurs in `punctuations`.

    Characters are removed outright, not replaced by a space.
    """
    return text.translate({ord(ch): None for ch in punctuations})

In [None]:
def remove_brandwords(text):
    """Replace brand / tech-product words in `text` with a single space.

    A match is an optional leading ``#``, one of the listed brand words at the
    start of a word, and any lower-case letters or digits that follow it
    (so "iphone4" and "ipads" are removed whole). The pattern is
    case-sensitive, so `text` is expected to be lower-cased already.

    Two defects of the earlier pattern are fixed here:
    * it was a triple-quoted string, so the line break and indentation became
      part of the ``atari`` alternative, which therefore never matched;
    * it was not anchored, so short entries such as ``mac`` and ``pc`` cut
      pieces out of unrelated words ("stomach" -> "sto ").

    Words that merely *start* with an entry (e.g. "machine") are still
    removed, as before.
    """
    p = (r'#?\b(?:iphone|ipad|sxsw|hcsm|google|apple|cisco|austin|'
         r'atari|intel|mac|pc|blackberry|android|linux|ubuntu)[a-z0-9]*')
    return re.sub(p, ' ', text)

In [None]:
import nltk
# Download NLTK's 'stopwords' corpus, which stopwords.words() below reads from.
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# English stop words, as provided by NLTK.
sw = stopwords.words('english')

def lemmatize(x, sw):
    """Drop stop words from `x` and lemmatize the remaining tokens.

    `x` is split on whitespace; tokens found in `sw` are discarded, every other
    token is passed through WordNetLemmatizer.lemmatize, and the results are
    joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    kept = []
    for token in x.split():
        if token in sw:
            continue
        kept.append(wnl.lemmatize(token))
    return ' '.join(kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocessing(df, punctuations, stopwords):
    """Run the text-cleaning pipeline over ``df['comment']`` and return `df`.

    The steps are applied in this order: ascii_only, make_lower, remove_links,
    remove_html, remove_mentions, remove_punctuations, remove_brandwords,
    lemmatize.

    Parameters
    ----------
    df : DataFrame with a 'comment' column of strings. The column is
        overwritten on the frame that is passed in.
    punctuations : str of characters handed to remove_punctuations.
    stopwords : collection of stop words handed to lemmatize. (Previously this
        argument was ignored and the global `sw` was used instead.) Inside this
        function the name shadows the `nltk.corpus.stopwords` import.

    Returns
    -------
    The same `df` object, with the cleaned 'comment' column.
    """
    steps = (
        ascii_only,
        make_lower,
        remove_links,
        remove_html,
        remove_mentions,
        lambda text: remove_punctuations(text, punctuations),
        remove_brandwords,
        lambda text: lemmatize(text, stopwords),
    )
    for step in steps:
        df['comment'] = df['comment'].apply(step)

    return df

In [None]:
# Make every comment a string. Missing comments are filled with '' first,
# because astype(str) alone would turn NaN into the literal text "nan", which
# would then be treated as a real word by the cleaning steps.
df['comment'] = df['comment'].fillna('').astype(str)

In [None]:
import nltk
# Download the WordNet corpus used by the WordNetLemmatizer inside `lemmatize`.
nltk.download('wordnet')
# Run the cleaning pipeline on the comment column.
# NOTE(review): pd.DataFrame(df) may not deep-copy `df`, and `preprocessing`
# assigns to the frame it is given, so `df` itself may end up modified as
# well — confirm for the pandas version in use.
dataset = preprocessing(pd.DataFrame(df), punctuations, sw)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Show up to the first 50 rows of the cleaned frame.
dataset.head(50)

Unnamed: 0,comment,sentiment
0,staff exceptionally friendly made feel ease,Positive
1,professional friendly approach staff asked bet...,Positive
2,friendly staff time put ease arrival nothing m...,Positive
3,care consideration received medical house staf...,Positive
4,everyone reception staff nurse catering staff ...,Positive
5,going hospital daunting experience stay agph r...,Positive
6,friendly staff efficient sevice relaxed atmosp...,Positive
7,beautiful surroundings hospital world found wh...,Positive
8,enjoyed stay staff wonderful food good time or...,Positive
9,attended treatment centre surgery found hospit...,Positive


In [None]:
# Persist the cleaned frame; the row index is written as the first CSV column.
dataset.to_csv('preprocessed_hospital-comments-report_updated.csv', index=True)