In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from bs4 import BeautifulSoup
import unicodedata
#import contractions
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import joblib

In [4]:
data = pd.read_csv("reviews.csv", engine = 'python')

In [5]:
def strip_html_tag(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

In [6]:
# remove accented characters
def strip_accents(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [7]:
# remove special characters
def strip_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text

In [8]:
CONTRACTION_MAP = {"ain't": "is not","aren't": "are not", "can't": "cannot", "can't've": "cannot have",
                   "'cause": "because", "dr" : "doctor",
                   "could've": "could have",
                   "couldn't": "could not",
                   "couldn't've": "could not have",
                   "didn't": "did not",
                   "doesn't": "does not",
                   "don't": "do not",
                   "hadn't": "had not",
                   "hadn't've": "had not have",
                   "hasn't": "has not",
                   "haven't": "have not",
                   "he'd": "he would",
                   "he'd've": "he would have",
                   "he'll": "he will",
                   "he'll've": "he will have",
                   "he's": "he is",
                   "how'd": "how did",
                   "how'd'y": "how do you",
                   "how'll": "how will",
                   "how's": "how is",
                   "I'd": "I would",
                   "I'd've": "I would have",
                   "I'll": "I will",
                   "I'll've": "I will have",
                   "I'm": "I am",
                   "I've": "I have",
                   "i'd": "i would",
                   "i'd've": "i would have",
                   "i'll": "i will",
                   "i'll've": "i will have",
                   "i'm": "i am",
                   "i've": "i have",
                   "isn't": "is not",
                   "it'd": "it would",
                   "it'd've": "it would have",
                   "it'll": "it will",
                   "it'll've": "it will have",
                   "it's": "it is",
                   "let's": "let us",
                   "ma'am": "madam",
                   "mayn't": "may not",
                   "might've": "might have",
                   "mightn't": "might not",
                   "mightn't've": "might not have",
                   "must've": "must have",
                   "mustn't": "must not",
                   "mustn't've": "must not have",
                   "needn't": "need not",
                   "needn't've": "need not have",
                   "o'clock": "of the clock",
                   "oughtn't": "ought not",
                   "oughtn't've": "ought not have",
                   "shan't": "shall not",
                   "sha'n't": "shall not",
                   "shan't've": "shall not have",
                   "she'd": "she would",
                   "she'd've": "she would have",
                   "she'll": "she will",
                   "she'll've": "she will have",
                   "she's": "she is",
                   "should've": "should have",
                   "shouldn't": "should not",
                   "shouldn't've": "should not have",
                   "so've": "so have",
                   "so's": "so as",
                   "that'd": "that would",
                   "that'd've": "that would have",
                   "that's": "that is",
                   "there'd": "there would",
                   "there'd've": "there would have",
                   "there's": "there is",
                   "they'd": "they would",
                   "they'd've": "they would have",
                   "they'll": "they will",
                   "they'll've": "they will have",
                   "they're": "they are",
                   "they've": "they have","to've": "to have",
                   "wasn't": "was not",
                   "we'd": "we would",
                   "we'd've": "we would have",
                   "we'll": "we will",
                   "we'll've": "we will have",
                   "we're": "we are",
                   "we've": "we have",
                   "weren't": "were not",
                   "what'll": "what will",
                   "what'll've": "what will have",
                   "what're": "what are",
                   "what's": "what is",
                   "what've": "what have",
                   "when's": "when is",
                   "when've": "when have",
                   "where'd": "where did",
                   "where's": "where is",
                   "where've": "where have",
                   "who'll": "who will",
                   "who'll've": "who will have",
                   "who's": "who is",
                   "who've": "who have",
                   "why's": "why is",
                   "why've": "why have",
                   "will've": "will have",
                   "won't": "will not",
                   "won't've": "will not have",
                   "would've": "would have",
                   "wouldn't": "would not",
                   "wouldn't've": "would not have",
                   "y'all": "you all",
                   "y'all'd": "you all would",
                   "y'all'd've": "you all would have",
                   "y'all're": "you all are",
                   "y'all've": "you all have",
                   "you'd": "you would",
                   "you'd've": "you would have",
                   "you'll": "you will",
                   "you'll've": "you will have",
                   "you're": "you are",
                   "you've": "you have"}

In [9]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [10]:
#pos tagging
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
def clean_review(text):
    #remove html tags
    text = strip_html_tag(text)

    #convert accented characters
    text = strip_accents(text)
  
    #expand contractions
    text = expand_contractions(text)

    #lower case 
    text = text.lower()
    
    #remove special characters
    text = strip_special_characters(text)

    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]

    # remove stop words
    stopword_list = set(stopwords.words('english'))
    stopword_list.remove('not')
    stopword_list.remove('no')

  
    text = [x for x in text if x not in stopword_list]
 
    # pos tag text
    pos_tags = pos_tag([word for word in text if word])

    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]

    text = " ".join(text)
    return(text)

In [12]:
data["review_clean"] = data["Reviews"].apply(lambda x: clean_review(x))

In [13]:
corpus = data["review_clean"].values

In [14]:
def sent(data):
    if data['Rating'] < 3:
        val = 'neg' #0
        
    elif data['Rating'] >= 3:
        val = 'pos' #1
    else:
        val = 'neutral'
    return val
data['sentiment'] = data.apply(sent, axis=1)

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [16]:
y = data['sentiment'].values

In [17]:
y = le.fit_transform(y)

In [18]:
reviews = data['review_clean'].tolist()
labels = y.tolist()

In [19]:
# Creating Bag of Words model
cv = CountVectorizer()
reviews_cv = cv.fit_transform(reviews)

In [20]:
# Creating a pickle file for the CountVectorizer model
joblib.dump(cv, "cv.pkl")


# Model Building
X_train, X_test, y_train, y_test = train_test_split(
    reviews_cv, labels, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
classifier = MultinomialNB(alpha=0.2)
classifier.fit(X_train, y_train)


# Creating a pickle file for the Multinomial Naive Bayes model
joblib.dump(classifier, "model_nb.pkl")

['model_nb.pkl']

In [21]:
#Logistic regression
lr_classifier =  LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)
joblib.dump(lr_classifier, "model_lr.pkl")

['model_lr.pkl']

In [22]:
svc_classifier = LinearSVC()
svc_classifier.fit(X_train,y_train)
joblib.dump(svc_classifier,"model_svc.pkl")

['model_svc.pkl']

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
training_accuracy_mnb = accuracy_score(y_train, classifier.predict(X_train))
training_accuracy_lr = accuracy_score(y_train, lr_classifier.predict(X_train))
training_accuracy_svc = accuracy_score(y_train, svc_classifier.predict(X_train))
print("mnb: ", training_accuracy_mnb)
print("lr: ", training_accuracy_lr)
print("svc: ", training_accuracy_svc)

mnb:  0.9777992277992278
lr:  0.9927606177606177
svc:  0.9927606177606177


In [24]:
mnb_prediction = classifier.predict(X_test)
lr_prediction = lr_classifier.predict(X_test)
svc_prediction =svc_classifier.predict(X_test)

In [25]:
model = joblib.load(open('model_nb.pkl', 'rb'))
cv = joblib.load(open('cv.pkl', 'rb'))


In [26]:
testing_accuracy_mnb = accuracy_score(y_test, mnb_prediction)
lr_test_accuracy = accuracy_score(y_test, lr_prediction)
svc_test_accuracy = accuracy_score(y_test, svc_prediction)
print("mnb: ", testing_accuracy_mnb)
print("lr: ", lr_test_accuracy)
print("Svc: ", svc_test_accuracy)

mnb:  0.8783783783783784
lr:  0.8474903474903475
Svc:  0.8436293436293436


In [27]:
print(classification_report(y_test, mnb_prediction))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       166
           1       0.91      0.91      0.91       352

    accuracy                           0.88       518
   macro avg       0.86      0.86      0.86       518
weighted avg       0.88      0.88      0.88       518



In [28]:
LABELS = ['Negative', 'Positive']
def get_prediction(review):
    # Preprocessing
    #print(type(review))
    review_data = clean_review(review)
    #print("clean: ",review_data)
    vectorizer = cv.transform([review_data]).toarray()
    prediction = model.predict(vectorizer)
    pred_labels = LABELS[int(prediction[0])]
    
    
    print('REVIEW: ', review, '\nPREDICTION: ', prediction , '\nLabel: ',pred_labels)

In [29]:
get_prediction(""" Doctors are fine. Billing/Insurance staff is unprofessional and incompetent to a level that I have never experienced at any other medical office. Aside from lacking basic decency and common sense, I had to file a formal complaint as the staff violated numerous HIPAA laws. This place is a mess. """)

REVIEW:   Doctors are fine. Billing/Insurance staff is unprofessional and incompetent to a level that I have never experienced at any other medical office. Aside from lacking basic decency and common sense, I had to file a formal complaint as the staff violated numerous HIPAA laws. This place is a mess.  
PREDICTION:  [0] 
Label:  Negative
