In [1]:
import re
from sklearn import tree
from matplotlib import pyplot as plt
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import pandas as pd
import numpy as np

In [None]:
class DataFrameMods:
    """Helpers for turning categorical values into integer identifiers."""

    @staticmethod
    def set_id(key, ids):
        """Return the zero-based position of ``key`` inside ``ids``.

        ``ids`` must provide an ``index`` method (a list in this notebook).
        A ``ValueError`` from ``ids.index`` propagates when ``key`` is absent.
        """
        position = ids.index(key)
        return position
    

class TextManipulator:
    """Rule-based normalisation and sentiment counting for free-text reviews.

    ``manipulate`` lower-cases a review, strips noise, then rewrites known
    positive phrases to the token ``good`` and known negative phrases to
    ``bad``. ``count_result`` compares how often each token occurs.
    """

    # Phrases rewritten to "good". Alternatives are tried left to right at each
    # position, so a longer phrase must precede a shorter one sharing its
    # prefix (e.g. "okay" before "ok").
    _POSITIVE_PHRASES = (
        r'better', r'not bad', r'great', r'had no side effects', r'saved me',
        r'quick reduction of symptoms', r'it does work', r'helped me',
        r'allowed me to stay awake', r'cleared up my acne',
        r'drug is pretty amazing', r'no problems?', r'happy', r'best',
        r'amazing', r'insane', r'very well', r'super', r'cool', r'fine',
        r'given me my life back', r'well', r'(?:no|minimize) side effects?',
        r'love this', r'recommend (?:it )?(?:to|for) everyone', r'no negative',
        r'positive', r'happiest person ever', r'fantastic', r'works for me',
        r'nice', r'okay', r'ok', r'miracle', r'not that bad',
        r'recommend \S* this', r'accept', r'brilliant', r'wonderful',
        r'very effective', r'fabulous', r'genius', r'healed', r'impressive',
        r'satisfied', r'outstanding', r'saved my life', r'it works',
    )

    # Phrases rewritten to "bad". This pass runs AFTER the positive pass, so
    # "not good" also catches negated positives such as "not happy", which the
    # first pass has already turned into "not good".
    # NOTE(review): several entries here ("no problems", "not cool", ...) are
    # consumed by the positive pass first and look unreachable on their own.
    _NEGATIVE_PHRASES = (
        r'not good', r'no reaction', r'no problems', r'waste of money',
        r'it didnt seem to have any effect', r'horrible breakouts',
        r'horrible acne', r'boobs would hurt', r'wired', r'less effective',
        r'least effective', r'didnt helped me', r'nightmare', r'horrible',
        r'this is not it', r'not very well', r'worst', r'not cool',
        r'not fine', r'(?:was(?: a)?)? side effects are', r'not happy',
        r'painful', r'absolutely done taking this',
        r'will not be taking this any longer', r'(?:dont|not) works for me',
        r'terrible', r'nothing to prevent', r'not ok', r'not okay', r'worse',
        r'massacre', r'suffering', r'not recommend', r'almost killed me',
        r'not impressive', r'angry', r'fail', r'hate', r'stupid',
        r'disappointed', r'dissatisfied', r'does not work', r'does not help',
    )

    # A leading \b stops matches that start mid-word ("ok" in "took",
    # "satisfied" in "dissatisfied"). No trailing \b, so inflected forms such
    # as "failed" or "hated" still match. IGNORECASE keeps the method usable
    # on text that was not lower-cased first.
    _POSITIVE_RE = re.compile(r'\b(?:' + '|'.join(_POSITIVE_PHRASES) + r')', re.IGNORECASE)
    _NEGATIVE_RE = re.compile(r'\b(?:' + '|'.join(_NEGATIVE_PHRASES) + r')', re.IGNORECASE)

    # HTML entities, three whole-word stop words, and a few punctuation marks.
    _NOISE_RE = re.compile(r"&[^;]{3,6};|\b(?:but|why|where)\b|['./?:-]")

    @staticmethod
    def strip_spaces(text):
        """Collapse every whitespace run to one space and trim both ends."""
        return " ".join(text.split())

    @staticmethod
    def replace_words(text):
        """Rewrite sentiment phrases to the tokens ``good`` and ``bad``.

        Positive phrases are replaced first, negative phrases second; the
        order matters because the negative pass relies on "not good" to catch
        negated positive phrases.
        """
        text = TextManipulator._POSITIVE_RE.sub('good', text)
        text = TextManipulator._NEGATIVE_RE.sub('bad', text)
        return text

    @staticmethod
    def remove_unnecessary_words(text):
        """Delete HTML entities, the words but/why/where, and ``' . / ? : -``.

        Stop words are removed only as whole words, so "contribute" or
        "nowhere" are left intact. Surrounding spaces are not collapsed here.
        """
        return TextManipulator._NOISE_RE.sub("", text)

    @staticmethod
    def manipulate(text):
        """Run the full normalisation pipeline on one review.

        Non-string input (e.g. a missing value read by pandas as NaN) is
        treated as an empty review instead of raising ``AttributeError``.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = TextManipulator.strip_spaces(text)
        text = TextManipulator.remove_unnecessary_words(text)
        # Removing words can leave double spaces behind; collapse them so the
        # multi-word phrases in replace_words can match.
        text = TextManipulator.strip_spaces(text)

        return TextManipulator.replace_words(text)

    @staticmethod
    def count_result(text):
        """Classify text by comparing its ``good`` and ``bad`` token counts.

        Returns 1 when ``good`` dominates, -1 when ``bad`` dominates, 0 on a
        non-zero tie, and the string ``'false'`` when neither token occurs
        (callers filter those rows out).
        """
        tokens = re.findall(r'bad|good', text)
        good_count = tokens.count('good')
        bad_count = tokens.count('bad')

        if good_count > bad_count:
            return 1  # good
        elif good_count < bad_count:
            return -1  # bad
        elif good_count == 0 and bad_count == 0:
            return 'false'  # false data (ignored)
        else:
            return 0  # neutral

    @staticmethod
    def makeFeature(column1, column2):
        """Pack two column values into a two-element feature list."""
        return [column1, column2]


In [None]:
# One shared helper object; all TextManipulator methods are static.
text_manipulator = TextManipulator()

# Load only the columns used below from the training CSV.
training_columns = ['drugName', 'condition', 'review', 'rating']
data_frame = pd.read_csv('drugsComTrain_raw.csv', usecols=training_columns)

# Distinct values in order of first appearance; list position is the id.
conditions = list(dict.fromkeys(data_frame['condition']))
drugNames = list(dict.fromkeys(data_frame['drugName']))

# Text processing: normalise each review and rewrite sentiment phrases.
data_frame['review_replaced'] = data_frame['review'].map(text_manipulator.manipulate)

# Goods and Bads: 1 / -1 / 0, or 'false' when no sentiment token was found.
data_frame['review_result'] = data_frame['review_replaced'].map(text_manipulator.count_result)

# Unique integer ids for every condition and drug name.
data_frame['condition_id'] = data_frame['condition'].map(
    lambda condition: DataFrameMods.set_id(condition, conditions)
)
data_frame['drugName_id'] = data_frame['drugName'].map(
    lambda drug_name: DataFrameMods.set_id(drug_name, drugNames)
)

# Features column: one [drugName_id, condition_id] list per row.
feature_pairs = [
    text_manipulator.makeFeature(drug_id, condition_id)
    for drug_id, condition_id in zip(
        data_frame['drugName_id'].tolist(), data_frame['condition_id'].tolist()
    )
]
data_frame['features'] = pd.Series(feature_pairs, index=data_frame.index)


In [None]:
# Keep only rows where the dictionary found at least one sentiment token;
# count_result marks the others with the string 'false'.
has_sentiment = data_frame['review_result'] != 'false'
data_frame = data_frame[has_sentiment]
data_frame

In [5]:
# Decision tree
# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the candidate features at every split, so equally good splits can
# otherwise be chosen differently from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
features = data_frame['features'].tolist()
labels = data_frame['review_result'].tolist()
clf.fit(features, labels)

# Predictions on the training rows, stored next to the dictionary labels.
# NOTE(review): this is a fit on seen data, not a generalisation estimate.
results = data_frame[['review_result', 'features']].copy()
machineData = results['features'].tolist()
results['machine'] = clf.predict(machineData)

In [None]:
# Test data
production_data = pd.read_csv('./drugsComTest_raw.csv', usecols=['drugName', 'condition', 'review', 'rating'])


def extend_vocabulary(known_values, new_values):
    """Return ``known_values`` followed by the unseen entries of ``new_values``.

    Existing entries keep their positions, so ids derived from list positions
    stay identical to the ones the model was trained on. Unseen entries are
    appended in order of first appearance.
    """
    known = set(known_values)
    unseen = [value for value in dict.fromkeys(new_values) if value not in known]
    return list(known_values) + unseen


# The classifier was fitted on ids derived from the TRAINING vocabularies
# (`conditions`, `drugNames`). Re-deriving ids from the test set alone would
# give the same drug or condition a different integer and make the predictions
# meaningless, so the training vocabularies are reused and only extended.
production_conditions = extend_vocabulary(conditions, production_data['condition'])
production_drug_names = extend_vocabulary(drugNames, production_data['drugName'])

# Defining unique IDs (consistent with the training data)
production_data['condition_id'] = production_data['condition'].map(
    lambda condition: DataFrameMods.set_id(condition, production_conditions)
)
production_data['drugName_id'] = production_data['drugName'].map(
    lambda drug_name: DataFrameMods.set_id(drug_name, production_drug_names)
)

# Features column
production_data['features'] = production_data.apply(lambda row: text_manipulator.makeFeature(row['drugName_id'], row['condition_id']), axis=1)

In [None]:
# Run the fitted tree on the production (test) features.
production_features = production_data['features'].tolist()
y_pred = clf.predict(production_features)

In [None]:
fig, ax = plt.subplots(figsize=(20, 40))
ax.tick_params(axis='x', direction='out', length=3, width=5, labelrotation=90, labelsize=13)
ax.tick_params(axis='y', direction='out', length=3, width=5, labelsize=13)

# Collect the three label sources in named columns instead of relying on
# positional indices into to_numpy() rows.
plot_frame = production_data[['drugName', 'condition', 'review', 'rating']].copy()

# Ground truth from the star rating: > 6 positive, < 4 negative, else neutral.
plot_frame['ground_truth'] = np.select(
    [plot_frame['rating'] > 6, plot_frame['rating'] < 4], [1, -1], default=0
)
# Decision tree predictions (same row order as production_data).
plot_frame['prediction'] = np.asarray(y_pred)
# Dictionary-based result, computed here because production_data carries no
# review_result column of its own.
plot_frame['dictionary'] = plot_frame['review'].map(
    lambda review: TextManipulator.count_result(TextManipulator.manipulate(review))
)

# Rows without a condition or drug name cannot be placed on the categorical
# axes, so they are left out of the figure.
plot_frame = plot_frame.dropna(subset=['condition', 'drugName'])


def scatter_sentiment(axes, frame, column, color, sizes, zorder):
    """Draw one layer: '+' for 1, 'x' for -1 and 'o' for every other value.

    ``sizes`` holds the marker sizes for (positive, negative, other). Each
    marker class is drawn with a single scatter call rather than one per row.
    """
    is_positive = frame[column] == 1
    is_negative = frame[column] == -1
    is_other = ~(is_positive | is_negative)
    for marker, mask, size in zip(('+', 'x', 'o'), (is_positive, is_negative, is_other), sizes):
        subset = frame[mask]
        if subset.empty:
            continue
        axes.scatter(subset['condition'], subset['drugName'],
                     marker=marker, c=color, s=size, zorder=zorder)


# Plot ground truth based on rating
scatter_sentiment(ax, plot_frame, 'ground_truth', 'g', (330, 250, 200), zorder=2)

# Plot the decision tree predictions
scatter_sentiment(ax, plot_frame, 'prediction', 'b', (200, 170, 150), zorder=3)

# Plot the dictionary-based review results
scatter_sentiment(ax, plot_frame, 'dictionary', 'r', (100, 70, 50), zorder=4)

# Legend colours must match the layers above: blue = tree, red = dictionary.
blue = mpatches.Patch(color='b', label='Decision tree')
red = mpatches.Patch(color='r', label='Dictionary')
green = mpatches.Patch(color='g', label='Ground truth')

times = mlines.Line2D([], [], color='black', marker='x',
                      markersize=10, label='Negative review')
plus = mlines.Line2D([], [], color='black', marker='+',
                     markersize=10, label='Positive review')
circle = mlines.Line2D([], [], color='black', marker='o',
                       markersize=10, label='Neutral review')
ax.grid(lw=1, zorder=0)

# A second ax.legend() call replaces the first legend, so the first one is
# re-attached as a plain artist to keep both visible.
models_legend = ax.legend(handles=[red, blue, green], title="Models", fancybox=True, loc=1)
ax.add_artist(models_legend)
reviews_legend = ax.legend(handles=[plus, times, circle], title="Reviews", fancybox=True, loc=2)

ax.set_xlabel('Condition')
ax.set_ylabel('Drug name')
ax.set_title('Models performance on drugs.com data');


In [None]:
# JPEG encoder options are passed through pil_kwargs. The former `optimize`
# and `papertype` keywords are no longer accepted by savefig in current
# Matplotlib; `papertype` only ever applied to PostScript output.
fig.savefig('./models_performance.jpg', dpi=500, bbox_inches='tight', pad_inches=3, pil_kwargs={'optimize': True})