# Modeling: with Categorization

In this section, our group tried different modeling to predict if the ease of use, effectiveness, helpfulness of review, and reviewer profile (e.g., status, gender, treatment_preiod, and age_group) can predict diabete patients' satisfaction of drugs.

## Import data and Initial EDA

### Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, classification_report

### Import file

In [2]:
df = pd.read_csv('../data/cleaned/cleaned_df.csv')

### Check null values

In [3]:
df.isnull().sum()

Rdate                   0
comment                 9
condition               0
drug                    0
easeofuse               0
effectiveness           0
helpful                 0
reviewer              253
satisfaction            0
patient                 0
caregiver               0
male                    0
female                  0
treatment_period_1      0
treatment_period_2      0
treatment_period_3      0
treatment_period_4      0
treatment_period_5      0
treatment_period_6      0
treatment_period_7      0
age_group_1             0
age_group_2             0
age_group_3             0
age_group_4             0
age_group_5             0
age_group_6             0
age_group_7             0
dtype: int64

In [8]:
df.dropna(inplace = True)

In [9]:
df.head()

Unnamed: 0,Rdate,comment,condition,drug,easeofuse,effectiveness,helpful,reviewer,satisfaction,patient,...,treatment_period_5,treatment_period_6,treatment_period_7,age_group_1,age_group_2,age_group_3,age_group_4,age_group_5,age_group_6,age_group_7
0,9/19/2007 9:03:53 AM,The medication has helped me keep my 2 month s...,Type 2 Diabetes Mellitus,metformin oral,5,4,22,"bjorn, 65-74 Male on Treatment for 2 to less ...",3,1,...,1,0,0,0,0,0,0,0,0,0
1,9/18/2007 11:18:23 AM,The medication has helped me keep my 2 month s...,Type 2 Diabetes Mellitus,metformin oral,5,4,22,"cutie54, 45-54 Female on Treatment for 1 to 6...",5,1,...,0,0,0,0,0,0,0,0,0,0
2,3/2/2015 7:26:37 PM,I have been taking Tanzeum for four weeks and ...,Type 2 Diabetes Mellitus,Tanzeum subcutaneous,5,5,15,65-74 Female on Treatment for 1 to 6 months (...,5,1,...,0,0,0,0,0,0,0,0,0,0
3,1/4/2015 6:47:30 AM,I have been taking Tanzeum for four weeks and ...,Type 2 Diabetes Mellitus,Tanzeum subcutaneous,4,2,15,"MarkW, 65-74 Male on Treatment for 1 to 6 mon...",1,1,...,0,0,0,0,0,0,0,0,0,0
4,10/20/2014 3:13:21 PM,I have been taking Tanzeum for four weeks and ...,Type 2 Diabetes Mellitus,Tanzeum subcutaneous,5,5,15,"Les, 55-64 Female (Patient)",5,1,...,0,0,0,0,0,0,0,0,0,0


### Select only necessary variables

In [10]:
mdf = df.drop(columns = ['Rdate', 'drug', 'comment', 'condition', 'reviewer']).copy()

In [11]:
mdf.head()

Unnamed: 0,easeofuse,effectiveness,helpful,satisfaction,patient,caregiver,male,female,treatment_period_1,treatment_period_2,...,treatment_period_5,treatment_period_6,treatment_period_7,age_group_1,age_group_2,age_group_3,age_group_4,age_group_5,age_group_6,age_group_7
0,5,4,22,3,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,5,4,22,5,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,5,5,15,5,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,2,15,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,5,15,5,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
mdf['satisfaction'].value_counts()

1    1496
5    1487
4     868
3     832
2     552
Name: satisfaction, dtype: int64

In [15]:
mdf.dtypes

easeofuse             int64
effectiveness         int64
helpful               int64
satisfaction          int64
patient               int64
caregiver             int64
male                  int64
female                int64
treatment_period_1    int64
treatment_period_2    int64
treatment_period_3    int64
treatment_period_4    int64
treatment_period_5    int64
treatment_period_6    int64
treatment_period_7    int64
age_group_1           int64
age_group_2           int64
age_group_3           int64
age_group_4           int64
age_group_5           int64
age_group_6           int64
age_group_7           int64
dtype: object

In [16]:
mdf.dropna(inplace = True)

In [17]:
X = mdf.drop(columns = 'satisfaction', axis = 1)
y = mdf['satisfaction']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.15,
                                                    stratify=y,
                                                    random_state=42)

In [19]:
# Scale data

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)


In [20]:
rfc = RandomForestClassifier()

In [21]:
rfc.fit(X_train_sc, y_train)

RandomForestClassifier()

In [22]:
rfc.score(X_train_sc, y_train), rfc.score(X_test_sc, y_test)

(0.8503034389750506, 0.5788804071246819)

In [23]:
rfc_predict = rfc.predict(X_test_sc)

In [24]:
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(rfc, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train_sc, y_train)


Fitting 3 folds for each of 500 candidates, totalling 1500 fits


In [25]:
bestF

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 8, 15, 25, 30],
                         'min_samples_leaf': [1, 2, 5, 10],
                         'min_samples_split': [2, 5, 10, 15, 100],
                         'n_estimators': [100, 300, 500, 800, 1200]},
             verbose=1)

In [26]:
gridF.score(X_train_sc, y_train), gridF.score(X_test_sc, y_test)

(0.6758822207237581, 0.6475826972010178)

In [35]:
mnb = MultinomialNB()

In [36]:
# Scale data

mms = MinMaxScaler()

X_train_sc = mms.fit_transform(X_train)
X_test_sc = mms.transform(X_test)

In [37]:
mnb.fit(X_train_sc, y_train)

MultinomialNB()

In [38]:
mnb.score(X_train_sc, y_train), mnb.score(X_test_sc, y_test)

(0.47156664418970556, 0.4643765903307888)

In [39]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV 

In [41]:
gbc = GradientBoostingClassifier()
params = {
    "n_estimators":[250,500],
    "max_depth":[3,5,7],
    "learning_rate":[0.01,0.1,1]
}

take 24m 18.1s to run

In [42]:
cv = GridSearchCV(gbc, params, cv=5)
cv.fit(X_train_sc,y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1],
                         'max_depth': [3, 5, 7], 'n_estimators': [250, 500]})

In [43]:
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [44]:
display(cv)

Best parameters are: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}


0.663 + or -0.016 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.663 + or -0.017 for the {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.659 + or -0.019 for the {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.647 + or -0.015 for the {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
0.639 + or -0.013 for the {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 250}
0.624 + or -0.009 for the {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
0.646 + or -0.02 for the {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 250}
0.637 + or -0.016 for the {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
0.614 + or -0.01 for the {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 250}
0.592 + or -0.01 for the {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
0.585 + or -0.013 for the {'learning_rate': 0.1, 'max

In [45]:
print(classification_report(y_test,cv.predict(X_test_sc)))

              precision    recall  f1-score   support

           1       0.78      0.78      0.78       225
           2       0.42      0.06      0.11        83
           3       0.45      0.56      0.50       125
           4       0.49      0.59      0.54       130
           5       0.81      0.87      0.84       223

    accuracy                           0.66       786
   macro avg       0.59      0.57      0.55       786
weighted avg       0.65      0.66      0.64       786



In [46]:
cv.score(X_train_sc, y_train), cv.score(X_test_sc, y_test)

(0.6767812991683524, 0.6628498727735369)