<a href="https://colab.research.google.com/github/Madhurika1292/Medicines-and-Common-Treatment-Recommendation-System/blob/main/MRS_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medicine Recommendation System

## Loading libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import surprise
import seaborn as sns
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from surprise import SVD
from textblob import TextBlob


import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    warnings.warn("deprecated", DeprecationWarning)

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Bidirectional, LSTM, BatchNormalization, Dropout
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import keras
from keras.models import Sequential
from keras.layers import Dense
import random

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



## Data Loading

In [2]:
# Mount Google Drive so the Drugscom CSV files read below are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/gdrive/')

# Add the dataset folder to the module search path.
# NOTE(review): no module is imported from this folder in the cells shown — confirm this is still needed.
import sys
sys.path.append('/content/gdrive/My Drive/Drugscom')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
!ls gdrive/MyDrive/Drugscom/

Drugscom_test.csv     Drugscom_train.csv
drugsComTest_raw.csv  drugsComTrain_raw.csv


In [4]:
# Load the Drugscom train and test CSV files from the mounted Drive folder.
Drugscom_train=pd.read_csv('/content/gdrive/My Drive/Drugscom/Drugscom_train.csv')
Drugscom_test=pd.read_csv('/content/gdrive/My Drive/Drugscom/Drugscom_test.csv')

In [5]:
# Null counts per column in the training set, before any rows are dropped.
Drugscom_train.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    6
dtype: int64

In [6]:
# Null counts per column in the test set, before any rows are dropped.
Drugscom_test.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    2
dtype: int64

In [7]:
# Drop every row that contains at least one null value (axis=0 selects rows).
Drugscom_train = Drugscom_train.dropna(axis=0)
Drugscom_test = Drugscom_test.dropna(axis=0)

In [8]:
# Re-check null counts in the training set after dropping rows.
Drugscom_train.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    0
dtype: int64

In [9]:
# Re-check null counts in the test set after dropping rows.
Drugscom_test.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    0
dtype: int64

In [10]:
# Drop the "Unnamed: 0" column from both frames.
# (Nothing is renamed here: uniqueId is already a separate column in the loaded data.)
Drugscom_train=Drugscom_train.drop('Unnamed: 0', axis='columns')
Drugscom_test=Drugscom_test.drop('Unnamed: 0', axis='columns')

In [11]:
#Drugscom_train=Drugscom_train.drop(Drugscom_train[(Drugscom_train['rating'] > 4.0) & (Drugscom_train['rating'] < 6.0)].index)
#Drugscom_test=Drugscom_test.drop(Drugscom_test[(Drugscom_test['rating'] > 4.0) & (Drugscom_test['rating'] < 6.0)].index)

In [12]:
#Drugscom_train['sentiment'] = np.where(Drugscom_train['rating'] >= 7, 1, 0)
#Drugscom_test['sentiment'] = np.where(Drugscom_test['rating'] >= 7, 1, 0)

In [13]:
# Preview the first rows of the training frame.
Drugscom_train.head()

Unnamed: 0,uniqueId,drugName,condition,review,rating,date,usefulCount,treatments,clean_review,sentiment
0,131173,A / B Otic,Otitis Media,"""It numbs the pain. It makes my ear feel heavi...",10,2009-09-23,20,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",numb pain make ear feel heavier least throb su...,1
1,153899,Amoxicillin,Otitis Media,"""Perforation in my right tympanic membrane (ea...",8,2011-04-12,16,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",perfor right tympan membran eardrum linger upp...,1
2,153715,Amoxicillin,Otitis Media,"""This medication did not clear up the infectio...",2,2017-05-31,0,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",medic not clear infect well notic odorless dis...,0
3,153780,Amoxicillin,Otitis Media,"""My son who was 7 months old at the time was p...",1,2016-12-09,0,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",son month old time prescrib amoxicillin ear in...,0
4,153838,Amoxicillin,Otitis Media,"""This medication works amazingly for ear infec...",9,2013-10-02,12,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",medic work amaz ear infect ear infect count li...,1


In [14]:
# Relative frequency of each sentiment class in the training set.
Drugscom_train.sentiment.value_counts(normalize=True)

1    0.696646
0    0.303354
Name: sentiment, dtype: float64

In [15]:
def specific_condition_data(df,condition):
  """Return the rows of ``df`` whose ``condition`` column equals ``condition``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that has a ``condition`` column.
  condition : object
      Value to match exactly against the ``condition`` column.

  Returns
  -------
  pandas.DataFrame
      An independent copy of the matching rows (original index labels kept).
      A copy is returned because callers add new columns to the result;
      assigning to a bare filtered slice raises SettingWithCopyWarning.
  """
  return df[df['condition']==condition].copy()

## Splitting data

In [16]:
# Restrict both splits to the reviews written for a single condition.
training_data, testing_data = (
    specific_condition_data(split, 'Acne')
    for split in (Drugscom_train, Drugscom_test)
)

In [None]:
def pol(x):
    """Return the TextBlob sentiment polarity of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob sentiment subjectivity of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity

In [None]:
# Score every cleaned review with TextBlob, for the training frame first and then the test frame.
for frame in (training_data, testing_data):
    frame['polarity'] = frame["clean_review"].apply(pol)
    frame['subjectivity'] = frame["clean_review"].apply(sub)


In [None]:
def sentiments(polarity):
  """Map a polarity score to a binary label.

  Returns 0 for a negative score and 1 for a zero or positive score.
  A value that is neither (e.g. NaN) falls through and yields None.
  """
  if polarity < 0:
    return 0
  if polarity >= 0:
    return 1

In [None]:
# Derive a binary sentiment label from the polarity score of each review.
# NOTE(review): the loaded frames already expose a `sentiment` column (an earlier cell calls
# value_counts on it); this assignment overwrites it — confirm that is intended.
training_data["sentiment"] = training_data["polarity"].apply(sentiments)
testing_data["sentiment"] = testing_data["polarity"].apply(sentiments)

In [17]:
# Features are the cleaned review text; targets are the sentiment labels.
X_train, y_train = training_data.clean_review, training_data.sentiment
X_test, y_test = testing_data.clean_review, testing_data.sentiment

In [18]:
# Count-based features restricted to 4-grams (ngram_range=(4, 4)), capped at 8000 features.
# The dense network defined later hard-codes input_shape=(8000,) to match this cap.
cv = CountVectorizer(max_features = 8000, ngram_range = (4, 4))

In [19]:
# Learn the vocabulary on the training reviews only, then reuse it to encode the test reviews.
X_train_cv=cv.fit_transform(X_train)
X_test_cv=cv.transform(X_test)

## Model creation

### Logistic regression, Multinomial Naive Bayes, Bernoulli Naive Bayes, Linear SVC, Decision Tree, Random Forest

In [20]:
def _auc_from_labels(y_true, y_pred):
  """Return the ROC AUC of hard label predictions, or NaN when it cannot be computed.

  ``roc_auc_score`` raises ``ValueError`` (for example when ``y_true`` holds a
  single class); NaN is reported instead so one model does not abort the report.
  """
  try:
    return roc_auc_score(y_true, y_pred)
  except ValueError:
    return np.nan


def grid_best_parameter(X_train_cv,y_train,X_test_cv,y_test):
  """Grid-search six classifiers and summarise their train/test performance.

  Each classifier is wrapped in a one-step pipeline, tuned with 10-fold
  ``GridSearchCV`` on the training data, and then evaluated on both the
  training and the test data with its best parameters.

  Parameters
  ----------
  X_train_cv, X_test_cv
      Feature matrices passed to ``fit`` / ``predict``.
  y_train, y_test
      Target labels for the corresponding feature matrices.

  Returns
  -------
  pandas.DataFrame
      One row per model with the columns 'Model name', 'Model parameter',
      'Train accuracy', 'Test accuracy', 'Train auc score', 'Test auc score'.
      The AUC values are computed from hard label predictions and are NaN
      when ``roc_auc_score`` cannot be evaluated.
  """
  # Hyper-parameter candidates.
  #C= [1.0, 0.5, 0.1]
  # NOTE(review): this grid spans 1e-50 .. 1e50; consider narrowing it, most of
  # the range is far outside the usual values for a regularisation strength.
  C=np.logspace(-50, 50, 100)
  alpha= [0.01, 0.1, 0.5, 1.0, 10.0]

  # One-step pipelines; every estimator that accepts a seed is seeded for reproducible results.
  pipe_lr=Pipeline([('clf' , LogisticRegression(random_state=42))])
  pipe_mnb=Pipeline([('clf', MultinomialNB())])
  pipe_bnb=Pipeline([('clf', BernoulliNB())])
  pipe_svc = Pipeline([('clf', LinearSVC(random_state=42))])
  pipe_dtc = Pipeline([('clf', DecisionTreeClassifier(random_state=42))])
  pipe_rfc = Pipeline([('clf', RandomForestClassifier(random_state=42))])

  # Parameter grids.
  # The lbfgs solver does not support the l1 penalty, so the grid is split into
  # valid solver/penalty pairs instead of a full cross product with failing fits.
  param_lr =[{'clf__penalty': ['l1', 'l2'],
              'clf__C': C,
              'clf__solver': ['liblinear']},
             {'clf__penalty': ['l2'],
              'clf__C': C,
              'clf__solver': ['lbfgs']}]
  param_mnb=[{'clf__alpha': alpha}]
  param_bnb=[{'clf__alpha': alpha}]
  param_svc = {'clf__C':np.arange(0.01,100,10)}
  param_dtc = {'clf__criterion':['gini','entropy'],'clf__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
  # 'auto' was dropped from max_features: for classifiers it was an alias of
  # 'sqrt' and it is rejected by recent scikit-learn releases.
  param_rfc = {
    'clf__n_estimators': [200, 500],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth' : [4,5,6,7,8],
    'clf__criterion' :['gini', 'entropy']
  }

  # Grid searches (10-fold cross-validation each).
  grid_lr=GridSearchCV(estimator=pipe_lr,param_grid=param_lr,scoring='accuracy',cv=10,n_jobs=-1)
  grid_mnb=GridSearchCV(estimator=pipe_mnb,param_grid=param_mnb,cv=10,verbose=True, n_jobs=-1)
  grid_bnb=GridSearchCV(estimator=pipe_bnb,param_grid=param_bnb,cv=10,verbose=True, n_jobs=-1)
  grid_svc = GridSearchCV(estimator=pipe_svc,param_grid=param_svc,cv=10,return_train_score=True, n_jobs=1)
  grid_dtc = GridSearchCV(estimator=pipe_dtc,param_grid=param_dtc,cv=10, n_jobs=1)
  grid_rfc = GridSearchCV(estimator=pipe_rfc,param_grid=param_rfc,cv=10, n_jobs=1)

  # Display name paired with its search, in evaluation order.
  searches=[('Logistic Regression', grid_lr),
            ('Multi Nomial NB', grid_mnb),
            ('Bernoulli NB', grid_bnb),
            ('Linear SVC', grid_svc),
            ('DecisionTreeClassifier', grid_dtc),
            ('RandomForestClassifier', grid_rfc)]

  print('Performing model optimizations...')
  report_rows = []
  for model_name, search in searches:
    print('\nEstimator: %s' % model_name)
    search.fit(X_train_cv, y_train)

    # Predict once per split with the refitted best estimator and reuse the
    # predictions for both accuracy and AUC.
    train_prediction = search.predict(X_train_cv)
    test_prediction = search.predict(X_test_cv)

    report_rows.append([
      model_name,
      search.best_params_,
      accuracy_score(y_train, train_prediction),
      accuracy_score(y_test, test_prediction),
      # roc_auc_score expects (y_true, y_score) in that order.
      _auc_from_labels(y_train, train_prediction),
      _auc_from_labels(y_test, test_prediction),
    ])

  report = pd.DataFrame(report_rows,columns = ['Model name', 'Model parameter', 'Train accuracy', 'Test accuracy', 'Train auc score', 'Test auc score'])
  return report


#https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-3.html

In [21]:
#grid_best_parameter(X_train_cv,y_train,X_test_cv,y_test)

### Deep learning model

#### Sequential model

In [22]:
Print  # NOTE(review): `Print` is not defined in the cells shown, so this raises NameError (see the recorded output); it looks like a manual stop for "Run All" — confirm and remove.

NameError: ignored

In [24]:
# Feed-forward classifier: three Dense -> BatchNorm -> ReLU -> Dropout(0.5) stages
# (200, 300 and 400 units), a 100-unit ReLU layer and a single sigmoid output.
model1 = keras.models.Sequential()

for stage, units in enumerate((200, 300, 400)):
    if stage == 0:
        # Only the first layer declares the input width (8000 features).
        model1.add(keras.layers.Dense(units, input_shape=(8000,)))
    else:
        model1.add(keras.layers.Dense(units))
    model1.add(keras.layers.BatchNormalization())
    model1.add(keras.layers.Activation('relu'))
    model1.add(keras.layers.Dropout(0.5))

model1.add(keras.layers.Dense(100, activation='relu'))
model1.add(keras.layers.Dense(1, activation='sigmoid'))


In [28]:
# Adam optimizer with binary cross-entropy loss; accuracy and MSE are tracked as metrics.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy','mse'])

In [29]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               1600200   
_________________________________________________________________
batch_normalization (BatchNo (None, 200)               800       
_________________________________________________________________
activation (Activation)      (None, 200)               0         
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               60300     
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
activation_1 (Activation)    (None, 300)               0

In [30]:
# Keras needs dense input. toarray() returns a plain np.ndarray, whereas
# todense() returns the discouraged np.matrix type.
model1_train = model1.fit(X_train_cv.toarray(), y_train, epochs=20, batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# The sparse matrix exposes its shape directly; no need to materialise a dense copy to read it.
X_train_cv.shape

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, loss_ax = plt.subplots()

acc_ax = loss_ax.twinx()

loss_ax.set_ylim([0.0, 1.0])
acc_ax.set_ylim([0.0, 1.0])

loss_ax.plot(model1_train.history['loss'], 'y', label='train loss')
acc_ax.plot(model1_train.history['accuracy'], 'b', label='train acc')

loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
acc_ax.set_ylabel('Accuray')

loss_ax.legend(loc='upper left')
acc_ax.legend(loc='lower left')

plt.show()
#https://github.com/sharmaroshan/Drugs-Recommendation-using-Reviews/blob/master/DrugsAnalysis.ipynb

In [None]:
# model1 was trained on a dense matrix, so give predict() a dense array too
# instead of the scipy sparse matrix.
prediction_model1 = model1.predict(X_test_cv.toarray(),batch_size=32)


In [None]:
# Preview the first cleaned reviews used as training input.
X_train.head()

In [None]:
# Shape of the vectorized training matrix.
X_train_cv.shape


In [None]:
# Embedding -> Conv1D -> global max pooling -> dense binary classifier.
# NOTE(review): an Embedding layer expects integer token indices, but the fit cell
# feeds it the CountVectorizer count matrix — confirm the intended input encoding.
model3 = Sequential()
model3.add(keras.layers.Embedding(5348, 100, input_length=8000))
# Hidden layers use ReLU; softmax is an output-layer normalisation, not a hidden activation.
model3.add(keras.layers.Conv1D(128, 5, activation='relu'))
model3.add(keras.layers.GlobalMaxPooling1D())
model3.add(keras.layers.Dense(10, activation='relu'))
# A single sigmoid unit pairs with binary_crossentropy. Softmax over one unit
# always outputs 1.0, so the model could never predict the negative class.
model3.add(keras.layers.Dense(1, activation='sigmoid'))
model3.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model3.summary()

In [None]:
# Keras needs dense input. toarray() returns a plain np.ndarray, whereas
# todense() returns the discouraged np.matrix type.
model3_train = model3.fit(X_train_cv.toarray(), y_train, epochs=10, batch_size=150)

#### Predictions

In [None]:
Print  # NOTE(review): `Print` is not defined in the cells shown, so this raises NameError; it looks like a manual stop for "Run All" — confirm and remove.

In [None]:
def predicted_recommend(predictions,testing_data,top_n=10):
  """Return the best-rated drugs among the reviews predicted as positive.

  Parameters
  ----------
  predictions : array-like
      One predicted label per row of ``testing_data``, in row order. May be a
      list, a 1-D array or an ``(n, 1)`` array; labels equal to 1 (also the
      string ``'1'``) count as positive. Probabilities must be thresholded
      by the caller first.
  testing_data : pandas.DataFrame
      Frame with ``sentiment``, ``drugName`` and ``rating`` columns.
  top_n : int, default 10
      Maximum number of drugs to return.

  Returns
  -------
  pandas.DataFrame
      Columns ``drugName`` and ``rating`` (the highest rating seen for that
      drug among positively predicted reviews), sorted by rating descending
      and limited to ``top_n`` rows.
  """
  predict_df=pd.DataFrame(testing_data.sentiment)
  # Flatten first so column-shaped model output, e.g. (n, 1), fits in one column.
  flat_predictions=np.asarray(predictions).ravel()
  predict_df['predicted_values']=pd.Series(flat_predictions, index=predict_df.index)

  # Compare numerically: the former string comparison (== '1') never matched
  # integer or float labels, so the result was always empty for them.
  is_positive=pd.to_numeric(predict_df['predicted_values'], errors='coerce')==1
  recommend_dataframe=testing_data.loc[is_positive.to_numpy()]

  recommend_dataframe_grouped=recommend_dataframe[['drugName','rating']].groupby(['drugName']).max()
  recommend_dataframe_grouped=recommend_dataframe_grouped.reset_index()
  recommend_dataframe_grouped=recommend_dataframe_grouped.sort_values(by='rating', ascending=False)[:top_n]

  return recommend_dataframe_grouped

In [None]:
# `predictions` is not defined in any cell shown above; use model1's test-set output instead.
# The sigmoid outputs are flattened and thresholded at 0.5 to obtain 0/1 labels.
predicted_recommend((np.asarray(prediction_model1).ravel() > 0.5).astype(int),testing_data)