<a href="https://colab.research.google.com/github/Madhurika1292/Medicines-and-Common-Treatment-Recommendation-System/blob/main/MRS_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medicine Recommendation System

## Loading libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB



## Data Loading

In [2]:
# Importing dataset from drive
from google.colab import drive
drive.mount('/content/gdrive/')

import sys
sys.path.append('/content/gdrive/My Drive/Drugscom')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
!ls gdrive/MyDrive/Drugscom/

Drugscom_test.csv     Drugscom_train.csv
drugsComTest_raw.csv  drugsComTrain_raw.csv


In [4]:
#Loading data
Drugscom_train=pd.read_csv('/content/gdrive/My Drive/Drugscom/Drugscom_train.csv')
Drugscom_test=pd.read_csv('/content/gdrive/My Drive/Drugscom/Drugscom_test.csv')

In [5]:
#Before
# checking for null values
Drugscom_train.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    6
dtype: int64

In [6]:
# checking for null values
Drugscom_test.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    2
dtype: int64

In [7]:
# Dropping null values
Drugscom_train = Drugscom_train.dropna(axis=0)
Drugscom_test = Drugscom_test.dropna(axis=0)

In [8]:
# checking for null values
Drugscom_train.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    0
dtype: int64

In [9]:
# checking for null values
Drugscom_test.isnull().sum()

Unnamed: 0      0
uniqueId        0
drugName        0
condition       0
review          0
rating          0
date            0
usefulCount     0
treatments      0
clean_review    0
dtype: int64

In [10]:
#Changing the "Unnamed: 0" column to uniqueId as it represents the unique id of the drugs
Drugscom_train=Drugscom_train.drop('Unnamed: 0', axis='columns')
Drugscom_test=Drugscom_test.drop('Unnamed: 0', axis='columns')

In [11]:
Drugscom_train=Drugscom_train.drop(Drugscom_train[(Drugscom_train['rating'] > 4.0) & (Drugscom_train['rating'] < 6.0)].index)
Drugscom_test=Drugscom_test.drop(Drugscom_test[(Drugscom_test['rating'] > 4.0) & (Drugscom_test['rating'] < 6.0)].index)

In [12]:
Drugscom_train['sentiment'] = np.where(Drugscom_train['rating'] >= 7, '1', '0')
Drugscom_test['sentiment'] = np.where(Drugscom_test['rating'] >= 7, '1', '0')

In [13]:
Drugscom_train.head()

Unnamed: 0,uniqueId,drugName,condition,review,rating,date,usefulCount,treatments,clean_review,sentiment
0,131173,A / B Otic,Otitis Media,"""It numbs the pain. It makes my ear feel heavi...",10,2009-09-23,20,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",numb pain make ear feel heavier least throb su...,1
1,153899,Amoxicillin,Otitis Media,"""Perforation in my right tympanic membrane (ea...",8,2011-04-12,16,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",perfor right tympan membran eardrum linger upp...,1
2,153715,Amoxicillin,Otitis Media,"""This medication did not clear up the infectio...",2,2017-05-31,0,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",medic not clear infect well notic odorless dis...,0
3,153780,Amoxicillin,Otitis Media,"""My son who was 7 months old at the time was p...",1,2016-12-09,0,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",son month old time prescrib amoxicillin ear in...,0
4,153838,Amoxicillin,Otitis Media,"""This medication works amazingly for ear infec...",9,2013-10-02,12,"ANALGESICS,ANTIBIOTIC THERAPY,Tympanostomy Tub...",medic work amaz ear infect ear infect count li...,1


In [14]:
Drugscom_train.sentiment.value_counts(normalize=True)

1    0.696646
0    0.303354
Name: sentiment, dtype: float64

In [15]:
def specific_condition_data(df,condition):
  
  return df[df['condition']==condition]

## Splitting data

In [16]:
training_data=specific_condition_data(Drugscom_train,'Acne')
testing_data=specific_condition_data(Drugscom_test,'Acne')

In [17]:
X_train=training_data.clean_review
y_train=training_data.sentiment
X_test=testing_data.clean_review
y_test=testing_data.sentiment

In [18]:
cv = CountVectorizer(max_features = 20000, ngram_range = (4, 4))

In [19]:
X_train_cv=cv.fit_transform(X_train)
X_test_cv=cv.transform(X_test)

## Model creation

### Logistic Regression

In [20]:
def grid_best_parameter(X_train_cv, y_train):

  pipe = Pipeline([('classifier' , LogisticRegression())])

  param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    #'classifier__C' : np.logspace(-20, 20, 50),
     'classifier__C' : np.logspace(-50, 50, 100),
    'classifier__solver' : ['liblinear','lbfgs']}]

  clf = GridSearchCV(pipe, param_grid = param_grid, cv = 50, verbose=True, n_jobs=-1)
  best_clf = clf.fit(X_train_cv, y_train)
  bp= best_clf.best_estimator_

  return bp

In [21]:
grid_best_parameter(X_train_cv,y_train)

Fitting 50 folds for each of 400 candidates, totalling 20000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 522 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 5322 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 10080 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 11192 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 12396 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 13824 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 15655 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 17810 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 20000 out of 20000 | elapsed: 16.0min finished


Pipeline(memory=None,
         steps=[('classifier',
                 LogisticRegression(C=3.1992671377973845, class_weight=None,
                                    dual=False, fit_intercept=True,
                                    intercept_scaling=1, l1_ratio=None,
                                    max_iter=100, multi_class='auto',
                                    n_jobs=None, penalty='l2',
                                    random_state=None, solver='liblinear',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False)

In [22]:
model_lr=LogisticRegression(solver='liblinear',max_iter=100,multi_class='auto',penalty='l2',C=3.2)


In [23]:
model_lr.fit(X_train_cv, y_train)

LogisticRegression(C=3.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
predictions=model_lr.predict(X_test_cv)

In [25]:
accuracy_score(y_test,predictions)

0.7869318181818182

#### Predictions

In [26]:
def predicted_recommend(predictions,testing_data):

  predict_df=pd.DataFrame(testing_data.sentiment)
  predict_df['predicted_values']=pd.Series(predictions, index=predict_df.index)
  recommend_index=list(predict_df[predict_df['predicted_values']=='1'].index)
  recommend_dataframe=testing_data.loc[recommend_index]
  recommend_dataframe_grouped=recommend_dataframe[['drugName','rating']].groupby(['drugName']).max()
  recommend_dataframe_grouped.sort_values(by='rating', ascending=False)[:10]
  recommend_dataframe_grouped=recommend_dataframe_grouped.reset_index()
  recommend_dataframe_grouped=recommend_dataframe_grouped.sort_values(by='rating', ascending=False)[:10]
 
  return recommend_dataframe_grouped

In [27]:
predicted_recommend(predictions,testing_data)

Unnamed: 0,drugName,rating
44,Gildess Fe 1.5 / 30,10
77,Tretinoin,10
35,Drospirenone / ethinyl estradiol / levomefolat...,10
36,Duac,10
37,Epiduo,10
82,Veltin,10
39,Ethinyl estradiol / norethindrone,10
40,Ethinyl estradiol / norgestimate,10
41,Finacea,10
42,Gianvi,10


### Naive Bayes Algorithmn

In [28]:
pipe_1 = Pipeline([('classifier' , MultinomialNB())])

param_grid_mnb = [
    {'classifier' : [MultinomialNB()],
     #'classifier__alpha' :[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
     'classifier__alpha': [0.01, 0.1, 0.5, 1.0, 10.0]
    }]

clf = GridSearchCV(pipe_1, param_grid = param_grid_mnb, cv = 50, verbose=True, n_jobs=-1)
best_clf_1 = clf.fit(X_train_cv, y_train)
bp_1= best_clf_1.best_estimator_

Fitting 50 folds for each of 5 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    4.3s finished


In [29]:
bp_1

Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [30]:
model_mnb = MultinomialNB(alpha=1)


In [31]:
model_mnb.fit(X_train_cv, y_train)


MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [32]:
predictions_mnb=model_mnb.predict(X_test_cv)

In [33]:
accuracy_score(predictions_mnb,y_test)

0.7659090909090909