In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# ** Project Tittle: Predicting Patient’s Condition based on review.**

Import Libraraies 

In [2]:
import numpy as np 
import pandas as pd 
import plotly.express as px
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [3]:
#train data
train_data = pd.read_csv('/content/drive/MyDrive/data/drugsComTest_raw.csv/drugsComTest_raw.csv')
#test data
test_data = pd.read_csv('/content/drive/MyDrive/data/drugsComTrain_raw.csv/drugsComTrain_raw.csv')

#shape of the datas before droping null values
print('The shape of train data:',train_data.shape)
print('The shape of test data:',test_data.shape)

# checking null values
print ("Null values in the dataset : \n", train_data.isnull().sum(axis = 0))
print ("Null values in the dataset : \n", test_data.isnull().sum(axis = 0))

The shape of train data: (53766, 7)
The shape of test data: (161297, 7)
Null values in the dataset : 
 uniqueID         0
drugName         0
condition      295
review           0
rating           0
date             0
usefulCount      0
dtype: int64
Null values in the dataset : 
 uniqueID         0
drugName         0
condition      899
review           0
rating           0
date             0
usefulCount      0
dtype: int64


**Data Preprocessing**

#### Dropping Nan values

In [4]:
#removing null values
train_data = train_data.dropna()
test_data = test_data.dropna()

#After removing the null values 
print('Shape of the train data:',train_data.shape)
print('Shape of the test data:',test_data.shape)

# checking null values
print ("Null values in the dataset : \n", train_data.isnull().sum(axis = 0))
print ("Null values in the dataset : \n", test_data.isnull().sum(axis = 0))

Shape of the train data: (53471, 7)
Shape of the test data: (160398, 7)
Null values in the dataset : 
 uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64
Null values in the dataset : 
 uniqueID       0
drugName       0
condition      0
review         0
rating         0
date           0
usefulCount    0
dtype: int64


#### Data Cleaning and lowercasing

In [5]:
# Data cleaning using Regex or Regular Expression
import re

def clean_text(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', x)
    return x

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
        x = re.sub('#{1}','',x)
        x = re.sub('[&;]{2}',"'",x)
        x = re.sub('\r\n','',x)
        x = re.sub('</span>','',x)
        x = re.sub('"','',x)
    return x

In [6]:
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re
contractions, contractions_re = _get_contractions(contraction_dict)
def replace_contractions(text):
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)
# Usage
replace_contractions("this's a text with contraction")

'this is a text with contraction'

In [7]:
#For Train data
# lower the text
train_data["review"] = train_data["review"].apply(lambda x: x.lower())
train_data["condition"] =train_data['condition'].apply(lambda x:x.lower())

# Clean the text
train_data["review"] = train_data["review"].apply(lambda x: clean_text(x))
train_data["condition"] =train_data['condition'].apply(lambda x: clean_text(x))

# Clean numbers
train_data["review"] = train_data["review"].apply(lambda x: clean_numbers(x))
train_data["condition"] =train_data['condition'].apply(lambda x: clean_numbers(x))

# Clean Contractions
train_data["review"] = train_data["review"].apply(lambda x: replace_contractions(x))

# For Test data
# lower the text
test_data["review"] = test_data["review"].apply(lambda x: x.lower())
test_data["condition"] =test_data['condition'].apply(lambda x: x.lower())

# Clean the text
test_data["review"] = test_data["review"].apply(lambda x: clean_text(x))
test_data["condition"] =test_data['condition'].apply(lambda x: clean_text(x))

# Clean numbers
test_data["review"] = test_data["review"].apply(lambda x: clean_numbers(x))
test_data["condition"] =test_data['condition'].apply(lambda x: clean_numbers(x))

# Clean Contractions
test_data["review"] = test_data["review"].apply(lambda x: replace_contractions(x))

In [8]:
discard = ["found"]

#drop rows that contain the partial string "users found this comment helpful" in the conference column
train_data =train_data[~train_data.condition.str.contains('|'.join(discard))]
test_data = test_data[~test_data.condition.str.contains('|'.join(discard))]

#### Class Distribution of the datase

In [9]:
def plot_bar_chart(df):
    counts_series = df.condition.value_counts()
    counts_df = pd.DataFrame(counts_series)
    counts_df.reset_index(level=0, inplace=True)
    
    number_of_classes(df)
    
    fig= px.bar(counts_df, x="index", y="condition",orientation="v",
               height=400,
               title='xc')
    fig.show()
    
def number_of_classes(df):
    print("Number of classes: ",len(df["condition"].unique()))

plot_bar_chart(train_data)

Number of classes:  664


In [10]:
# We will keep those classes which have more than 20 values in them
index_counts = train_data["condition"].value_counts()[train_data.condition.value_counts()>=20].index
train_data = train_data[train_data["condition"].isin(index_counts)]

number_of_classes(train_data)

Number of classes:  210


#### Balancing the imbalce through undersampling

In [11]:
condition_over200 = train_data["condition"].value_counts()[train_data.condition.value_counts() >= 200].index

for condition in condition_over200:
    # randomly shuffle the samples
    condition_samples = train_data[train_data["condition"]==condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)

    # extract only 200
    condition_samples = condition_samples[:200]

    train_data = train_data[train_data["condition"]!=condition]
    # put it back
    train_data = pd.concat([train_data, condition_samples], ignore_index=True)

#### Filtering the label

In [12]:
def filter_labels(labels):
    labels = labels.tolist()
    labels_truth = []
    for label in labels:
        if label[0].isdigit():
            labels_truth.append(False)
        else:
            labels_truth.append(True)
    return labels_truth

In [13]:
#For Train data
train_data = train_data[filter_labels(train_data["condition"])]
print("Train ", number_of_classes(train_data))

#For Test data
test_data = test_data[filter_labels(test_data["condition"])]
print("Test ", number_of_classes(test_data))

Number of classes:  210
Train  None
Number of classes:  811
Test  None


In [14]:
# keeping those classes which are in the training set, in to the test set
test_data = test_data[test_data["condition"].isin(train_data["condition"])]
number_of_classes(test_data)
plot_bar_chart(train_data)

Number of classes:  210
Number of classes:  210


In [15]:
train_data["label"] = train_data['condition']
test_data["label"] = test_data['condition']

In [16]:
train_data = train_data.sample(frac=1)
X = train_data['review']
Y = train_data['label']

**Feature Extraction**

In [17]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
X_count_vec = count_vectorizer.fit_transform(X)
#Y_count_vec = count_vectorizer.fit_transform(Y)
print(X_count_vec)
#print(Y_count_vec)

  (0, 13108)	2
  (0, 206093)	1
  (0, 136689)	4
  (0, 70613)	2
  (0, 6890)	1
  (0, 40000)	1
  (0, 213231)	1
  (0, 71916)	1
  (0, 16538)	4
  (0, 271745)	1
  (0, 111918)	3
  (0, 334882)	1
  (0, 159306)	1
  (0, 108583)	1
  (0, 260815)	1
  (0, 216217)	1
  (0, 264963)	1
  (0, 91820)	1
  (0, 330712)	3
  (0, 203520)	1
  (0, 268027)	1
  (0, 335852)	1
  (0, 271835)	1
  (0, 200997)	1
  (0, 49473)	1
  :	:
  (19837, 184832)	1
  (19837, 24077)	1
  (19837, 285822)	1
  (19837, 300431)	1
  (19837, 311876)	1
  (19837, 70643)	1
  (19837, 85575)	1
  (19837, 63799)	1
  (19837, 227085)	1
  (19837, 326280)	1
  (19837, 59002)	1
  (19837, 195455)	1
  (19837, 326281)	1
  (19837, 326238)	1
  (19837, 59003)	1
  (19837, 89282)	1
  (19837, 210544)	1
  (19837, 227089)	1
  (19837, 315523)	1
  (19837, 136232)	1
  (19837, 184835)	1
  (19837, 203850)	1
  (19837, 204896)	1
  (19837, 285825)	1
  (19837, 190504)	1


**Model Training and Testing**

In [18]:

# training the Random Forest Classifier on complete training data
fin_clf = RandomForestClassifier(n_estimators=100)
fin_clf.fit(X_count_vec, Y)

# transforming test_data with count vectorizer
X_test_vec = count_vectorizer.transform(test_data['review'])

# getting preds on the test data
preds = fin_clf.predict(X_test_vec)

In [19]:
from sklearn.metrics import accuracy_score
print("Accuracy on test data: ", accuracy_score(test_data["label"], 
                                               preds))

Accuracy on test data:  0.5244757073374824


In [20]:
print(preds)

['adhd' 'abnormal uterine bleeding' 'abnormal uterine bleeding' ...
 'rheumatoid arthritis' 'underactive thyroid' 'constipation, chronic']


In [22]:
preds = fin_clf.predict(X_test_vec[0])

In [25]:
print(preds)

['adhd']


In [23]:
count_vectorizer.inverse_transform(X_test_vec[0])

[array(['all', 'all the', 'and', 'and she', 'and slept', 'and so',
        'and with', 'at', 'at school', 'be', 'be on', 'became',
        'became concerned', 'bed', 'bed was', 'been', 'been problem',
        'began', 'began this', 'behavior', 'behavior is', 'better',
        'called', 'concerned', 'concerned when', 'could', 'could hardly',
        'cranky', 'cranky and', 'days', 'days have', 'days he', 'days see',
        'did', 'did at', 'different', 'different medications', 'doctor',
        'doctor on', 'dose', 'dose he', 'drive', 'drive home', 'effective',
        'emotional', 'ever', 'far', 'far this', 'few', 'few days', 'for',
        'for him', 'for nearly', 'for two', 'fourth', 'fourth week',
        'free', 'from', 'from school', 'get', 'get out', 'getting',
        'getting up', 'good', 'good thing', 'halfway', 'halfway through',
        'hardly', 'hardly get', 'have', 'have been', 'have tried', 'he',
        'he began', 'he could', 'he did', 'he is', 'he should',
        'h

In [24]:
test_data['review'].iloc[0]

'my son is halfway through his fourth week of intuniv. we became concerned when he began this last week, when he started taking the highest dose he will be on. for two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) i called his doctor on monday morning and she said to stick it out a few days. see how he did at school, and with getting up in the morning. the last two days have been problem free. he is much more agreeable than ever. he is less emotional (a good thing), less cranky. he is remembering all the things he should. overall his behavior is better. we have tried many different medications and so far this is the most effective.'

In [None]:
from sklearn import svm
clf_svm = svm.SVC()
clf_svm.fit(X_count_vec,Y)
pred_svm = clf_svm.predict(X_test_vec)


In [27]:
from sklearn.metrics import accuracy_score
print("Accuracy on test data: ", accuracy_score(test_data["label"], 
                                                pred_svm))

Accuracy on test data:  0.4183881596150829


In [28]:
preds = fin_clf.predict(X_test_vec[0])

In [None]:
print(preds)