In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Loading dataset from kaggle after connect noteboook with drive
# https://www.kaggle.com/datasets/ujjwalaggarwal402/medicine-dataset/data
df_raw = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/machine_learning/ML_Projects/data/medicine_dataset.csv')
df_raw.head()

Unnamed: 0,Name,Category,Dosage Form,Strength,Manufacturer,Indication,Classification
0,Acetocillin,Antidiabetic,Cream,938 mg,Roche Holding AG,Virus,Over-the-Counter
1,Ibuprocillin,Antiviral,Injection,337 mg,CSL Limited,Infection,Over-the-Counter
2,Dextrophen,Antibiotic,Ointment,333 mg,Johnson & Johnson,Wound,Prescription
3,Clarinazole,Antifungal,Syrup,362 mg,AbbVie Inc.,Pain,Prescription
4,Amoxicillin,Antifungal,Tablet,802 mg,Teva Pharmaceutical Industries Ltd.,Wound,Over-the-Counter


In [3]:
df = df_raw.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            50000 non-null  object
 1   Category        50000 non-null  object
 2   Dosage Form     50000 non-null  object
 3   Strength        50000 non-null  object
 4   Manufacturer    50000 non-null  object
 5   Indication      50000 non-null  object
 6   Classification  50000 non-null  object
dtypes: object(7)
memory usage: 2.7+ MB


In [4]:
df.describe(include='object')

Unnamed: 0,Name,Category,Dosage Form,Strength,Manufacturer,Indication,Classification
count,50000,50000,50000,50000,50000,50000,50000
unique,64,8,8,999,20,8,2
top,Metostatin,Antidepressant,Inhaler,347 mg,Boehringer Ingelheim GmbH,Infection,Over-the-Counter
freq,860,6354,6364,77,2587,6393,25015


In [5]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Antidepressant,6354
Analgesic,6340
Antiseptic,6315
Antifungal,6289
Antipyretic,6280
Antiviral,6185
Antidiabetic,6171
Antibiotic,6066


In [6]:
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            50000 non-null  object
 1   Category        50000 non-null  object
 2   Dosage Form     50000 non-null  object
 3   Strength        50000 non-null  object
 4   Manufacturer    50000 non-null  object
 5   Indication      50000 non-null  object
 6   Classification  50000 non-null  object
dtypes: object(7)
memory usage: 2.7+ MB


In [7]:
df = df[['Category', 'Dosage Form', 'Strength', 'Indication', 'Classification']]
df.head()

Unnamed: 0,Category,Dosage Form,Strength,Indication,Classification
0,Antidiabetic,Cream,938 mg,Virus,Over-the-Counter
1,Antiviral,Injection,337 mg,Infection,Over-the-Counter
2,Antibiotic,Ointment,333 mg,Wound,Prescription
3,Antifungal,Syrup,362 mg,Pain,Prescription
4,Antifungal,Tablet,802 mg,Wound,Over-the-Counter


## 1. Prediction of Classification column

In [8]:
df_classification = df.copy()
df_classification.head()

Unnamed: 0,Category,Dosage Form,Strength,Indication,Classification
0,Antidiabetic,Cream,938 mg,Virus,Over-the-Counter
1,Antiviral,Injection,337 mg,Infection,Over-the-Counter
2,Antibiotic,Ointment,333 mg,Wound,Prescription
3,Antifungal,Syrup,362 mg,Pain,Prescription
4,Antifungal,Tablet,802 mg,Wound,Over-the-Counter


In [9]:
df_classification['Classification'] = df_classification['Classification'].map({'Over-the-Counter': 1, 'Prescription': 0})
df_classification['Strength'] = df_classification['Strength'].str.replace(' mg', '').astype(int)
df_classification.head()

Unnamed: 0,Category,Dosage Form,Strength,Indication,Classification
0,Antidiabetic,Cream,938,Virus,1
1,Antiviral,Injection,337,Infection,1
2,Antibiotic,Ointment,333,Wound,0
3,Antifungal,Syrup,362,Pain,0
4,Antifungal,Tablet,802,Wound,1


In [10]:
df_classification = pd.get_dummies(df_classification, columns=['Category', 'Dosage Form', 'Indication'], drop_first=True)

In [11]:
target = df_classification.pop('Classification')
target.head()

Unnamed: 0,Classification
0,1
1,1
2,0
3,0
4,1


In [12]:
data = df_classification.copy().astype(int)
data.head()

Unnamed: 0,Strength,Category_Antibiotic,Category_Antidepressant,Category_Antidiabetic,Category_Antifungal,Category_Antipyretic,Category_Antiseptic,Category_Antiviral,Dosage Form_Cream,Dosage Form_Drops,...,Dosage Form_Ointment,Dosage Form_Syrup,Dosage Form_Tablet,Indication_Diabetes,Indication_Fever,Indication_Fungus,Indication_Infection,Indication_Pain,Indication_Virus,Indication_Wound
0,938,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,337,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,333,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,362,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,802,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (37500, 22)
X_test shape: (12500, 22)
y_train shape: (37500,)
y_test shape: (12500,)


In [14]:
classification_scores = {}

### Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

score = log_reg.score(X_test, y_test)
score

0.4988

In [16]:
classification_scores['Logistic Regression'] = score
classification_scores

{'Logistic Regression': 0.4988}

### KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, {'n_neighbors': range(3, 7), 'n_jobs': [-1]}, cv=5)
knn_grid.fit(X_train, y_train)

score = knn_grid.score(X_test, y_test)
score

  _data = np.array(data, dtype=dtype, copy=copy,


0.502

In [18]:
knn_grid.best_params_

{'n_jobs': -1, 'n_neighbors': 3}

In [19]:
classification_scores['KNN'] = score
classification_scores

{'Logistic Regression': 0.4988, 'KNN': 0.502}

### Decision tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(2, 6),
    'min_samples_leaf': range(1, 6)
    }
tree_grid = GridSearchCV(tree, params, cv=5)
tree_grid.fit(X_train, y_train)

score = tree_grid.score(X_test, y_test)
print(tree_grid.best_params_)
print(score)

{'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 3}
0.502


  _data = np.array(data, dtype=dtype, copy=copy,


In [21]:
classification_scores['Decision Tree'] = score
classification_scores

{'Logistic Regression': 0.4988, 'KNN': 0.502, 'Decision Tree': 0.502}

### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': range(50, 101, 10),
    'max_depth': range(3, 10),
    'min_samples_split': range(3, 6),
    'min_samples_leaf': range(3, 6)
    }
forest_grid = GridSearchCV(forest, params, cv=3)
forest_grid.fit(X_train, y_train)

score = forest_grid.score(X_test, y_test)
print(forest_grid.best_params_)
print(score)

{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 90}
0.49296


In [23]:
classification_scores['Random Forest'] = score
classification_scores

{'Logistic Regression': 0.4988,
 'KNN': 0.502,
 'Decision Tree': 0.502,
 'Random Forest': 0.49296}

### SVM

In [24]:
from sklearn.svm import SVC

svc = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
svc_grid = GridSearchCV(svc, params, cv=3)
svc_grid.fit(X_train, y_train)

score = svc_grid.score(X_test, y_test)
print(svc_grid.best_params_)
print(score)

{'kernel': 'linear'}
0.50112


In [25]:
classification_scores['SVC'] = score
classification_scores

{'Logistic Regression': 0.4988,
 'KNN': 0.502,
 'Decision Tree': 0.502,
 'Random Forest': 0.49296,
 'SVC': 0.50112}

### Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB

bayes = GaussianNB()
bayes.fit(X_train, y_train)

score = bayes.score(X_test, y_test)
score

0.49832

In [27]:
classification_scores['Naive Bayes'] = score
classification_scores

{'Logistic Regression': 0.4988,
 'KNN': 0.502,
 'Decision Tree': 0.502,
 'Random Forest': 0.49296,
 'SVC': 0.50112,
 'Naive Bayes': 0.49832}

In [28]:
scores_df = pd.DataFrame(classification_scores.items(), columns=['Model', 'Score'])
scores_df

Unnamed: 0,Model,Score
0,Logistic Regression,0.4988
1,KNN,0.502
2,Decision Tree,0.502
3,Random Forest,0.49296
4,SVC,0.50112
5,Naive Bayes,0.49832


In [29]:
px.bar(scores_df, x='Model', y='Score', title='Classification models scores')

### Summary: Classifiers can't predict if it's prescription or Over-the-Counter with high score. Maybe models was overfitted? Let's change data proportions

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target, train_size=0.2)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (10000, 22)
X_test shape: (40000, 22)
y_train shape: (10000,)
y_test shape: (40000,)


In [31]:
classification_scores = {}

# Logistic Regression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_reg_score = log_reg.score(X_test, y_test)
print(log_reg_score)
classification_scores['Logistic Regression'] = log_reg_score

# KNN
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, {'n_neighbors': range(3, 7), 'n_jobs': [-1]}, cv=5)
knn_grid.fit(X_train, y_train)

knn_score = knn_grid.score(X_test, y_test)
print(knn_score)
classification_scores['KNN'] = knn_score

# Decision tree
tree = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(2, 6),
    'min_samples_leaf': range(1, 6)
    }
tree_grid = GridSearchCV(tree, params, cv=5)
tree_grid.fit(X_train, y_train)

tree_score = tree_grid.score(X_test, y_test)
print(tree_score)
classification_scores['Decision Tree'] = tree_score

# Random Forest
forest = RandomForestClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': range(50, 101, 10),
    'max_depth': range(3, 10),
    'min_samples_split': range(3, 6),
    'min_samples_leaf': range(3, 6)
    }
forest_grid = GridSearchCV(forest, params, cv=3)
forest_grid.fit(X_train, y_train)

forest_score = forest_grid.score(X_test, y_test)
print(forest_score)
classification_scores['Random Forest'] = forest_score

# Naive Bayes
bayes = GaussianNB()
bayes.fit(X_train, y_train)

bayes_score = bayes.score(X_test, y_test)
print(bayes_score)
classification_scores['Naive Bayes'] = bayes_score

# SVM
svc = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
svc_grid = GridSearchCV(svc, params, cv=3)
svc_grid.fit(X_train, y_train)

svc_score = svc_grid.score(X_test, y_test)
print(svc_score)
classification_scores['SVC'] = svc_score

0.50145



invalid value encountered in cast



0.4996



invalid value encountered in cast



0.503



invalid value encountered in cast



0.498775
0.499025
0.4989


### Even with only 10k samples it's almost the same score like 50% Let's try use only 5k samples to train

In [33]:
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target, train_size=0.1)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (5000, 22)
X_test shape: (45000, 22)
y_train shape: (5000,)
y_test shape: (45000,)


In [34]:
classification_scores = {}

# Logistic Regression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_reg_score = log_reg.score(X_test, y_test)
print(log_reg_score)
classification_scores['Logistic Regression'] = log_reg_score

# KNN
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, {'n_neighbors': range(3, 7), 'n_jobs': [-1]}, cv=5)
knn_grid.fit(X_train, y_train)

knn_score = knn_grid.score(X_test, y_test)
print(knn_score)
classification_scores['KNN'] = knn_score

# Decision tree
tree = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(2, 6),
    'min_samples_leaf': range(1, 6)
    }
tree_grid = GridSearchCV(tree, params, cv=5)
tree_grid.fit(X_train, y_train)

tree_score = tree_grid.score(X_test, y_test)
print(tree_score)
classification_scores['Decision Tree'] = tree_score

# Random Forest
forest = RandomForestClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': range(50, 101, 10),
    'max_depth': range(3, 10),
    'min_samples_split': range(3, 6),
    'min_samples_leaf': range(3, 6)
    }
forest_grid = GridSearchCV(forest, params, cv=3)
forest_grid.fit(X_train, y_train)

forest_score = forest_grid.score(X_test, y_test)
print(forest_score)
classification_scores['Random Forest'] = forest_score

# Naive Bayes
bayes = GaussianNB()
bayes.fit(X_train, y_train)

bayes_score = bayes.score(X_test, y_test)
print(bayes_score)
classification_scores['Naive Bayes'] = bayes_score

# SVM
svc = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
svc_grid = GridSearchCV(svc, params, cv=3)
svc_grid.fit(X_train, y_train)

svc_score = svc_grid.score(X_test, y_test)
print(svc_score)
classification_scores['SVC'] = svc_score

0.5027333333333334



invalid value encountered in cast



0.5031777777777777



invalid value encountered in cast



0.5003111111111112



invalid value encountered in cast



0.49977777777777777
0.5056
0.4996888888888889


### As in previous tries, scores oscillates around 50%. Last one try to reduce train data to only 1k samples.

In [36]:
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target, train_size=0.02)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (1000, 22)
X_test shape: (49000, 22)
y_train shape: (1000,)
y_test shape: (49000,)


In [42]:
classification_scores = {}

# Logistic Regression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_reg_score = log_reg.score(X_test, y_test)
print(log_reg_score)
classification_scores['Logistic Regression'] = log_reg_score

# KNN
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, {'n_neighbors': range(3, 7), 'n_jobs': [-1]}, cv=5)
knn_grid.fit(X_train, y_train)

knn_score = knn_grid.score(X_test, y_test)
print(knn_score)
classification_scores['KNN'] = knn_score

# Decision tree
tree = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(2, 6),
    'min_samples_leaf': range(1, 6)
    }
tree_grid = GridSearchCV(tree, params, cv=5)
tree_grid.fit(X_train, y_train)

tree_score = tree_grid.score(X_test, y_test)
print(tree_score)
classification_scores['Decision Tree'] = tree_score

# Random Forest
forest = RandomForestClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': range(50, 101, 10),
    'max_depth': range(3, 10),
    'min_samples_split': range(3, 6),
    'min_samples_leaf': range(3, 6)
    }
forest_grid = GridSearchCV(forest, params, cv=3)
forest_grid.fit(X_train, y_train)

forest_score = forest_grid.score(X_test, y_test)
print(forest_score)
classification_scores['Random Forest'] = forest_score

# Naive Bayes
bayes = GaussianNB()
bayes.fit(X_train, y_train)

bayes_score = bayes.score(X_test, y_test)
print(bayes_score)
classification_scores['Naive Bayes'] = bayes_score

# SVM
svc = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
svc_grid = GridSearchCV(svc, params, cv=3)
svc_grid.fit(X_train, y_train)

svc_score = svc_grid.score(X_test, y_test)
print(svc_score)
classification_scores['SVC'] = svc_score


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



0.4986938775510204



invalid value encountered in cast



0.49889795918367347



invalid value encountered in cast



0.49351020408163265
0.49451020408163265
0.4993265306122449
0.49895918367346936


In [43]:
scores_df = pd.DataFrame(classification_scores.items(), columns=['Model', 'Score'])
scores_df

Unnamed: 0,Model,Score
0,Logistic Regression,0.498694
1,KNN,0.498898
2,Decision Tree,0.49351
3,Random Forest,0.49451
4,Naive Bayes,0.499327
5,SVC,0.498959


In [44]:
px.bar(scores_df, x='Model', y='Score', title='Classification models scores, 50k samples, train size=0.02')

### Even worse scores