## User-rating-based drug recommendation system (drug-to-drug cosine similarity)

In [6]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [7]:
# Load the cleaned review data; the relative path is resolved against the current working directory.
cleaned_csv_path = '../data/cleaned/cleaned_df.csv'
df = pd.read_csv(cleaned_csv_path)

In [8]:
# Count missing values per column (`isna` is the canonical name; `isnull` is its alias).
df.isna().sum()

Rdate                   0
comment                 9
condition               0
drug                    0
easeofuse               0
effectiveness           0
helpful                 0
reviewer              253
satisfaction            0
patient                 0
caregiver               0
male                    0
female                  0
treatment_period_1      0
treatment_period_2      0
treatment_period_3      0
treatment_period_4      0
treatment_period_5      0
treatment_period_6      0
treatment_period_7      0
age_group_1             0
age_group_2             0
age_group_3             0
age_group_4             0
age_group_5             0
age_group_6             0
age_group_7             0
dtype: int64

In [9]:
# Discard every row that has at least one missing value
# (per the null counts displayed earlier, only `comment` and `reviewer` contain any).
df = df.dropna(axis=0, how='any')

In [10]:
# Keep only the three columns used to build the rating matrix.
rating_columns = ['drug', 'satisfaction', 'reviewer']
sat_df = df[rating_columns]

In [11]:
# Drug x reviewer matrix of satisfaction scores.
# Cells with no review stay NaN; repeated (drug, reviewer) pairs are averaged,
# because 'mean' is the default aggregation of pivot_table (spelled out here).
pivot = sat_df.pivot_table(
    values='satisfaction',
    index='drug',
    columns='reviewer',
    aggfunc='mean',
)
pivot

reviewer,"&quot;Jean&quot;, 55-64 Female (Patient)",(Patient),"******, 35-44 Female on Treatment for less than 1 month (Patient)","*, 55-64 Female on Treatment for 1 to 6 months (Patient)",", 75 or over Male on Treatment for 1 to less than 2 years (Patient)","0726, 55-64 Female on Treatment for 2 to less than 5 years (Patient)","1, 55-64 (Patient)",13-18 Female on Treatment for 1 to 6 months (Patient),"143webmd, 65-74 Male on Treatment for 2 to less than 5 years (Patient)","19 years on insulin, 35-44 Female on Treatment for 5 to less than 10 years (Patient)",...,"zandalee, 65-74 Female (Patient)","zdv6zd, 65-74 Female on Treatment for less than 1 month (Patient)","zeborah harmon, 45-54 Female on Treatment for less than 1 month (Patient)","zebulonboy, 55-64 Male on Treatment for 1 to 6 months (Patient)","zippy, 65-74 on Treatment for less than 1 month (Patient)","zman, 55-64 Male on Treatment for less than 1 month (Patient)","zohan777, 45-54 Male on Treatment for 1 to 6 months (Patient)","zorro, 65-74 Male on Treatment for 1 to 6 months (Patient)","zya4re, 45-54 Female on Treatment for less than 1 month (Caregiver)","zzzzzzzz, 75 or over Male on Treatment for 10 years or more (Patient)"
drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Actoplus MET oral,,,,,,,,,,,...,,,,,,,,,,
Actos oral,,2.0,,,,,,,1.0,,...,,,,,,,,,1.0,2.0
Admelog U-100 Insulin lispro subcutaneous,,,,,,,,,,,...,,,,,,,,,,
Afrezza inhalation,,,,,,,,,,,...,,,,,,,,,,
Amaryl oral,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
metformin oral,,1.0,,,,,1.0,1.0,,,...,,1.0,,,,,,,,
nateglinide oral,,,,,,,,,,,...,,,,,,,,,,
pioglitazone oral,,,,,,,,,,,...,,,,,,,,,,
pioglitazone-metformin oral,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Rows are drugs, columns are distinct reviewer labels.
n_drugs, n_reviewers = pivot.shape
(n_drugs, n_reviewers)

(70, 3911)

In [13]:
# Missing ratings are filled with 0 before conversion, so only real ratings
# end up as stored (non-zero) entries of the CSR matrix.
ratings_filled = pivot.fillna(0)
sparse_pivot = sparse.csr_matrix(ratings_filled)
print(sparse_pivot)

  (0, 28)	4.0
  (0, 48)	1.0
  (0, 50)	5.0
  (0, 55)	3.0
  (0, 57)	2.0
  (0, 66)	5.0
  (0, 68)	2.5
  (0, 74)	1.0
  (0, 93)	4.0
  (0, 105)	5.0
  (0, 113)	5.0
  (0, 118)	4.0
  (0, 121)	3.0
  (0, 125)	4.0
  (0, 127)	4.0
  (0, 134)	2.0
  (0, 152)	3.5
  (0, 155)	5.0
  (0, 156)	2.0
  (0, 371)	5.0
  (0, 548)	4.0
  (0, 732)	3.0
  (0, 751)	3.0
  (0, 763)	1.0
  (0, 829)	3.0
  :	:
  (67, 3737)	5.0
  (67, 3822)	4.0
  (67, 3856)	2.0
  (67, 3900)	5.0
  (68, 79)	3.0
  (68, 155)	4.0
  (68, 163)	4.0
  (68, 384)	3.0
  (68, 1042)	4.0
  (68, 1795)	3.0
  (69, 108)	5.0
  (69, 111)	3.0
  (69, 121)	1.0
  (69, 155)	5.0
  (69, 369)	3.0
  (69, 438)	5.0
  (69, 575)	4.0
  (69, 966)	1.0
  (69, 1519)	1.0
  (69, 1563)	2.0
  (69, 2916)	2.0
  (69, 3221)	3.0
  (69, 3274)	5.0
  (69, 3578)	2.0
  (69, 3771)	4.0


In [14]:
# Pairwise cosine distance between the rows of sparse_pivot (one row per drug).
# Cosine distance is 1 - cosine similarity, so a distance of 1 means a similarity of 0.
dists = cosine_distances(sparse_pivot)

dists

array([[0.        , 0.94401971, 1.        , ..., 0.9647604 , 0.91443728,
        0.92135545],
       [0.94401971, 0.        , 1.        , ..., 0.9503404 , 0.93531576,
        0.95519507],
       [1.        , 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.9647604 , 0.9503404 , 1.        , ..., 0.        , 0.94508574,
        0.96722456],
       [0.91443728, 0.93531576, 1.        , ..., 0.94508574, 0.        ,
        0.82492476],
       [0.92135545, 0.95519507, 1.        , ..., 0.96722456, 0.82492476,
        0.        ]])

In [15]:
# Cosine similarity between the same drug rows; it is the complement of the distance
# (similarity = 1 - distance). dense_output=True is the default and returns a dense array.
similarities = cosine_similarity(sparse_pivot, dense_output=True)

In [16]:
# Label the square distance matrix with the drug names on both axes,
# so a drug's distances to all others can be looked up by name.
drug_labels = pivot.index
recommender_df = pd.DataFrame(data=dists, index=drug_labels, columns=drug_labels)
recommender_df.head()

drug,Actoplus MET oral,Actos oral,Admelog U-100 Insulin lispro subcutaneous,Afrezza inhalation,Amaryl oral,Apidra subcutaneous,Avandamet oral,Avandaryl oral,Avandia oral,Bydureon subcutaneous,...,glimepiride oral,glipizide oral,glipizide-metformin oral,glyburide oral,glyburide-metformin oral,metformin oral,nateglinide oral,pioglitazone oral,pioglitazone-metformin oral,sitagliptin oral
drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Actoplus MET oral,0.0,0.94402,1.0,0.985659,0.969641,1.0,1.0,1.0,0.983891,0.95761,...,0.963123,0.934219,0.964741,0.932174,0.993971,0.951658,1.0,0.96476,0.914437,0.921355
Actos oral,0.94402,0.0,1.0,0.969212,0.921478,1.0,0.974539,0.946608,0.931496,0.95142,...,0.946789,0.881894,0.960017,0.911682,0.950321,0.924438,0.96163,0.95034,0.935316,0.955195
Admelog U-100 Insulin lispro subcutaneous,1.0,1.0,0.0,0.978946,0.964526,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.989276,1.0,1.0,1.0,1.0
Afrezza inhalation,0.985659,0.969212,0.978946,0.0,0.948218,1.0,0.996435,0.995499,0.986058,0.983648,...,0.965299,0.94645,0.98111,0.959559,0.996124,0.978343,0.969125,0.997425,1.0,0.977428
Amaryl oral,0.969641,0.921478,0.964526,0.948218,0.0,1.0,0.949943,0.911513,0.968567,0.963265,...,0.968769,0.918648,1.0,0.894334,0.993469,0.950674,0.965319,0.960959,0.930486,1.0


In [41]:
# Move the drug index into an ordinary first column; every column after it is a drug name.
drug_name = recommender_df.reset_index(drop=False)
drug_name.columns[1:]

Index(['Actoplus MET oral', 'Actos oral',
       'Admelog U-100 Insulin lispro subcutaneous', 'Afrezza inhalation',
       'Amaryl oral', 'Apidra subcutaneous', 'Avandamet oral',
       'Avandaryl oral', 'Avandia oral', 'Bydureon subcutaneous',
       'Byetta subcutaneous', 'DUETACT oral', 'Farxiga oral', 'Fortamet oral',
       'Glucophage XR oral', 'Glucophage oral', 'Glucotrol XL oral',
       'Glucotrol oral', 'Glucovance oral', 'Glumetza oral', 'Glyxambi oral',
       'Humalog KwikPen subcutaneous',
       'Humalog Mix 75-25 KwikPen subcutaneous',
       'Humalog Mix 75-25 subcutaneous', 'Humalog subcutaneous',
       'Humulin 70-30 subcutaneous', 'Invokamet oral', 'Invokana oral',
       'Janumet XR oral', 'Janumet oral', 'Januvia oral', 'Jardiance oral',
       'Jentadueto oral', 'Kombiglyze XR oral', 'Lantus Solostar subcutaneous',
       'Lantus subcutaneous', 'Levemir FlexTouch subcutaneous',
       'Levemir Flexpen subcutaneous', 'Levemir subcutaneous',
       'Micronase ora

In [17]:
# For every drug whose name contains `search`, print its average satisfaction,
# its number of rated reviewer columns, and the 10 drugs with the smallest cosine
# distance to it (smaller distance = more similar rating pattern).
search = 'metformin oral'
# regex=False: match the query as literal text, so characters such as '(' or '+'
# in a drug name are not interpreted as regular-expression syntax.
drugs = pivot.index[pivot.index.str.contains(search, regex=False)]

for drug in drugs:
    # One row of the rating matrix: this drug's score per reviewer (NaN = no review).
    # A row lookup avoids transposing the whole matrix on every iteration.
    drug_ratings = pivot.loc[drug]
    # Remove the drug itself by label instead of assuming it sorts into position 0;
    # another drug tied at distance 0 could otherwise push it into the results.
    related = recommender_df[drug].drop(drug).sort_values().head(10)

    print(drug)
    print('Satisfaction (Avg)', drug_ratings.mean())
    print('Number of reviews', drug_ratings.count())
    print('')
    print('related drugs')
    print(related)
    print('')
    print('*******************************************************************************************')
    print('')

glipizide-metformin oral
Satisfaction (Avg) 3.0
Number of reviews 15

related drugs
drug
Jentadueto oral                   0.913699
Glucovance oral                   0.935675
acarbose oral                     0.937655
Ozempic subcutaneous              0.937920
Janumet XR oral                   0.942820
Victoza 2-Pak subcutaneous        0.944726
Lantus Solostar subcutaneous      0.949843
Levemir FlexTouch subcutaneous    0.951436
glipizide oral                    0.955282
nateglinide oral                  0.958882
Name: glipizide-metformin oral, dtype: float64

*******************************************************************************************

glyburide-metformin oral
Satisfaction (Avg) 2.9523809523809526
Number of reviews 21

related drugs
drug
Levemir Flexpen subcutaneous    0.899293
Avandia oral                    0.908559
DUETACT oral                    0.911959
Glumetza oral                   0.912843
Micronase oral                  0.922656
glipizide oral                 

In [28]:
# Four nearest drugs to `drug` as a two-column frame (neighbour name, cosine distance).
# NOTE(review): `drug` is not assigned in this cell; it is presumably the value left over
# from the `for drug in drugs` search loop — confirm that is the intended target.
relevant_drugs = (
    recommender_df[drug]
    .drop(drug)        # exclude the drug itself by label, not by its sorted position
    .sort_values()     # ascending distance: most similar first
    .head(4)
    .to_frame()
    .reset_index()
)
relevant_drugs

Unnamed: 0,drug,pioglitazone-metformin oral
0,sitagliptin oral,0.824925
1,nateglinide oral,0.832368
2,Tanzeum subcutaneous,0.846381
3,Novolog Mix 70-30 FlexPen subcutaneous,0.855662


In [32]:
# Four nearest drugs to `drug` as a two-column frame (neighbour name, cosine distance).
# NOTE(review): `drug` is not assigned in this cell. The saved output is for a drug name
# that does not contain the 'metformin oral' search text, so it was presumably produced
# from kernel state that a fresh top-to-bottom run will not reproduce — confirm the
# intended target and assign it explicitly.
recommended = (
    recommender_df[drug]
    .drop(drug)        # exclude the drug itself by label, not by its sorted position
    .sort_values()     # ascending distance: most similar first
    .head(4)
    .to_frame()
    .reset_index()
)
recommended
    

Unnamed: 0,drug,Novolog Mix 70-30 FlexPen subcutaneous
0,Humulin 70-30 subcutaneous,0.804783
1,pioglitazone oral,0.837873
2,pioglitazone-metformin oral,0.855662
3,Toujeo SoloStar subcutaneous,0.868238
