In [125]:
import pandas as pd
import nltk
import numpy as np
from collections import Counter
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/stroud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [126]:
# Load the data
# Only the first 1000 rows are used, so let the parser stop there (nrows)
# instead of reading the entire file and slicing the result afterwards.
d = pd.read_csv('../data/drugsComTrain_raw.tsv', sep='\t', nrows=1000)
len(d)

1000

In [127]:
# Clean out broken condition names
# Keep a row only when its condition is an actual string (missing values show
# up as non-string entries) that does not contain leftover '</span>' markup.
notspan = d.condition.apply(lambda x: isinstance(x, str) and '</span>' not in x)
# .copy() makes the filtered frame independent of the original, so columns
# added to it later do not raise pandas' SettingWithCopyWarning.
d = d[notspan].copy()

In [128]:
# Preview the first few review texts
d['review'].head()

0    "It has no side effect, I take it in combinati...
1    "My son is halfway through his fourth week of ...
2    "I used to take another oral contraceptive, wh...
3    "This is my first time using any form of birth...
4    "Suboxone has completely turned my life around...
Name: review, dtype: object

In [129]:
# Preview the matching ratings
d['rating'].head()

0    9.0
1    8.0
2    5.0
3    8.0
4    9.0
Name: rating, dtype: float64

In [130]:
# Split each sentence into words, or "tokens"
# assign() returns a new frame rather than writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
d = d.assign(
    tokens=d.review.apply(word_tokenize).apply(lambda x: x[1:-1])  # shave off quotation marks
)
# The rows were filtered without resetting the index, so fetch the first row by
# position; the label 0 is not guaranteed to still exist.
d.tokens.iloc[0]

['It',
 'has',
 'no',
 'side',
 'effect',
 ',',
 'I',
 'take',
 'it',
 'in',
 'combination',
 'of',
 'Bystolic',
 '5',
 'Mg',
 'and',
 'Fish',
 'Oil']

In [184]:
# Count up the 100 most common words
# Tally every token across all reviews in one pass.
token_counter = Counter(token for tokens in d['tokens'] for token in tokens)
vocab_counts = token_counter.most_common(100)  # (word, count) pairs, most frequent first
print(vocab_counts[:10])
vocab = [word for word, count in vocab_counts]  # keep the words, drop the counts
vocab_dict = dict(zip(vocab, range(len(vocab))))  # word -> position in vocab

[('I', 5184), ('.', 5067), ('and', 2644), (',', 2557), ('the', 2066), ('to', 1969), (';', 1764), ('&', 1739), ('a', 1684), ('it', 1563)]


In [185]:
# Convert each review's tokens into binary indicators
def tokens2array(tokens):
    """Encode a token sequence as a 0/1 presence vector over the vocabulary.

    Position ``vocab_dict[word]`` of the result is 1 when ``word`` occurs at
    least once in ``tokens`` and 0 otherwise. Tokens missing from
    ``vocab_dict`` are ignored, and repeated tokens still yield 1 (presence,
    not a count).

    Reads the module-level ``vocab`` (for the vector length) and ``vocab_dict``
    (word -> position).

    Returns a float NumPy array of shape ``(len(vocab),)``.
    """
    # Gather the positions of all known tokens, then switch them on in one go.
    positions = [vocab_dict[tok] for tok in tokens if tok in vocab_dict]
    array = np.zeros((len(vocab),))
    array[np.asarray(positions, dtype=np.intp)] = 1
    return array

In [186]:
# Encode every review's token list as a binary indicator vector
d['array'] = d.tokens.apply(tokens2array)

In [187]:
# Feature matrix: stack the per-review indicator vectors into one 2-D array
Xtr = np.stack(d['array'].values)
# Binary target: True when the rating is above 5
Ytr = d['rating'].values > 5

In [188]:
# Sanity check: (number of reviews, vocabulary size)
np.shape(Xtr)

(984, 100)

In [189]:
# Sanity check: one label per review
np.shape(Ytr)

(984,)

In [190]:
import sklearn

In [216]:
# Import the estimator from its public location: a bare `import sklearn` does
# not load submodules, and the private `sklearn.linear_model.logistic` module
# no longer exists in current scikit-learn releases.
from sklearn.linear_model import LogisticRegression

# The L1 penalty needs a solver that supports it, so name liblinear explicitly
# rather than relying on the library default.
model = LogisticRegression(penalty='l1', C=100, solver='liblinear')

In [217]:
# Fit the classifier on the indicator features and binary rating labels
model.fit(X=Xtr, y=Ytr)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [218]:
# Per-review class probabilities on the training data (one column per class)
model.predict_proba(X=Xtr)

array([[0.45290057, 0.54709943],
       [0.5148556 , 0.4851444 ],
       [0.37719083, 0.62280917],
       ...,
       [0.75921981, 0.24078019],
       [0.06467981, 0.93532019],
       [0.14972084, 0.85027916]])

In [221]:
# Accuracy on the same data the model was fitted on (not a held-out estimate)
np.mean(model.predict(Xtr) == Ytr)

0.7571138211382114

In [222]:
# Share of reviews labelled True, i.e. the accuracy of always predicting True
np.mean(Ytr)

0.6920731707317073

In [223]:
# Words in order of usage
# Columns: position, word, corpus count, fitted coefficient
for i, (word, entry) in enumerate(zip(vocab, vocab_counts)):
    print(i, '\t', word, '\t', entry[1], '\t', model.coef_[0][i])

0 	 I 	 5184 	 -0.16928512648504895
1 	 . 	 5067 	 0.7192804671012172
2 	 and 	 2644 	 -0.4116934417186868
3 	 , 	 2557 	 -0.0411241136152573
4 	 the 	 2066 	 0.012641508902283409
5 	 to 	 1969 	 -0.5325539328969976
6 	 ; 	 1764 	 0.2599293016577009
7 	 & 	 1739 	 0.2179578748263298
8 	 a 	 1684 	 0.2920173413960011
9 	 it 	 1563 	 -0.17777194862415419
10 	 my 	 1538 	 -0.09634047179022644
11 	 # 	 1521 	 0.8596183070653756
12 	 039 	 1516 	 -1.1186394636928518
13 	 for 	 1199 	 -0.2818131758316536
14 	 was 	 1057 	 -0.24145111806818936
15 	 have 	 1033 	 0.3531713062331918
16 	 of 	 1003 	 0.17328096159896428
17 	 on 	 771 	 0.23468611957493105
18 	 in 	 728 	 0.026270987104978882
19 	 is 	 727 	 0.7046297449280254
20 	 me 	 723 	 -0.04203504468730681
21 	 had 	 655 	 0.12100361252422397
22 	 with 	 650 	 0.007246006634633771
23 	 but 	 644 	 0.3640540952397985
24 	 this 	 640 	 -0.45200259066112974
25 	 ! 	 639 	 0.8249159977990878
26 	 that 	 599 	 0.2731913907805206
27 	 t 	 524 	 

In [224]:
# Words in order of coefficient
# argsort is ascending, so the most negative weights are listed first.
coef_order = np.argsort(model.coef_[0])
for i in coef_order:
    word, count = vocab[i], vocab_counts[i][1]
    print(i, '\t', word, '\t', count, '\t', model.coef_[0][i])

12 	 039 	 1516 	 -1.1186394636928518
45 	 s 	 287 	 -0.7950763425027414
27 	 t 	 524 	 -0.7048022892251979
60 	 out 	 238 	 -0.6923893083597499
61 	 months 	 235 	 -0.6731504274737775
30 	 not 	 461 	 -0.6219464178596439
49 	 ) 	 282 	 -0.60059121774811
95 	 bad 	 152 	 -0.5650300418468422
5 	 to 	 1969 	 -0.5325539328969976
97 	 off 	 150 	 -0.5320811243488003
84 	 pill 	 168 	 -0.4531711526711092
24 	 this 	 640 	 -0.45200259066112974
96 	 This 	 150 	 -0.42326294802450765
2 	 and 	 2644 	 -0.4116934417186868
38 	 m 	 319 	 -0.4025168001800114
85 	 weeks 	 168 	 -0.3271421250153155
58 	 The 	 248 	 -0.32066048279228265
81 	 week 	 185 	 -0.29923532513474194
13 	 for 	 1199 	 -0.2818131758316536
51 	 ve 	 277 	 -0.2711361329049879
14 	 was 	 1057 	 -0.24145111806818936
9 	 it 	 1563 	 -0.17777194862415419
0 	 I 	 5184 	 -0.16928512648504895
99 	 did 	 149 	 -0.16169846672259972
50 	 days 	 277 	 -0.15942962910732797
53 	 started 	 266 	 -0.15630600837707598
87 	 took 	 166 	 -0.15200