In [1]:
from __future__ import print_function

In [6]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
newsgroups = fetch_20newsgroups(categories=['sci.med', 'sci.space', 'talk.politics.guns']
                                ,remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroups.data
print(len(docs_raw))

1733


In [9]:
# Convert to document-term matrix
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

In [10]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)



(1733, 2589)


In [12]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=20, random_state=0)

In [13]:
# Visualize
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [14]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

In [15]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

In [16]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

## Association rule mining

In [18]:
import pandas as pd
ratings_df = pd.read_csv('../DataSets/ratings_small.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [23]:
# Select only movies with ratings larger than 3
ratings_df = ratings_df[ratings_df["rating"] > 3]
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
4,1,1172,4.0,1260759205
8,1,1339,3.5,1260759125
12,1,1953,4.0,1260759191
13,1,2105,4.0,1260759139
20,2,10,4.0,835355493
...,...,...,...,...
99996,671,5991,4.5,1064245387
99997,671,5995,4.0,1066793014
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363


In [26]:
# Create list of movieIDs for each UserID
ratings_list = ratings_df.groupby("userId")["movieId"].apply(list)
ratings_list

userId
1                               [1172, 1339, 1953, 2105]
2      [10, 17, 39, 47, 50, 110, 150, 153, 222, 253, ...
3      [110, 247, 296, 318, 356, 736, 778, 1197, 1235...
4      [10, 34, 112, 141, 153, 260, 289, 296, 349, 35...
5      [3, 39, 104, 141, 150, 231, 277, 344, 356, 364...
                             ...                        
667    [6, 32, 36, 41, 58, 110, 144, 150, 161, 232, 2...
668    [296, 318, 593, 608, 1213, 1221, 1233, 1358, 2...
669    [223, 260, 785, 913, 968, 1304, 1953, 2395, 23...
670    [1, 25, 34, 36, 47, 50, 318, 457, 527, 593, 60...
671    [1, 36, 50, 230, 260, 296, 318, 356, 457, 529,...
Name: movieId, Length: 671, dtype: object

In [28]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

te = TransactionEncoder()
te_ary = te.fit(ratings_list).transform(ratings_list)
df_te = pd.DataFrame(te_ary, columns=te.columns_)

In [30]:
frequent_itemsets = apriori(df_te, min_support=0.15, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.271237,(1)
1,0.219076,(32)
2,0.242921,(47)
3,0.274218,(50)
4,0.251863,(110)
...,...,...
190,0.165425,"(5952, 4993, 2571)"
191,0.154993,"(4993, 2571, 7153)"
192,0.153502,"(5952, 7153, 2571)"
193,0.187779,"(5952, 4993, 7153)"


In [31]:
from mlxtend.frequent_patterns import association_rules
# Try confidence of .7
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(47),(296),0.242921,0.400894,0.196721,0.809816,2.020024,0.099336,3.150137
1,(50),(296),0.274218,0.400894,0.204173,0.744565,1.857261,0.094241,2.345436
2,(50),(318),0.274218,0.426230,0.192250,0.701087,1.644858,0.075371,1.919523
3,(110),(356),0.251863,0.406855,0.183308,0.727811,1.788868,0.080837,2.179162
4,(1196),(260),0.299553,0.368107,0.251863,0.840796,2.284106,0.141595,3.969076
...,...,...,...,...,...,...,...,...,...
69,"(1210, 260, 1198)",(1196),0.160954,0.299553,0.152012,0.944444,3.152847,0.103798,12.608048
70,"(1196, 260, 1198)",(1210),0.187779,0.265276,0.152012,0.809524,3.051632,0.102199,3.857303
71,"(1210, 1196)","(260, 1198)",0.216095,0.220566,0.152012,0.703448,3.189282,0.104349,2.628323
72,"(1210, 1198)","(1196, 260)",0.175857,0.251863,0.152012,0.864407,3.432053,0.107720,5.517511


In [32]:
# Try confidence of .3
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(296),(1),0.400894,0.271237,0.150522,0.375465,1.384268,0.041784,1.166888
1,(1),(296),0.271237,0.400894,0.150522,0.554945,1.384268,0.041784,1.346139
2,(1),(318),0.271237,0.426230,0.154993,0.571429,1.340659,0.039383,1.338798
3,(318),(1),0.426230,0.271237,0.154993,0.363636,1.340659,0.039383,1.145199
4,(1),(356),0.271237,0.406855,0.156483,0.576923,1.418005,0.046129,1.401978
...,...,...,...,...,...,...,...,...,...
325,"(260, 1198)","(1210, 1196)",0.220566,0.216095,0.152012,0.689189,3.189282,0.104349,2.522128
326,(1210),"(1196, 260, 1198)",0.265276,0.187779,0.152012,0.573034,3.051632,0.102199,1.902306
327,(1196),"(1210, 260, 1198)",0.299553,0.160954,0.152012,0.507463,3.152847,0.103798,1.703518
328,(260),"(1210, 1196, 1198)",0.368107,0.160954,0.152012,0.412955,2.565677,0.092764,1.429272


In [33]:
# Try support of .01
frequent_itemsets_1 = apriori(df_te, min_support=0.01, use_colnames=True)
frequent_itemsets_1

MemoryError: Unable to allocate 262. GiB for an array with shape (139542553, 3, 671) and data type bool

### Note for above - computation took about 1 min and then gave error of unable to allocate needed memory

In [34]:
# Try support of .001
frequent_itemsets_2 = apriori(df_te, min_support=0.01, use_colnames=True)
frequent_itemsets_2

MemoryError: Unable to allocate 262. GiB for an array with shape (139542553, 3, 671) and data type bool

### Again: note for above - computation took about 45 seconds and then gave error of unable to allocate needed memory

### Back to different confidence levels

In [35]:
# Try confidence of .9
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1221),(858),0.177347,0.277198,0.163934,0.92437,3.334689,0.114774,9.557046
1,(1291),(1198),0.172876,0.295082,0.157973,0.913793,3.096743,0.10696,8.177049
2,(5952),(4993),0.229508,0.251863,0.210134,0.915584,3.635249,0.15233,8.862547
3,"(1210, 1196)",(260),0.216095,0.368107,0.196721,0.910345,2.473042,0.117175,7.048034
4,"(2571, 1196)",(260),0.183308,0.368107,0.165425,0.902439,2.451565,0.097948,6.4769
5,"(1210, 1198)",(260),0.175857,0.368107,0.160954,0.915254,2.486379,0.09622,7.456334
6,"(5952, 260)",(4993),0.153502,0.251863,0.150522,0.980583,3.893319,0.11186,38.529061
7,"(1210, 1198)",(1196),0.175857,0.299553,0.160954,0.915254,3.055401,0.108275,8.265276
8,"(5952, 2571)",(4993),0.175857,0.251863,0.165425,0.940678,3.734881,0.121133,12.611454
9,"(7153, 2571)",(4993),0.165425,0.251863,0.154993,0.936937,3.720028,0.113328,11.863317


- I discovered that a lot of rules tend to have overlap at this higher threshold. It looks like many rules add to the appearance of the same consequents
- Although the support is low across the board, the high confidence reveals that these consequents are true a lot of the time