In [6]:
import pandas as pd
import numpy as np
import pickle
import json
from copy import deepcopy
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from gensim.utils import simple_preprocess
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [7]:
df=pd.read_pickle('review_table.pickle')

In [8]:
df['incentivized']=0
df.loc[(df['isemployee']=='true')|(df['freeproduct']=='true'),'incentivized']=1

In [9]:
df['incentivized'].value_counts()

0    58911
1     4073
Name: incentivized, dtype: int64

In [10]:
reviews=df[['ReviewText','incentivized']]

In [11]:
reviews

Unnamed: 0,ReviewText,incentivized
0,I tried ItCosmetics about a year ago and loved...,0
1,I've purchased hundreds of items from Sephora ...,0
2,I'm obsessed with taking care of my skin and I...,0
3,"This is the only ""foundation"" ive ever bought ...",0
4,I have to use 6 blottung sheets after using this,0
5,I absolutely love this cc cream! I have hyper ...,0
6,I love love love love love this foundation. It...,0
7,I have dry skin that tends to have flaky patch...,0
8,This product does what it says. It really brig...,0
9,I have normal to dry skin and this just looked...,0


In [10]:
reviews['tokens'] = reviews['ReviewText'].apply(simple_preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
def find_ngrams(input_list, n):
    # Courtesy http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
    ngrams = zip(*[input_list[i:] for i in range(n)])
    flattened_ngrams = list(map(lambda x: '_'.join(x), ngrams))
    return flattened_ngrams

In [12]:
reviews['bigrams'] = reviews['tokens'].apply(lambda x: find_ngrams(x, n=2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
reviews['modeling_text_list'] = reviews['tokens'] + reviews['bigrams']
reviews['modeling_text'] = reviews['modeling_text_list'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
vocabulary = set(itertools.chain.from_iterable(reviews['modeling_text_list']))
vectorizer = CountVectorizer(vocabulary=vocabulary,stop_words='english',min_df=2)

In [15]:
reviews_sample=reviews.sample(1000)

In [30]:
msk = np.random.rand(len(reviews)) < 0.6
train = reviews[msk]
test = reviews[~msk]

In [None]:
X_train = vectorizer.fit_transform(train['modeling_text']).todense()
y_train = train['incentivized']

X_test = vectorizer.transform(test['modeling_text']).todense()
y_test = test['incentivized']

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)
test['preds'] = nb.predict(X_test)
scores = nb.score(X_test, y_test)

In [21]:
scores

0.925531914893617

In [24]:
f1_score(y_test,test['preds'])

0.0

In [25]:
f1_score(y_train,nb.predict(X_train))

1.0

In [28]:
print(classification_report(y_test, nb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.94      0.99      0.96       176
          1       0.00      0.00      0.00        12

avg / total       0.88      0.93      0.90       188



In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer

In [127]:
df=reviews.sample(100)

In [14]:
df['tokens'] = df['ReviewText'].apply(simple_preprocess)

In [15]:
df['tokens']

43481    [thanks, creamy, texture, one, would, think, t...
6570     [this, product, feels, amazing, and, weightles...
62819    [good, product, and, excelllent, protection, f...
12096    [its, really, light, weight, and, absorbs, qui...
9765     [this, stuff, is, amazing, if, you, ever, do, ...
41893    [and, have, had, some, lines, on, my, neck, si...
21377    [this, is, amazing, my, client, noticed, my, f...
12615    [so, have, very, sensitive, skin, dry, combo, ...
26787    [was, excited, to, try, this, product, because...
60898    [can, wear, this, moisturizer, all, over, face...
56342    [have, combination, skin, and, pores, that, ar...
49725    [have, been, using, this, for, months, without...
46725    [received, this, product, free, of, charge, fo...
48506    [tried, this, for, use, with, armani, luminous...
19539    [am, vitamin, freak, this, cream, is, horrible...
40608    [wanted, to, buy, the, full, size, but, though...
17410    [this, is, an, amazing, product, it, hydrating.

In [16]:
df['modeling_text'] = df['tokens'].apply(lambda x: ' '.join(x))

In [23]:
df.iloc[0]['tokens']

['thanks',
 'creamy',
 'texture',
 'one',
 'would',
 'think',
 'this',
 'would',
 'be',
 'very',
 'concealing',
 'right',
 'wrong',
 'this',
 'just',
 'even',
 'out',
 'the',
 'color',
 'and',
 'still',
 'have',
 'dark',
 'circles',
 'still',
 'use',
 'it',
 'to',
 'use',
 'it',
 'as',
 'was',
 'too',
 'lazy',
 'to',
 'return',
 'it',
 'but',
 'never',
 'again']

In [18]:
example=df['modeling_text']

In [29]:
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
dtm = vectorizer.fit_transform(example)  # dtm: Document-Term Matrix
pd.DataFrame(dtm.toarray(), index=example, columns=vectorizer.get_feature_names()).shape

(100, 1058)

In [24]:
vectorizer.get_feature_names()

['able',
 'absolute',
 'absolutely',
 'absorb',
 'absorbing',
 'absorbs',
 'accentuate',
 'accepts',
 'accidentally',
 'acid',
 'acne',
 'actually',
 'add',
 'added',
 'adding',
 'adjusted',
 'advise',
 'afraid',
 'ago',
 'air',
 'alot',
 'altering',
 'amazing',
 'amazingly',
 'anti',
 'anymore',
 'anyways',
 'appear',
 'appearance',
 'appeared',
 'appearing',
 'appears',
 'application',
 'applied',
 'applies',
 'apply',
 'applying',
 'appreciate',
 'area',
 'areas',
 'aren',
 'argan',
 'armani',
 'asked',
 'assume',
 'austin',
 'awake',
 'away',
 'awesome',
 'awful',
 'ba',
 'baby',
 'bad',
 'badly',
 'bag',
 'balling',
 'balm',
 'bank',
 'bareminerals',
 'basic',
 'bathroom',
 'bb',
 'beat',
 'beautiful',
 'bed',
 'began',
 'believe',
 'benefits',
 'best',
 'better',
 'beware',
 'big',
 'bigger',
 'bit',
 'blemish',
 'blend',
 'blends',
 'blot',
 'blotchy',
 'blush',
 'bobbi',
 'bomb',
 'book',
 'boost',
 'booster',
 'bother',
 'bottle',
 'bottles',
 'bought',
 'box',
 'brand',
 'bre

In [33]:
lsa = TruncatedSVD(10, algorithm = 'randomized')
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa

array([[ 5.05344890e-01, -3.55457646e-01,  5.29088302e-01,
         2.79271193e-01, -7.83178067e-01, -7.42433184e-01,
         6.86799338e-01,  6.53308688e-01, -1.14898186e-01,
        -1.65619260e-01],
       [ 4.54812515e-01, -7.43246848e-01,  1.03592960e+00,
         6.42630990e-02, -7.05851190e-01,  2.07558643e-01,
        -7.47786567e-01, -9.46325917e-02,  5.12644941e-01,
         4.36645345e-01],
       [ 5.52280955e-01, -2.40475545e-03,  3.32047300e-01,
         7.26546309e-01,  1.20179971e-01,  1.03763209e-01,
        -2.29505664e-01, -1.31183748e-01,  1.30765763e-01,
        -1.52684038e-01],
       [ 2.56503889e+00, -2.20964834e+00, -8.78035570e-02,
         1.97305597e-01,  2.33023992e-01, -1.68051991e+00,
        -1.11026608e+00, -4.71422568e-01, -2.65254439e-01,
        -1.87572986e-01],
       [ 1.23831075e+00,  2.73779702e-01,  1.03052049e+00,
         5.59393768e-01,  6.54685444e-01, -3.95763672e-01,
        -3.48470682e-01, -6.02015040e-01, -2.22053398e-02,
         2.

In [28]:

pd.DataFrame(lsa.components_.round(5),index = ["component_1","component_2",'component_3','component_4','component_5'],
             columns = vectorizer.get_feature_names())

Unnamed: 0,able,absolute,absolutely,absorb,absorbing,absorbs,accentuate,accepts,accidentally,acid,...,wrong,years,yes,yielded,youd,youthful,zero,zit,zone,über
component_1,0.02486,0.00087,0.00338,0.01178,0.00277,0.01828,0.00296,0.00133,0.00045,0.00271,...,0.00604,0.00062,0.00304,0.00259,0.00164,0.00304,0.00353,0.00212,0.05868,0.04972
component_2,0.02778,0.00011,-0.0093,-0.02121,-0.00544,-0.03176,-0.00538,-0.0031,-0.00105,-0.00685,...,-0.01024,-0.00154,-0.00892,-0.00543,-0.00392,-0.00892,-0.00924,-0.00261,0.03717,0.05556
component_3,-0.0097,0.00443,0.00626,0.0138,-0.00556,-0.0049,0.04408,-0.00662,0.00454,0.00455,...,0.01084,0.00455,-0.00367,0.01651,-0.00489,-0.00367,-0.01402,0.00751,-0.03303,-0.0194
component_4,-0.00619,0.00632,-0.00408,0.03742,-0.0069,0.02608,-0.03512,-0.00201,-0.0047,0.03302,...,0.04235,0.00315,-0.00712,-0.01824,-0.00149,-0.00712,-0.00575,0.0063,-0.02371,-0.01239
component_5,-0.00355,0.00386,0.00096,0.00338,0.00408,0.07412,0.0025,0.00029,0.00133,0.02078,...,-0.03632,-0.00052,-0.00045,-0.0281,-0.00133,-0.00045,0.01227,-0.01749,0.05331,-0.00711


In [36]:
topic_probs=pd.DataFrame(dtm_lsa.round(5), index = example, 
             columns =  ["component_1","component_2",'component_3','component_4','component_5',"component_6","component_7",'component_8','component_9','component_10'])

In [212]:
dtm_lsa

array([[ 5.05344890e-01, -3.55457646e-01,  5.29088302e-01,
         2.79271193e-01, -7.83178067e-01, -7.42433184e-01,
         6.86799338e-01,  6.53308688e-01, -1.14898186e-01,
        -1.65619260e-01],
       [ 4.54812515e-01, -7.43246848e-01,  1.03592960e+00,
         6.42630990e-02, -7.05851190e-01,  2.07558643e-01,
        -7.47786567e-01, -9.46325917e-02,  5.12644941e-01,
         4.36645345e-01],
       [ 5.52280955e-01, -2.40475545e-03,  3.32047300e-01,
         7.26546309e-01,  1.20179971e-01,  1.03763209e-01,
        -2.29505664e-01, -1.31183748e-01,  1.30765763e-01,
        -1.52684038e-01],
       [ 2.56503889e+00, -2.20964834e+00, -8.78035570e-02,
         1.97305597e-01,  2.33023992e-01, -1.68051991e+00,
        -1.11026608e+00, -4.71422568e-01, -2.65254439e-01,
        -1.87572986e-01],
       [ 1.23831075e+00,  2.73779702e-01,  1.03052049e+00,
         5.59393768e-01,  6.54685444e-01, -3.95763672e-01,
        -3.48470682e-01, -6.02015040e-01, -2.22053398e-02,
         2.

In [37]:
lsa.explained_variance_ratio_

array([0.16883547, 0.06511754, 0.04117337, 0.03199046, 0.02702805,
       0.02633162, 0.02495997, 0.02270437, 0.02133748, 0.02045833])

In [38]:
topic_probs['max'] = topic_probs.idxmax(axis=1)

In [41]:
topic_probs

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,max
modeling_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
thanks creamy texture one would think this would be very concealing right wrong this just even out the color and still have dark circles still use it to use it as was too lazy to return it but never again,0.50534,-0.35546,0.52909,0.27927,-0.78318,-0.74243,0.68680,0.65331,-0.11490,-0.16562,component_7
this product feels amazing and weightless it leaves dewy look and looks very natural would recommend it to someone who is looking for light coverage weightless foundation,0.45481,-0.74325,1.03593,0.06426,-0.70585,0.20756,-0.74779,-0.09463,0.51264,0.43665,component_3
good product and excelllent protection for your face absorbs quickly,0.55228,-0.00240,0.33205,0.72655,0.12018,0.10376,-0.22951,-0.13118,0.13077,-0.15268,component_4
its really light weight and absorbs quickly into the skin yet it really moisturizes your skin my mom uses it my sister uses this and use this face cream ever since it came out its perfect for all skin types you can go wrong with this face cream also love its rose scent,2.56504,-2.20965,-0.08780,0.19731,0.23302,-1.68052,-1.11027,-0.47142,-0.26525,-0.18757,component_1
this stuff is amazing if you ever do treatment on your face you ll know how bad the peeling looks and if you get sunburnt really badly also this will save you always put on my face makeup after and it made the application so perfect must try,1.23831,0.27378,1.03052,0.55939,0.65469,-0.39576,-0.34847,-0.60202,-0.02221,0.22606,component_1
and have had some lines on my neck since my which assume are due to tech neck ve always been self conscious about them but didn really think there was much to do about it read bunch of reviews on neck cream and finally decided on this one even though there were some negative reviews no regrets this product is awesome did notice some pilling the first few time used it but just lightly brushed them off with dry face cloth ve now learned the key to not having pilling is to only use very small amount of this product leaves your skin feeling very soft and ve noticed significant decrease in the lines on my neck don get me wrong they are still there but are less defined and are fading going to recommend this product to everyone and continue to use it for these awesome results,3.21302,-1.51716,1.39323,4.91562,-3.25850,0.29575,-1.61874,3.72312,0.01169,1.10346,component_4
this is amazing my client noticed my face and asked what ve been using and told her this little jar from sephora and love it my face is clearer and is gone,1.30616,0.36855,0.74006,0.90484,0.20126,-0.37335,-0.27696,-0.82173,-0.12296,-0.20053,component_1
so have very sensitive skin dry combo got sample and have been using it for almost week pros absorbs quickly skin feels very very nice smells like book store maybe not pro for everyone but like it cons expensive absorbs so quickly that you can tell if you ve missed an area of your face made my face bit oilier when woke up why it didn work for me it made me tad greasy but nothing too crazy however ve started to break out not fun my skin is just too sensitive guess,3.98520,-1.77127,-0.55762,0.67707,0.21996,0.37154,0.46709,-1.00763,-1.02577,-2.46403,component_1
was excited to try this product because there was alot of hype around this brand while do like some of their other products like the face gloss and hydrating oil stick wasnt huge fan of this didn hate it but didnt love it didnt really do much for me there is an intstant cooling feeling when you apply it but dont see any differnece in my skin or under eyes when applied,1.70740,-1.21559,0.95122,0.30030,0.30747,0.23529,-0.50912,-0.01682,0.25093,-1.11697,component_1
can wear this moisturizer all over face too much shine and sparkle for me it does have tiny particles of shimmer but do like using it top of cheekbones for glow luminosity looks good as highlighter worn alone also blend cream highlighter on top of brighter days for even more intensity apply before foundation or foundation powder also like to apply just minimal amount after concealer under eyes with ring finger under eyes look so moisturized and dewy looking really brightens wide awake looking and does not accentuate my lines which is great but can see little sparkle which don care for but not extreme the dewy moisturizing look does not last all day and applying it again below eyes over makeup end of day is difficult to blend like the look for what using it for inspire of the sparkles really wish tarte would have eliminated the sparkles in this highlighting moisturizer,3.08179,-2.14333,7.47298,-4.49921,0.00041,4.41270,-1.82186,-1.15421,-0.94145,-0.08856,component_3


In [44]:
for i in range(1,11):
    topic = 'component_{}'.format(i)
    topic_probs[topic_probs['max'] == topic].reset_index()['modeling_text'].to_csv("output/{}.csv".format(topic), index=False)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
pd.DataFrame(cosine_similarity(dtm_lsa,dtm_lsa).round(6), columns =example, index = example)

In [216]:
topic_probs

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,max
modeling_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
thanks creamy texture one would think this would be very concealing right wrong this just even out the color and still have dark circles still use it to use it as was too lazy to return it but never again,0.50534,-0.35546,0.52909,0.27927,-0.78318,-0.74243,0.68680,0.65331,-0.11490,-0.16562,component_7
this product feels amazing and weightless it leaves dewy look and looks very natural would recommend it to someone who is looking for light coverage weightless foundation,0.45481,-0.74325,1.03593,0.06426,-0.70585,0.20756,-0.74779,-0.09463,0.51264,0.43665,component_3
good product and excelllent protection for your face absorbs quickly,0.55228,-0.00240,0.33205,0.72655,0.12018,0.10376,-0.22951,-0.13118,0.13077,-0.15268,component_4
its really light weight and absorbs quickly into the skin yet it really moisturizes your skin my mom uses it my sister uses this and use this face cream ever since it came out its perfect for all skin types you can go wrong with this face cream also love its rose scent,2.56504,-2.20965,-0.08780,0.19731,0.23302,-1.68052,-1.11027,-0.47142,-0.26525,-0.18757,component_1
this stuff is amazing if you ever do treatment on your face you ll know how bad the peeling looks and if you get sunburnt really badly also this will save you always put on my face makeup after and it made the application so perfect must try,1.23831,0.27378,1.03052,0.55939,0.65469,-0.39576,-0.34847,-0.60202,-0.02221,0.22606,component_1
and have had some lines on my neck since my which assume are due to tech neck ve always been self conscious about them but didn really think there was much to do about it read bunch of reviews on neck cream and finally decided on this one even though there were some negative reviews no regrets this product is awesome did notice some pilling the first few time used it but just lightly brushed them off with dry face cloth ve now learned the key to not having pilling is to only use very small amount of this product leaves your skin feeling very soft and ve noticed significant decrease in the lines on my neck don get me wrong they are still there but are less defined and are fading going to recommend this product to everyone and continue to use it for these awesome results,3.21302,-1.51716,1.39323,4.91562,-3.25850,0.29575,-1.61874,3.72312,0.01169,1.10346,component_4
this is amazing my client noticed my face and asked what ve been using and told her this little jar from sephora and love it my face is clearer and is gone,1.30616,0.36855,0.74006,0.90484,0.20126,-0.37335,-0.27696,-0.82173,-0.12296,-0.20053,component_1
so have very sensitive skin dry combo got sample and have been using it for almost week pros absorbs quickly skin feels very very nice smells like book store maybe not pro for everyone but like it cons expensive absorbs so quickly that you can tell if you ve missed an area of your face made my face bit oilier when woke up why it didn work for me it made me tad greasy but nothing too crazy however ve started to break out not fun my skin is just too sensitive guess,3.98520,-1.77127,-0.55762,0.67707,0.21996,0.37154,0.46709,-1.00763,-1.02577,-2.46403,component_1
was excited to try this product because there was alot of hype around this brand while do like some of their other products like the face gloss and hydrating oil stick wasnt huge fan of this didn hate it but didnt love it didnt really do much for me there is an intstant cooling feeling when you apply it but dont see any differnece in my skin or under eyes when applied,1.70740,-1.21559,0.95122,0.30030,0.30747,0.23529,-0.50912,-0.01682,0.25093,-1.11697,component_1
can wear this moisturizer all over face too much shine and sparkle for me it does have tiny particles of shimmer but do like using it top of cheekbones for glow luminosity looks good as highlighter worn alone also blend cream highlighter on top of brighter days for even more intensity apply before foundation or foundation powder also like to apply just minimal amount after concealer under eyes with ring finger under eyes look so moisturized and dewy looking really brightens wide awake looking and does not accentuate my lines which is great but can see little sparkle which don care for but not extreme the dewy moisturizing look does not last all day and applying it again below eyes over makeup end of day is difficult to blend like the look for what using it for inspire of the sparkles really wish tarte would have eliminated the sparkles in this highlighting moisturizer,3.08179,-2.14333,7.47298,-4.49921,0.00041,4.41270,-1.82186,-1.15421,-0.94145,-0.08856,component_3


In [220]:
from sklearn.cluster import KMeans, MiniBatchKMeans
cluster_count = 3
km = MiniBatchKMeans(n_clusters=cluster_count)
#km = KMeans(n_clusters=cluster_count,random_state=3)

lsa_clusters=km.fit_predict(dtm_lsa)

In [230]:
topic_probs['cluster']=lsa_clusters
topic_probs=topic_probs.reset_index()

In [233]:
topic_probs

Unnamed: 0,modeling_text,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,max,cluster
0,thanks creamy texture one would think this wou...,0.50534,-0.35546,0.52909,0.27927,-0.78318,-0.74243,0.68680,0.65331,-0.11490,-0.16562,component_7,1
1,this product feels amazing and weightless it l...,0.45481,-0.74325,1.03593,0.06426,-0.70585,0.20756,-0.74779,-0.09463,0.51264,0.43665,component_3,1
2,good product and excelllent protection for you...,0.55228,-0.00240,0.33205,0.72655,0.12018,0.10376,-0.22951,-0.13118,0.13077,-0.15268,component_4,1
3,its really light weight and absorbs quickly in...,2.56504,-2.20965,-0.08780,0.19731,0.23302,-1.68052,-1.11027,-0.47142,-0.26525,-0.18757,component_1,0
4,this stuff is amazing if you ever do treatment...,1.23831,0.27378,1.03052,0.55939,0.65469,-0.39576,-0.34847,-0.60202,-0.02221,0.22606,component_1,1
5,and have had some lines on my neck since my wh...,3.21302,-1.51716,1.39323,4.91562,-3.25850,0.29575,-1.61874,3.72312,0.01169,1.10346,component_4,0
6,this is amazing my client noticed my face and ...,1.30616,0.36855,0.74006,0.90484,0.20126,-0.37335,-0.27696,-0.82173,-0.12296,-0.20053,component_1,1
7,so have very sensitive skin dry combo got samp...,3.98520,-1.77127,-0.55762,0.67707,0.21996,0.37154,0.46709,-1.00763,-1.02577,-2.46403,component_1,0
8,was excited to try this product because there ...,1.70740,-1.21559,0.95122,0.30030,0.30747,0.23529,-0.50912,-0.01682,0.25093,-1.11697,component_1,1
9,can wear this moisturizer all over face too mu...,3.08179,-2.14333,7.47298,-4.49921,0.00041,4.41270,-1.82186,-1.15421,-0.94145,-0.08856,component_3,1


## Tokenize sentences

In [54]:
from nltk.tokenize import sent_tokenize
import re
import string

In [128]:
df['sentence']=df['ReviewText'].apply(sent_tokenize)

In [130]:
# for i in range(len(df['sentence'])):
#     df['sentence'][:]
#     for j in 
df=df.reset_index()

In [133]:
review_ids = []
review_sentences = []
for review in df.as_matrix():
    curr_review_id = review[0]
    curr_review_sentences = review[-1]
    
    # Divide long sentences even longer if possible!
    review_ids += [curr_review_id] * len(curr_review_sentences)
    review_sentences += curr_review_sentences
    
df_review_sentences = pd.DataFrame({'review_id': review_ids, 'sentence': review_sentences})
df_review_sentences.sample(10)


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,review_id,sentence
112,58664,"It also caused me to break out a bit, but that..."
431,37652,So I tried this oil (skeptical) and I can't te...
234,56307,Unfortunately I didn't find that it did much o...
286,49331,It's a dry oïl.
405,50686,I give it 4 stars not 5 because of packaging.
127,40383,I wasn't expecting much from Moonf Fruit and h...
148,61759,He bought it and told me to use it without or ...
284,49331,Love love love that primer.
350,48925,"Replacing it at that rate,would cost almost $2..."
544,5001,"I purchased the small jar of this moisturizer,..."


In [137]:
df_review_sentences['sentence']=df_review_sentences['sentence'].apply(lambda x: re.sub('[%s]'% re.escape(string.punctuation),'',x))

In [147]:
import gensim
import numpy as np
import nltk
import os
from nltk.corpus import stopwords

nltk.download('punkt')

stopword = stopwords.words('english')
stopword += ['?','!','.',',',':',';']

[nltk_data] Downloading package punkt to /Users/Minmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [166]:
def chapter_reader():
    for i in list(df['ReviewText']):
        txt = i
        yield (x for x in 
            gensim.utils.tokenize(txt, lowercase=True, deacc=True, 
                                  errors="ignore")
            if x not in stopword)
corpus = []
dictionary = gensim.corpora.Dictionary(chapter_reader())
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
for values in chapter_reader():
    corpus.append(dictionary.doc2bow(values))

In [167]:
gensim.corpora.MmCorpus.serialize('dgray_corpus.mm', corpus)
corpus = gensim.corpora.MmCorpus('dgray_corpus.mm')

In [169]:
print(corpus[0])

[(0, 3.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 2.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 2.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 3.0), (31, 2.0), (32, 1.0), (33, 1.0), (34, 1.0), (35, 1.0), (36, 1.0), (37, 4.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 1.0), (42, 1.0), (43, 1.0), (44, 1.0), (45, 1.0), (46, 1.0), (47, 2.0)]


In [159]:
dictionary = gensim.corpora.Dictionary(chapter_reader())

In [161]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1a21c67a58>

In [170]:
for i,vector in enumerate(corpus):
    most_index, most_count = max(vector, key=lambda item: item[1])
    print( "Chapter " + str(i+1) + " most used word: ",end='')
    print( dictionary[most_index], most_count)

Chapter 1 most used word: skin 4.0
Chapter 2 most used word: sample 2.0
Chapter 3 most used word: think 1.0
Chapter 4 most used word: skin 3.0
Chapter 5 most used word: skin 1.0
Chapter 6 most used word: oily 2.0
Chapter 7 most used word: skin 3.0
Chapter 8 most used word: face 2.0
Chapter 9 most used word: skin 2.0
Chapter 10 most used word: much 2.0
Chapter 11 most used word: good 1.0
Chapter 12 most used word: product 9.0
Chapter 13 most used word: skin 3.0
Chapter 14 most used word: face 1.0
Chapter 15 most used word: skin 2.0
Chapter 16 most used word: skin 3.0
Chapter 17 most used word: skin 2.0
Chapter 18 most used word: product 2.0
Chapter 19 most used word: skin 4.0
Chapter 20 most used word: much 5.0
Chapter 21 most used word: skin 3.0
Chapter 22 most used word: always 2.0
Chapter 23 most used word: foundation 5.0
Chapter 24 most used word: face 1.0
Chapter 25 most used word: skin 2.0
Chapter 26 most used word: like 2.0
Chapter 27 most used word: using 4.0
Chapter 28 most use

In [171]:
tfidf = gensim.models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]
lsi_tfidf = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
lsi_tfidf.print_topics()

[(0,
  '0.169*"dry" + 0.148*"moisturizer" + 0.133*"love" + 0.132*"use" + 0.131*"like" + 0.126*"great" + 0.125*"product" + 0.122*"face" + 0.117*"skin" + 0.116*"make"'),
 (1,
  '-0.573*"sensitive" + -0.354*"dry" + -0.243*"perfect" + -0.210*"good" + -0.180*"oil" + -0.166*"moisturizer" + -0.131*"tender" + 0.125*"use" + -0.124*"soft" + 0.115*"size"'),
 (2,
  '-0.296*"amazing" + -0.208*"smells" + 0.205*"sensitive" + -0.197*"moisturizer" + 0.191*"foundation" + 0.183*"dry" + 0.139*"size" + -0.130*"worth" + -0.120*"greasy" + 0.119*"primer"'),
 (3,
  '0.489*"great" + 0.263*"many" + 0.231*"cleaning" + 0.173*"uses" + 0.162*"works" + 0.142*"products" + 0.121*"face" + -0.115*"smell" + -0.112*"sensitive" + 0.107*"get"'),
 (4,
  '-0.235*"smells" + -0.219*"light" + 0.158*"long" + -0.154*"texture" + 0.150*"hydrated" + -0.138*"good" + 0.135*"soft" + -0.135*"like" + 0.124*"uses" + 0.121*"time"')]

In [177]:
lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=5)
lda.print_topics()

[(0,
  '0.013*"skin" + 0.010*"face" + 0.009*"moisturizer" + 0.008*"product" + 0.008*"tried" + 0.006*"many" + 0.006*"products" + 0.006*"love" + 0.006*"works" + 0.005*"foundation"'),
 (1,
  '0.032*"skin" + 0.013*"product" + 0.012*"dry" + 0.009*"use" + 0.009*"oil" + 0.009*"face" + 0.008*"like" + 0.008*"love" + 0.007*"would" + 0.006*"moisturizer"'),
 (2,
  '0.032*"skin" + 0.014*"like" + 0.014*"product" + 0.012*"use" + 0.011*"face" + 0.008*"really" + 0.008*"make" + 0.007*"makeup" + 0.006*"day" + 0.006*"little"'),
 (3,
  '0.022*"skin" + 0.015*"product" + 0.011*"moisturizer" + 0.010*"would" + 0.007*"acne" + 0.006*"really" + 0.006*"definitely" + 0.006*"like" + 0.006*"also" + 0.005*"much"'),
 (4,
  '0.036*"skin" + 0.015*"love" + 0.012*"product" + 0.011*"using" + 0.011*"like" + 0.010*"dry" + 0.009*"oily" + 0.008*"face" + 0.008*"moisturizer" + 0.007*"make"')]

In [174]:
for i in range(0,len(corpus)):
    print ("Dominant Topics for review " + str(i+1) +": ",end='')
    print (lda[corpus[i]])

Dominant Topics for review 1: [(0, 0.98650205)]
Dominant Topics for review 2: [(0, 0.9688467)]
Dominant Topics for review 3: [(0, 0.022437952), (1, 0.022442732), (2, 0.022394856), (3, 0.91011786), (4, 0.022606587)]
Dominant Topics for review 4: [(3, 0.96863097)]
Dominant Topics for review 5: [(0, 0.034108195), (1, 0.033663604), (2, 0.033543896), (3, 0.864773), (4, 0.033911366)]
Dominant Topics for review 6: [(0, 0.9846636)]
Dominant Topics for review 7: [(3, 0.9855229)]
Dominant Topics for review 8: [(0, 0.013603448), (1, 0.013507923), (2, 0.013506315), (3, 0.94576573), (4, 0.013616545)]
Dominant Topics for review 9: [(0, 0.9645026)]
Dominant Topics for review 10: [(0, 0.96876276)]
Dominant Topics for review 11: [(0, 0.03416177), (1, 0.0338393), (2, 0.033679333), (3, 0.8640258), (4, 0.03429385)]
Dominant Topics for review 12: [(1, 0.044107363), (4, 0.9540736)]
Dominant Topics for review 13: [(4, 0.970969)]
Dominant Topics for review 14: [(0, 0.025446137), (1, 0.8982282), (2, 0.02518639

In [209]:
df['ReviewText'][14]

"This is my holy grail, ride-or-die, toner! I it brightens and evens my skin so well, I can't go without it. I forgot it at my apartment while I was visiting my parents and I could tell a definite difference in the the texture and appearance of my skin, even with all my other skincare products. I really, can't recommend this product highly enough."

## Keras

In [178]:
model = gensim.models.KeyedVectors.load_word2vec_format(, binary=True)
embedding_matrix = model.syn0
# Filter out words with index not in w2v range
word_to_index = dict([(k, v.index) for k, v in model.vocab.items()])

Unnamed: 0,index,ReviewText,incentivized,sentence
0,30673,"I bought the .5 oz of this product. I'm\n22, h...",0,"[I bought the .5 oz of this product., I'm\n22,..."
1,10541,I had a sample sized tube of this product and ...,0,[I had a sample sized tube of this product and...
2,49738,It worked but it's not like omg great. Persona...,0,"[It worked but it's not like omg great., Perso..."
3,56131,Great product for combo skin! My skin feels s...,0,"[Great product for combo skin!, My skin feels ..."
4,10718,"I have sensitive ,dry skin and this moisturize...",0,"[I have sensitive ,dry skin and this moisturiz..."
5,41527,I have very oily skin and am super careful abo...,0,[I have very oily skin and am super careful ab...
6,19954,I have extremely sensitive and acne prone skin...,0,[I have extremely sensitive and acne prone ski...
7,48374,It was a very nice moisturizer but it didn't l...,0,[It was a very nice moisturizer but it didn't ...
8,19944,I love this item too. Usually I can't tell if ...,0,"[I love this item too., Usually I can't tell i..."
9,19716,For something that claims to be perfect for se...,1,[For something that claims to be perfect for s...


In [180]:
df['tokens'] = df['ReviewText'].apply(simple_preprocess)

In [184]:
observations = df

In [185]:
observations = observations.sample(frac=1)

In [198]:

embedding_matrix, word_to_index = resources.create_embedding_matrix()

AttributeError: 'DataFrame' object has no attribute 'create_embedding_matrix'

In [186]:
observations

Unnamed: 0,index,ReviewText,incentivized,sentence,tokens
87,41260,I didn't want to like this because it's sooo e...,0,[I didn't want to like this because it's sooo ...,"[didn, want, to, like, this, because, it, sooo..."
55,61325,Its my favorite moisture!!,0,"[Its my favorite moisture!, !]","[its, my, favorite, moisture]"
96,17316,"I have dry/acne-prone skin, so was on the hunt...",0,"[I have dry/acne-prone skin, so was on the hun...","[have, dry, acne, prone, skin, so, was, on, th..."
20,50489,"5 stars for BASIC moisturizer, and its really ...",0,"[5 stars for BASIC moisturizer, and its really...","[stars, for, basic, moisturizer, and, its, rea..."
24,35894,"Wow. I literally never write reviews on here, ...",0,"[Wow., I literally never write reviews on here...","[wow, literally, never, write, reviews, on, he..."
51,55116,I was kind of worried having to spend a lot of...,0,[I was kind of worried having to spend a lot o...,"[was, kind, of, worried, having, to, spend, lo..."
3,56131,Great product for combo skin! My skin feels s...,0,"[Great product for combo skin!, My skin feels ...","[great, product, for, combo, skin, my, skin, f..."
17,58664,"I love the concept of this product, and it is ...",0,"[I love the concept of this product, and it is...","[love, the, concept, of, this, product, and, i..."
19,40383,"Let's be honest, I know a ton of people probab...",0,"[Let's be honest, I know a ton of people proba...","[let, be, honest, know, ton, of, people, proba..."
13,22699,Makes face feel so clean and love the light sceo,0,[Makes face feel so clean and love the light s...,"[makes, face, feel, so, clean, and, love, the,..."


In [183]:
from collections import defaultdict

#default_dict_instance = defaultdict(lambda: word_to_index['UNK'])
#default_dict_instance.update(word_to_index)
word_to_index = default_dict_instance

In [None]:
# Newsgroup20: Convert tokens to indices
    observations['indices'] = observations['tokens'].apply(lambda token_list: map(lambda token: word_to_index[token],
                                                                                  token_list))
    observations['indices'] = observations['indices'].apply(lambda x: numpy.array(x))

    # Newsgroup20: Pad indices list with zeros, so that every article's list of indices is the same length
    observations['padded_indices'] = observations['indices'].apply(lib.pad_sequence)

In [None]:
classification_model = models.gen_conv_model(observations, embedding_matrix, word_to_index)

## LDA

In [253]:
import pandas as pd
import numpy as np
import pickle
import json
from copy import deepcopy
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from gensim.utils import simple_preprocess
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
from gensim.models import word2vec
from gensim import models
from sklearn.preprocessing import Normalizer, normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import csv
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [254]:
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [255]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


In [256]:
df_raw=pd.read_csv('review_table.csv')

In [260]:
df_raw.sample(1000)['ReviewText']

16235     this is seriously just hyped. it broke me out ...
40407     Lightweight, controls oil and does not make me...
26970     These pods are definitely a great purchase. Yo...
2397      I love this formula of moisturizer. I've never...
8725      I have tried this for two years and I grew to ...
90261     I love my custom serum! 1) the fact that you c...
24888     This spray did nothing to set my makeup, and m...
63247     The idea of this product was so exciting and p...
69931     I know that it doesn't smell the greatest, it'...
19369     I never ever write reviews but this is AWESOME...
91146     My skin feels smooth & hydrated. During the co...
84713     I love this! It's very hydrating and absorbs i...
67109     I got far too excited with the buzz about thes...
14962     Got this as a sample with my Sephora points, I...
15189     Really great stuff! i bought the small size, a...
79738     Honestly, I love masks and probably have tried...
63220     All of drunk elephants product

In [261]:
review_list = list(df_raw.sample(1000)['ReviewText'])

In [262]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(review_list))

print(data_words[:1])

[['in', 'love', 'with', 'this', 'beautiful', 'natural', 'glow']]


In [263]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
print(bigram_mod[data_words[0]])

['in', 'love', 'with', 'this', 'beautiful', 'natural', 'glow']


In [264]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [265]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [266]:
len(data_words_bigrams)

1000

In [66]:
import random
data_words_bigrams_sub=random.sample(data_words_bigrams,1000)  

In [267]:
import spacy
nlp = spacy.load('en')

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['love', 'beautiful', 'natural', 'glow']]


In [268]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized 

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [269]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto')
                                           #per_word_topics=True)

In [270]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.046*"makeup" + 0.031*"apply" + 0.026*"wear" + 0.022*"cream" + '
  '0.020*"bottle" + 0.019*"serum" + 0.019*"start" + 0.019*"little" + '
  '0.018*"get" + 0.017*"small"'),
 (1,
  '0.056*"thick" + 0.037*"normal" + 0.031*"glow" + 0.029*"leaf" + '
  '0.029*"beautiful" + 0.028*"everyday" + 0.027*"ball" + 0.023*"harsh" + '
  '0.020*"scrub" + 0.017*"white"'),
 (2,
  '0.052*"big" + 0.039*"store" + 0.038*"next" + 0.036*"soak" + 0.034*"else" + '
  '0.031*"change" + 0.022*"crazy" + 0.020*"zit" + 0.019*"later" + '
  '0.019*"chance"'),
 (3,
  '0.160*"eye" + 0.055*"cream" + 0.046*"cleanser" + 0.037*"provide" + '
  '0.026*"repurchase" + 0.021*"concealer" + 0.019*"dryness" + 0.017*"appear" + '
  '0.017*"firm" + 0.017*"fine_line"'),
 (4,
  '0.037*"quickly" + 0.021*"hormonal" + 0.020*"money" + 0.019*"improvement" + '
  '0.018*"rosacea" + 0.014*"clog" + 0.013*"excited" + 0.012*"residue" + '
  '0.012*"early" + 0.012*"become"'),
 (5,
  '0.061*"not" + 0.059*"spf" + 0.043*"coverage" + 0.031*"tell" + 

In [271]:
##build a matrix with LDA value
for i in range(0,len(corpus)):
    print ("review " + str(i+1) +": ",end='')
    print (lda_model[corpus[i]])

review 1: [(0, 0.073235646), (1, 0.117180966), (2, 0.010042817), (3, 0.015093878), (4, 0.028885769), (5, 0.012302017), (6, 0.023998259), (7, 0.015125177), (9, 0.015133197), (10, 0.3187019), (11, 0.017281123), (12, 0.012066933), (13, 0.05661117), (14, 0.029602936), (16, 0.19958135), (17, 0.011683182), (18, 0.01875277)]
review 2: [(0, 0.07899235), (4, 0.01653833), (6, 0.014195677), (10, 0.22694872), (13, 0.021366168), (14, 0.024209987), (16, 0.52405536), (18, 0.010737634)]
review 3: [(0, 0.030167669), (4, 0.010845554), (5, 0.014681451), (7, 0.027405309), (10, 0.29103017), (13, 0.104838125), (14, 0.01127631), (15, 0.2136527), (16, 0.23941277)]
review 4: [(0, 0.166164), (2, 0.09546312), (3, 0.01039399), (4, 0.019792322), (6, 0.01644346), (7, 0.010363663), (9, 0.010404915), (10, 0.25968623), (11, 0.0118409), (13, 0.10219989), (14, 0.023053098), (16, 0.18081404), (17, 0.039307214), (18, 0.0129046375)]
review 5: [(0, 0.090040445), (3, 0.17329043), (6, 0.21644552), (7, 0.018855672), (10, 0.216

review 354: [(0, 0.16562857), (1, 0.08002655), (4, 0.011585866), (6, 0.08307523), (8, 0.11725983), (10, 0.13509151), (11, 0.029981563), (13, 0.021563042), (14, 0.14103675), (16, 0.16223422)]
review 355: [(0, 0.10262159), (4, 0.046370238), (6, 0.012931033), (10, 0.18865965), (13, 0.03700793), (14, 0.061548036), (16, 0.46307284), (18, 0.010074248)]
review 356: [(0, 0.1293864), (3, 0.15851702), (4, 0.04436731), (6, 0.019298607), (7, 0.015146017), (8, 0.14035727), (10, 0.14293678), (13, 0.029358504), (14, 0.059835814), (16, 0.23406225)]
review 357: [(0, 0.1029801), (4, 0.017100623), (6, 0.1358531), (7, 0.0769528), (10, 0.38053173), (11, 0.010266983), (13, 0.021963524), (14, 0.01773119), (16, 0.15868142), (18, 0.011596795)]
review 358: [(0, 0.040781256), (1, 0.02411797), (4, 0.030416869), (8, 0.022069935), (9, 0.051790953), (10, 0.30090508), (13, 0.0133387875), (14, 0.030805567), (16, 0.36276695), (18, 0.030839188), (19, 0.04722072)]
review 359: [(0, 0.063768156), (3, 0.012274223), (4, 0.02

review 614: [(0, 0.100016795), (3, 0.053205535), (4, 0.022353847), (5, 0.073587604), (7, 0.054366637), (9, 0.0151373185), (10, 0.36195266), (12, 0.024656778), (13, 0.07102409), (16, 0.19623071)]
review 615: [(0, 0.029618107), (2, 0.04654999), (4, 0.031757798), (5, 0.04670104), (9, 0.02706815), (10, 0.2842065), (13, 0.05587726), (14, 0.01092716), (16, 0.3065739), (18, 0.02799938), (19, 0.087987155)]
review 616: [(0, 0.13035816), (1, 0.046801824), (4, 0.019042963), (6, 0.01582087), (7, 0.047832314), (10, 0.2641359), (11, 0.011393422), (13, 0.02596111), (14, 0.07204702), (16, 0.287581), (18, 0.012362782)]
review 617: [(0, 0.060634714), (4, 0.046370246), (6, 0.013840268), (9, 0.040371235), (10, 0.2938778), (13, 0.05181477), (14, 0.01613482), (16, 0.36581627), (18, 0.010463274), (19, 0.03614908)]
review 618: [(0, 0.061509114), (3, 0.012274223), (4, 0.023489682), (5, 0.010003904), (6, 0.019940754), (7, 0.05900162), (9, 0.012320691), (10, 0.388217), (11, 0.014057991), (13, 0.030169439), (14, 

review 761: [(0, 0.042985603), (2, 0.018971385), (4, 0.039471313), (6, 0.08676146), (9, 0.020250577), (10, 0.19666885), (12, 0.050516024), (13, 0.104925565), (16, 0.26251248), (17, 0.019110983), (18, 0.123170674)]
review 762: [(0, 0.04391762), (3, 0.024562573), (8, 0.70198125), (10, 0.07270919), (13, 0.012565059), (14, 0.010025921), (16, 0.067593716)]
review 763: [(0, 0.07933463), (4, 0.026178498), (6, 0.04314662), (9, 0.043710224), (10, 0.2631444), (11, 0.03985285), (13, 0.011667477), (16, 0.42671), (18, 0.023238476)]
review 764: [(0, 0.060765818), (1, 0.053158954), (3, 0.011726569), (4, 0.022441614), (6, 0.01866766), (7, 0.011750886), (9, 0.011755238), (10, 0.33808222), (11, 0.05804405), (13, 0.073438145), (14, 0.02302154), (16, 0.20291887), (18, 0.014569197), (19, 0.05227817)]
review 765: [(0, 0.053352166), (3, 0.010011002), (4, 0.018348278), (6, 0.015245568), (10, 0.17621534), (11, 0.010977829), (13, 0.23242864), (14, 0.119543344), (16, 0.28074786), (18, 0.011911785)]
review 766: [

review 948: [(0, 0.08762696), (2, 0.1312807), (3, 0.05910477), (4, 0.043571375), (6, 0.011198353), (8, 0.029148854), (10, 0.29759195), (11, 0.035867892), (13, 0.019189859), (14, 0.040663175), (15, 0.031228002), (16, 0.1642329)]
review 949: [(0, 0.058138214), (2, 0.026453646), (4, 0.07714348), (6, 0.0111891385), (8, 0.024554595), (10, 0.37668917), (13, 0.014564842), (14, 0.011680005), (16, 0.34143)]
review 950: [(0, 0.052704524), (3, 0.049693108), (4, 0.019792322), (6, 0.016717164), (7, 0.010363663), (9, 0.010368212), (10, 0.485249), (11, 0.011845051), (13, 0.06659546), (14, 0.021956088), (16, 0.18580443), (18, 0.012855968)]
review 951: [(0, 0.069504075), (3, 0.01281015), (4, 0.023489682), (5, 0.010003904), (6, 0.021556169), (7, 0.0122996755), (9, 0.012316235), (10, 0.22449699), (11, 0.014053785), (13, 0.030169439), (14, 0.02408495), (16, 0.4267407), (18, 0.06195156)]
review 952: [(0, 0.131082), (2, 0.022080667), (3, 0.03536867), (4, 0.02796293), (9, 0.037289776), (10, 0.22299403), (11,

In [76]:
lda_list=[]
for i in range(0,len(corpus)):
    
    lda_list.append(lda_model[corpus[i]])

In [84]:
for i in lda_list[0]:
    print (i[0])

0
1
2
3
4
17
18
19


In [238]:
ldas=[]
for i in range(len(lda_list)):
    top_ids=[]
    lda_values=[]
    lda_matrix=[]
    for j in lda_list[i]:
        top_ids = j[0]
        lda_values=j[1]
        lda_value={'top_id':top_ids,'lda_value':lda_values,'reviewid':i}
        lda_matrix.append(lda_value)
    ldas.append(lda_matrix)

In [153]:
ldas

[[{'lda_value': 0.121129796, 'reviewid': 0, 'top_id': 0},
  {'lda_value': 0.010241036, 'reviewid': 0, 'top_id': 1},
  {'lda_value': 0.025099335, 'reviewid': 0, 'top_id': 2},
  {'lda_value': 0.010653764, 'reviewid': 0, 'top_id': 3},
  {'lda_value': 0.37031358, 'reviewid': 0, 'top_id': 4},
  {'lda_value': 0.0273425, 'reviewid': 0, 'top_id': 17},
  {'lda_value': 0.3439503, 'reviewid': 0, 'top_id': 18},
  {'lda_value': 0.01985244, 'reviewid': 0, 'top_id': 19}],
 [{'lda_value': 0.1799856, 'reviewid': 1, 'top_id': 0},
  {'lda_value': 0.014693246, 'reviewid': 1, 'top_id': 2},
  {'lda_value': 0.019412635, 'reviewid': 1, 'top_id': 3},
  {'lda_value': 0.27304426, 'reviewid': 1, 'top_id': 4},
  {'lda_value': 0.017864536, 'reviewid': 1, 'top_id': 8},
  {'lda_value': 0.318818, 'reviewid': 1, 'top_id': 10},
  {'lda_value': 0.017802484, 'reviewid': 1, 'top_id': 16},
  {'lda_value': 0.017859906, 'reviewid': 1, 'top_id': 17},
  {'lda_value': 0.1046483, 'reviewid': 1, 'top_id': 18}],
 [{'lda_value': 0.2

In [239]:
from itertools import chain
df=pd.DataFrame(list(chain.from_iterable(ldas)))

In [187]:
df.set_index('reviewid',inplace=True)

In [192]:
df=df.stack()

In [205]:
df

Unnamed: 0,lda_value,reviewid,top_id
0,0.121130,0,0
1,0.010241,0,1
2,0.025099,0,2
3,0.010654,0,3
4,0.370314,0,4
5,0.027343,0,17
6,0.343950,0,18
7,0.019852,0,19
8,0.179986,1,0
9,0.014693,1,2


In [240]:
table = pd.pivot_table(df, values='lda_value', index=['reviewid'],
                  columns=['top_id'], aggfunc=np.sum,fill_value=0)

In [241]:
table.head(2)

top_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
reviewid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,0.12113,0.010241,0.025099,0.010654,0.370314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027343,0.34395,0.019852
1,0.179986,0.0,0.014693,0.019413,0.273044,0.0,0.0,0.0,0.017865,0.0,0.318818,0.0,0.0,0.0,0.0,0.0,0.017802,0.01786,0.104648,0.0


In [242]:
table['id']=list(df_raw['review_id'])[:1000]

In [243]:
table['incentizied']=list(df_raw['incentivized'])[:1000]

In [246]:
table.incentizied.value_counts()

0    962
1     38
Name: incentizied, dtype: int64

In [251]:
table.to_pickle('lda_table.pickle')

In [247]:
y = table['incentizied']
X= table.iloc[:,:-2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,random_state=42)

In [249]:
nb = GaussianNB()
nb.fit(X_train, y_train)
print(roc_auc_score(y_train, nb.predict(X_train)))
print(roc_auc_score(y_test, nb.predict(X_test)))

0.712495896706423
0.5030090819564504


In [252]:
nb.predict(X_test)

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,

In [273]:
import pyLDAvis, pyLDAvis.sklearn
from IPython.display import display
from sklearn import datasets

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
