In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import strip_punctuation

from sklearn.decomposition import PCA

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import load_model

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(True)

Using TensorFlow backend.


## 01. Build your own embeddings & using pre-existing embeddings

### Build your own embeddings

In [2]:
sentences = [["this", "is", "the", "first", "sentence", "for", "word2vec"],
             ["this", "is", "the", "second", "sentence", "for", "word2vec"]]

In [3]:
# sg: 1 for skip-gram, 2 for CBOW
# alpha: learning rate
# window: windows size for skip-grams or CBOW
# min_count: any word counts that are less than this are ignored
# size: dimensions for each word
model = Word2Vec(sentences, alpha=0.025, window=5, min_count=1, sg=1, size=10)

In [4]:
print(model)

Word2Vec(vocab=8, size=10, alpha=0.025)


In [5]:
print(list(model.wv.vocab))

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second']


In [6]:
model.wv[model.wv.vocab]

array([[-0.00881377,  0.02380548, -0.0276916 ,  0.00943281,  0.0353776 ,
         0.04147253, -0.00760771,  0.03068272,  0.03813342,  0.04655814],
       [ 0.02699806,  0.04353414,  0.02711135,  0.02182131,  0.03855942,
        -0.01136886,  0.01113139,  0.00316165,  0.0324847 ,  0.02678204],
       [-0.02763442, -0.04942229,  0.03640752, -0.04303996,  0.03313507,
         0.00819083,  0.04979353,  0.00522264,  0.03980155, -0.0078884 ],
       [ 0.02680141, -0.0432024 ,  0.00554856,  0.00348457, -0.00249102,
        -0.00375387,  0.04069378,  0.02697994, -0.03181545, -0.03122688],
       [-0.01044888,  0.01529652,  0.02926535, -0.03910768,  0.01157004,
         0.04935143,  0.02350862,  0.04925704,  0.01833132,  0.02647658],
       [ 0.0018603 , -0.01861919, -0.0279343 ,  0.04353117,  0.04889671,
         0.02309764,  0.01833814, -0.00708604, -0.01628845,  0.00257618],
       [-0.00298509,  0.02508593,  0.01865868, -0.00970567, -0.03590273,
         0.01946534,  0.00823023,  0.02897257

In [7]:
model.save('model.bin')

In [8]:
model = Word2Vec.load('model.bin')

In [9]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(model.wv[model.wv.vocab])

In [10]:
pca.explained_variance_ratio_

array([0.41189152, 0.24735574, 0.17814666], dtype=float32)

In [11]:
pca_result_df = pd.DataFrame([pca_result[:,0], pca_result[:,1], pca_result[:,2]]).T
pca_result_df.columns = ["x", "y", "z"]
pca_result_df["word"] = list(model.wv.vocab)

In [12]:
pca_result_df

Unnamed: 0,x,y,z,word
0,0.076952,-0.000653,0.005211,this
1,0.05871,0.013753,0.001054,is
2,-0.037101,-0.039932,0.066518,the
3,-0.06466,0.035363,-0.015182,first
4,0.019333,-0.05867,-0.009261,sentence
5,0.014208,0.071526,0.021258,for
6,-0.000948,-0.019166,-0.05687,word2vec
7,-0.066495,-0.002221,-0.012729,second


In [13]:
trace1 = go.Scatter3d(x=pca_result_df["x"], y=pca_result_df["y"], z=pca_result_df["z"],
                      mode='markers', marker={"size": 4, "opacity": 0.8},
                      text=pca_result_df["word"])
data = [trace1]

layout = go.Layout(margin={"l": 0, "r": 0, "b": 0, "t": 0}, 
                   scene={"xaxis": {"title": "1st Principle Component"}, 
                          "yaxis": {"title": "2nd Principle Component"}, 
                          "zaxis": {"title": "3rd Principle Component"},
                          "annotations": [{"showarrow": False,
                                           "x": row["x"],
                                           "y": row["y"],
                                           "z": row["z"],
                                           "text": row["word"],
                                           "xanchor": "left",
                                           "xshift": 5,
                                           "opacity": 1} for _, row in pca_result_df.iterrows()]})

iplot(go.Figure(data=data, layout=layout))

### Use Stanford NLP Group's Glove Vectors

In [4]:
# 6B tokens, 400K vocab, uncased, 50d vectors
glove_vectors = KeyedVectors.load_word2vec_format('./vectors/word2vec.6B.50d.txt')

In [5]:
print(list(glove_vectors.vocab)[0:10])

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]


In [6]:
glove_vectors[glove_vectors.vocab]

array([[ 0.418   ,  0.24968 , -0.41242 , ..., -0.18411 , -0.11514 ,
        -0.78581 ],
       [ 0.013441,  0.23682 , -0.16899 , ..., -0.56657 ,  0.044691,
         0.30392 ],
       [ 0.15164 ,  0.30177 , -0.16763 , ..., -0.35652 ,  0.016413,
         0.10216 ],
       ...,
       [-0.51181 ,  0.058706,  1.0913  , ..., -0.25003 , -1.125   ,
         1.5863  ],
       [-0.75898 , -0.47426 ,  0.4737  , ...,  0.78954 , -0.014116,
         0.6448  ],
       [ 0.072617, -0.51393 ,  0.4728  , ..., -0.18907 , -0.59021 ,
         0.55559 ]], dtype=float32)

In [17]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(glove_vectors[glove_vectors.vocab])

In [18]:
pca.explained_variance_ratio_

array([0.05418382, 0.04413974, 0.03595718])

In [23]:
pca_result_df = pd.DataFrame([pca_result[:,0], pca_result[:,1], pca_result[:,2]]).T
pca_result_df.columns = ["x", "y", "z"]
pca_result_df["word"] = list(glove_vectors.vocab)

In [24]:
pca_result_df = pca_result_df.head(300)

In [25]:
pca_result_df.head(10)

Unnamed: 0,x,y,z,word
0,5.216968,2.071525,-1.041188,the
1,4.079723,3.07118,-0.782012,","
2,4.607769,2.452483,-0.470738,.
3,5.049289,2.102895,-1.197363,of
4,5.046917,2.349576,-0.315912,to
5,4.548707,2.689034,-0.496939,and
6,5.170194,1.925231,-1.519525,in
7,5.011414,2.323164,-0.15561,a
8,3.896892,2.540642,-0.945204,""""
9,4.346787,3.494284,-0.825948,'s


In [26]:
trace1 = go.Scatter3d(x=pca_result_df["x"], y=pca_result_df["y"], z=pca_result_df["z"],
                      mode='markers', marker={"size": 4, "opacity": 0.8},
                      text=pca_result_df["word"])
data = [trace1]

layout = go.Layout(margin={"l": 0, "r": 0, "b": 0, "t": 0}, 
                   scene={"xaxis": {"title": "1st Principle Component"}, 
                          "yaxis": {"title": "2nd Principle Component"}, 
                          "zaxis": {"title": "3rd Principel Component"},
                          "annotations": [{"showarrow": False,
                                           "x": row["x"],
                                           "y": row["y"],
                                           "z": row["z"],
                                           "text": row["word"],
                                           "xanchor": "left",
                                           "xshift": 5,
                                           "opacity": 1} for _, row in pca_result_df.iterrows()]})

iplot(go.Figure(data=data, layout=layout))

In [27]:
pca_result_df = pd.DataFrame([pca_result[:,0], pca_result[:,1], pca_result[:,2]]).T
pca_result_df.columns = ["x", "y", "z"]
pca_result_df["word"] = list(glove_vectors.vocab)

In [28]:
pca_result_df = pca_result_df[pca_result_df["word"].isin(["king", "queen", "uncle", "aunt", "man", "woman"])]

In [29]:
pca_result_df

Unnamed: 0,x,y,z,word
300,4.164795,2.86948,-0.076412,man
691,3.350322,3.133873,-1.148513,king
787,3.934626,2.170757,-0.023347,woman
2060,3.360914,1.810852,-1.060087,queen
5152,1.837812,3.200901,-0.300916,uncle
9651,1.592711,2.367068,-0.39079,aunt


In [30]:
trace1 = go.Scatter3d(x=pca_result_df["x"], y=pca_result_df["y"], z=pca_result_df["z"],
                      mode='markers', marker={"size": 4, "opacity": 0.8},
                      text=pca_result_df["word"])
data = [trace1]

layout = go.Layout(margin={"l": 0, "r": 0, "b": 0, "t": 0}, 
                   scene={"xaxis": {"title": "1st Principle Component"}, 
                          "yaxis": {"title": "2nd Principle Component"}, 
                          "zaxis": {"title": "3rd Principel Component"},
                          "annotations": [{"showarrow": False,
                                           "x": row["x"],
                                           "y": row["y"],
                                           "z": row["z"],
                                           "text": row["word"],
                                           "xanchor": "left",
                                           "xshift": 5,
                                           "opacity": 1} for _, row in pca_result_df.iterrows()]})

iplot(go.Figure(data=data, layout=layout))

## 02. Sentiment Analysis our pre-built embeddings

In [31]:
documents = ["oh my god! So good. Philly grinder!",
             "This is one of the best pizza's in town. The 7 cheese pizza is A mazing.... stuffed churros, yum. Great upbeat staff. I could eat here every night.",
             "I'm telling you YOUR SERVICE STINKS AND YOU'D RETAIN MORE CUSTOMERS IF YOU'D JUST TEACH THEM TO USE THE PHONE PROPERLY AND TAKE AN ORDER PROPERLY.",
             "My favorite Fort Collins pizza place! I always go for the south of the border. They also have the best spicy ranch.",
             "Nope. Don't do it. Ordered a pizza over 2 hours ago and it still hasn't shown up.",
             "My favorite pizza place in fort collins! I always order South of the border with cream cheese the staff are awesome. One time the pizza deliver guy came super late. He apologized and gave us restaurant credit."]

labels = np.array([1, 1, 0, 1, 1, 0])

In [32]:
encoded_documents = []
for line in documents:
    line = strip_punctuation(line)
    encoded = []
    for word in line.split():
        try:
            encoded.append(glove_vectors.vocab[word].index)
        except:
            encoded.append(0)
    encoded_documents.append(encoded)

In [33]:
encoded_documents[0]

[3202, 192, 1533, 0, 219, 0, 39890]

In [34]:
max_length = max([len(i.split()) for i in documents])

In [35]:
max_length

37

In [36]:
padded_documents = pad_sequences(encoded_documents, maxlen=max_length, padding='post')

In [37]:
padded_documents[0]

array([ 3202,   192,  1533,     0,   219,     0, 39890,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

In [38]:
embedding_input_dim = glove_vectors[glove_vectors.vocab].shape[0]
embedding_output_dim = glove_vectors[glove_vectors.vocab].shape[1]

In [39]:
embedding_input_dim

400000

In [40]:
embedding_output_dim

50

In [41]:
model = Sequential()
model.add(Embedding(input_dim=embedding_input_dim, output_dim=embedding_output_dim, weights=[glove_vectors[glove_vectors.vocab]], input_length=max_length))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss="binary_crossentropy", optimizer="adam")
model.fit(padded_documents, labels, epochs=50, verbose=0)
model.evaluate(padded_documents, labels, verbose=3)

0.0003093615814577788

In [42]:
model.save("model.h5")
model = load_model("model.h5")

In [43]:
model.predict(padded_documents)

array([[9.9987817e-01],
       [9.9998033e-01],
       [1.4585104e-03],
       [9.9992275e-01],
       [9.9982244e-01],
       [2.5677042e-07]], dtype=float32)

In [44]:
model.predict_classes(padded_documents)

array([[1],
       [1],
       [0],
       [1],
       [1],
       [0]], dtype=int32)

In [45]:
labels

array([1, 1, 0, 1, 1, 0])

In [46]:
accuracy_score(labels, model.predict_classes(padded_documents))

1.0

### Create an example document

In [47]:
documents = ["The pizza is great, I would highly recommend it",
             "The pizza is bad"]

In [48]:
encoded_documents = []
for line in documents:
    line = strip_punctuation(line)
    encoded = []
    for word in line.split():
        try:
            encoded.append(glove_vectors.vocab[word].index)
        except:
            encoded.append(0)
    encoded_documents.append(encoded)

In [49]:
encoded_documents[0]

[0, 9388, 14, 353, 0, 54, 1786, 7546, 20]

In [50]:
padded_documents = pad_sequences(encoded_documents, maxlen=max_length, padding='post')

In [51]:
padded_documents[0]

array([   0, 9388,   14,  353,    0,   54, 1786, 7546,   20,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

In [52]:
model.predict_classes(padded_documents)

array([[1],
       [0]], dtype=int32)

## Embedding Projector

https://projector.tensorflow.org/