In [14]:
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Deciding which embedding to use
possible_word_vectors = (50, 100, 200, 300)
word_vectors = possible_word_vectors[0]

# TODO: download from here
# https://github.com/stanfordnlp/GloVe
# specifically: http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
file_name = f'glove.6B.{word_vectors}d.txt'

filepath = '/Users/nickager/Downloads/glove/'
pretrained_embedding = os.path.join(filepath, file_name)

    
embeddings_index = {}
with open(pretrained_embedding, "rb") as f:
    for line in f:
        values = line.split()
        word = values[0].decode("utf-8") 
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [13]:
# taken from: /Users/nickager/Downloads/glove/glove.6B.50d.txt
"king 0.50451 0.68607 -0.59517 -0.022801 0.60046 -0.13498 -0.08813 0.47377 -0.61798 -0.31012 -0.076666 1.493 -0.034189 -0.98173 0.68229 0.81722 -0.51874 -0.31503 -0.55809 0.66421 0.1961 -0.13495 -0.11476 -0.30344 0.41177 -2.223 -1.0756 -1.0783 -0.34354 0.33505 1.9927 -0.04234 -0.64319 0.71125 0.49159 0.16754 0.34344 -0.25663 -0.8523 0.1661 0.40102 1.1685 -1.0137 -0.21585 -0.15155 0.78321 -0.91241 -1.6106 -0.64426 -0.51042".replace(" ", ",")
king = np.asarray([0.50451,0.68607,-0.59517,-0.022801,0.60046,-0.13498,-0.08813,0.47377,-0.61798,-0.31012,-0.076666,1.493,-0.034189,-0.98173,0.68229,0.81722,-0.51874,-0.31503,-0.55809,0.66421,0.1961,-0.13495,-0.11476,-0.30344,0.41177,-2.223,-1.0756,-1.0783,-0.34354,0.33505,1.9927,-0.04234,-0.64319,0.71125,0.49159,0.16754,0.34344,-0.25663,-0.8523,0.1661,0.40102,1.1685,-1.0137,-0.21585,-0.15155,0.78321,-0.91241,-1.6106,-0.64426,-0.51042])
"queen 0.37854 1.8233 -1.2648 -0.1043 0.35829 0.60029 -0.17538 0.83767 -0.056798 -0.75795 0.22681 0.98587 0.60587 -0.31419 0.28877 0.56013 -0.77456 0.071421 -0.5741 0.21342 0.57674 0.3868 -0.12574 0.28012 0.28135 -1.8053 -1.0421 -0.19255 -0.55375 -0.054526 1.5574 0.39296 -0.2475 0.34251 0.45365 0.16237 0.52464 -0.070272 -0.83744 -1.0326 0.45946 0.25302 -0.17837 -0.73398 -0.20025 0.2347 -0.56095 -2.2839 0.0092753 -0.60284".replace(" ", ",")
queen = np.asarray([0.37854,1.8233,-1.2648,-0.1043,0.35829,0.60029,-0.17538,0.83767,-0.056798,-0.75795,0.22681,0.98587,0.60587,-0.31419,0.28877,0.56013,-0.77456,0.071421,-0.5741,0.21342,0.57674,0.3868,-0.12574,0.28012,0.28135,-1.8053,-1.0421,-0.19255,-0.55375,-0.054526,1.5574,0.39296,-0.2475,0.34251,0.45365,0.16237,0.52464,-0.070272,-0.83744,-1.0326,0.45946,0.25302,-0.17837,-0.73398,-0.20025,0.2347,-0.56095,-2.2839,0.0092753,-0.60284])
"man -0.094386 0.43007 -0.17224 -0.45529 1.6447 0.40335 -0.37263 0.25071 -0.10588 0.10778 -0.10848 0.15181 -0.65396 0.55054 0.59591 -0.46278 0.11847 0.64448 -0.70948 0.23947 -0.82905 1.272 0.033021 0.2935 0.3911 -2.8094 -0.70745 0.4106 0.3894 -0.2913 2.6124 -0.34576 -0.16832 0.25154 0.31216 0.31639 0.12539 -0.012646 0.22297 -0.56585 -0.086264 0.62549 -0.0576 0.29375 0.66005 -0.53115 -0.48233 -0.97925 0.53135 -0.11725".replace(" ", ",")
man = np.asarray([-0.094386,0.43007,-0.17224,-0.45529,1.6447,0.40335,-0.37263,0.25071,-0.10588,0.10778,-0.10848,0.15181,-0.65396,0.55054,0.59591,-0.46278,0.11847,0.64448,-0.70948,0.23947,-0.82905,1.272,0.033021,0.2935,0.3911,-2.8094,-0.70745,0.4106,0.3894,-0.2913,2.6124,-0.34576,-0.16832,0.25154,0.31216,0.31639,0.12539,-0.012646,0.22297,-0.56585,-0.086264,0.62549,-0.0576,0.29375,0.66005,-0.53115,-0.48233,-0.97925,0.53135,-0.11725])
"woman -0.18153 0.64827 -0.5821 -0.49451 1.5415 1.345 -0.43305 0.58059 0.35556 -0.25184 0.20254 -0.71643 0.3061 0.56127 0.83928 -0.38085 -0.90875 0.43326 -0.014436 0.23725 -0.53799 1.7773 -0.066433 0.69795 0.69291 -2.6739 -0.76805 0.33929 0.19695 -0.35245 2.292 -0.27411 -0.30169 0.00085286 0.16923 0.091433 -0.02361 0.036236 0.34488 -0.83947 -0.25174 0.42123 0.48616 0.022325 0.5576 -0.85223 -0.23073 -1.3138 0.48764 -0.10467".replace(" ", ",")
woman = np.asarray([-0.18153,0.64827,-0.5821,-0.49451,1.5415,1.345,-0.43305,0.58059,0.35556,-0.25184,0.20254,-0.71643,0.3061,0.56127,0.83928,-0.38085,-0.90875,0.43326,-0.014436,0.23725,-0.53799,1.7773,-0.066433,0.69795,0.69291,-2.6739,-0.76805,0.33929,0.19695,-0.35245,2.292,-0.27411,-0.30169,0.00085286,0.16923,0.091433,-0.02361,0.036236,0.34488,-0.83947,-0.25174,0.42123,0.48616,0.022325,0.5576,-0.85223,-0.23073,-1.3138,0.48764,-0.10467])

predicted_queen = king - man + woman

diffence = queen - predicted_queen
print("queen =",queen)
print("predicted_queen = ", predicted_queen)
print("diffence = ", diffence)
# I guess it might be closer with a longer encoding eg "glove.6B.300d.txt"

queen = [ 0.37854    1.8233    -1.2648    -0.1043     0.35829    0.60029
 -0.17538    0.83767   -0.056798  -0.75795    0.22681    0.98587
  0.60587   -0.31419    0.28877    0.56013   -0.77456    0.071421
 -0.5741     0.21342    0.57674    0.3868    -0.12574    0.28012
  0.28135   -1.8053    -1.0421    -0.19255   -0.55375   -0.054526
  1.5574     0.39296   -0.2475     0.34251    0.45365    0.16237
  0.52464   -0.070272  -0.83744   -1.0326     0.45946    0.25302
 -0.17837   -0.73398   -0.20025    0.2347    -0.56095   -2.2839
  0.0092753 -0.60284  ]
predicted_queen =  [ 0.417366    0.90427    -1.00503    -0.062021    0.49726     0.80667
 -0.14855     0.80365    -0.15654    -0.66974     0.234354    0.62476
  0.925871   -0.971       0.92566     0.89915    -1.54596    -0.52625
  0.136954    0.66199     0.48716     0.37035    -0.214214    0.10101
  0.71358    -2.0875     -1.1362     -1.14961    -0.53599     0.2739
  1.6723      0.02931    -0.77656     0.46056286  0.34866    -0.057417
  0.1944

In [15]:
# Getting the data
cats = ['alt.atheism', 'sci.space']
# After try a multiclass example
# cats = ['alt.atheism', 'talk.religion.misc',
#         'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)

X_train = newsgroups_train['data']
y_train = newsgroups_train['target']

X_test = newsgroups_test['data']
y_test = newsgroups_test['target']

In [17]:
class EmbeddingVectorizer(object):
    """
    Follows the scikit-learn API
    Transform each document in the average
    of the embeddings of the words in it
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 50
        
    def fit(self, X, y):
        return self

    def transform(self, X):
        """
        Find the embedding vector for each word in the dictionary
        and take the mean for each document
        """
        # Renaming it just to make it more understandable 
        documents = X
        embedded_docs = []
        for document in documents:
            # For each document
            # Consider the mean of all the embeddings
            embedded_document = []
            for words in document:

                for w in words:
                    if w in self.word2vec:
                        embedded_word = self.word2vec[w]
                    else:
                        embedded_word = np.zeros(self.dim)
                    embedded_document.append(embedded_word
            embedded_docs.append(np.mean(embedded_document, axis=0))

        return embedded_docs


SyntaxError: invalid syntax (<ipython-input-17-74c05b0f5b11>, line 35)

In [263]:
# Creating the embedding
e = EmbeddingVectorizer(embeddings_index)
X_train_embedded = e.transform(X_train)

# Train the classifier
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_embedded, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [264]:
X_test_embedded = e.transform(X_test)
predictions = rf.predict(X_test_embedded)

In [268]:
print('AUC score: ', roc_auc_score(predictions, y_test))
confusion_matrix(predictions, y_test)

AUC score:  0.7405204936377006


array([[224,  88],
       [ 95, 306]])