In [24]:
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv', encoding='latin-1')

education = data['education']
print('education shape:', education.shape)

# define training data
sentences = []
for i in range(education.shape[0]):
    if isinstance(education[i], str):
        tmp = education[i].split()
        sentences.append(tmp)
    else:
        sentences.append([])


# train model
# word embeddings of size 100x1
model = Word2Vec(sentences, size=100, window=5, workers=4)

# take the vector for every user by adding the embedding of each word
users = []
for i in range(len(sentences)):
    if len(sentences[i]) > 0:
        sum = [0] * 100
        for word in sentences[i]:
            if word in list(model.wv.vocab):
                we = model[word]
                sum = [x + y for x, y in zip(sum, we)]
        users.append(sum)
    else:
        users.append([-100] * 100)


users_np = np.array(users)
# Take 200 users just to demonstrate
users_np = users_np[0:200]
print('users shape:', users_np.shape)

education shape: (70417,)
users shape: (200, 100)


In [26]:
# find similarities
similarities = np.zeros((users_np.shape[0], users_np.shape[0]))
for i in range(users_np.shape[0]):
    print('User:', i, 'out of', users_np.shape[0]-1)
    for j in range(i,users_np.shape[0]):
        similarities[i][j] = cosine(users_np[i], users_np[j])

User: 0 out of 199
User: 1 out of 199
User: 2 out of 199
User: 3 out of 199
User: 4 out of 199
User: 5 out of 199
User: 6 out of 199
User: 7 out of 199
User: 8 out of 199
User: 9 out of 199
User: 10 out of 199
User: 11 out of 199
User: 12 out of 199
User: 13 out of 199
User: 14 out of 199
User: 15 out of 199
User: 16 out of 199
User: 17 out of 199
User: 18 out of 199
User: 19 out of 199
User: 20 out of 199
User: 21 out of 199
User: 22 out of 199
User: 23 out of 199
User: 24 out of 199
User: 25 out of 199
User: 26 out of 199
User: 27 out of 199
User: 28 out of 199
User: 29 out of 199
User: 30 out of 199
User: 31 out of 199
User: 32 out of 199
User: 33 out of 199
User: 34 out of 199
User: 35 out of 199
User: 36 out of 199
User: 37 out of 199
User: 38 out of 199
User: 39 out of 199
User: 40 out of 199
User: 41 out of 199
User: 42 out of 199
User: 43 out of 199
User: 44 out of 199
User: 45 out of 199
User: 46 out of 199
User: 47 out of 199
User: 48 out of 199
User: 49 out of 199
User: 50 o

In [27]:
# copy upper triangle to lower triangle
for i in range(users_np.shape[0]):
    for j in range(i,users_np.shape[0]):
        similarities[j][i] = similarities[i][j]

In [28]:
# ensure that the main diagonal is one
for i in range(users_np.shape[0]):
    similarities[i][i] = 1.0

In [29]:
pd.DataFrame(similarities)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,1.000000,0.503777,0.211188,0.979106,0.139602,0.279723,0.434610,0.306443,0.235275,8.410454e-02,...,0.319916,0.201192,0.291156,0.392431,0.227895,0.295813,0.979106,0.486971,8.410454e-02,0.508905
1,0.503777,1.000000,0.299720,0.863628,0.533933,0.265926,0.306195,0.532286,0.362945,4.652255e-01,...,0.403326,0.403514,0.358005,0.330000,0.378574,0.479854,0.863628,0.741486,4.652255e-01,0.399206
2,0.211188,0.299720,1.000000,0.999106,0.230327,0.119340,0.324166,0.280566,0.191540,1.667292e-01,...,0.205019,0.156722,0.145160,0.408003,0.144673,0.324716,0.999106,0.624640,1.667292e-01,0.515173
3,0.979106,0.863628,0.999106,1.000000,0.917020,0.944067,0.940996,0.981173,0.917859,9.681781e-01,...,0.972260,0.976595,1.002817,0.806514,0.942304,0.950055,0.000000,0.962260,9.681781e-01,0.864892
4,0.139602,0.533933,0.230327,0.917020,1.000000,0.308347,0.608147,0.393511,0.245049,8.561727e-02,...,0.417980,0.146847,0.402802,0.476120,0.203895,0.446673,0.917020,0.505917,8.561727e-02,0.675483
5,0.279723,0.265926,0.119340,0.944067,0.308347,1.000000,0.111769,0.270201,0.135593,2.001361e-01,...,0.088690,0.162665,0.053355,0.297979,0.045239,0.299171,0.944067,0.562573,2.001361e-01,0.290060
6,0.434610,0.306195,0.324166,0.940996,0.608147,0.111769,1.000000,0.369835,0.275046,4.162794e-01,...,0.150250,0.384651,0.120932,0.288362,0.206675,0.353748,0.940996,0.641824,4.162794e-01,0.213312
7,0.306443,0.532286,0.280566,0.981173,0.393511,0.270201,0.369835,1.000000,0.363543,3.362559e-01,...,0.215902,0.341235,0.237878,0.375860,0.298790,0.146389,0.981173,0.229462,3.362559e-01,0.298872
8,0.235275,0.362945,0.191540,0.917859,0.245049,0.135593,0.275046,0.363543,1.000000,1.416988e-01,...,0.217020,0.175889,0.187950,0.367663,0.113028,0.365233,0.917859,0.660348,1.416988e-01,0.434636
9,0.084105,0.465226,0.166729,0.968178,0.085617,0.200136,0.416279,0.336256,0.141699,1.000000e+00,...,0.305777,0.113880,0.256148,0.417682,0.128029,0.360592,0.968178,0.548358,-2.220446e-16,0.558305
