In [1]:
import pandas as pd
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords

import numpy as np


[nltk_data] Downloading package stopwords to /Users/maxko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/maxko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv('CompanyDescriptions/results/result_wikipedia.csv', header=None)

# can be changed 
num_dim = 10


In [3]:

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

y_train_preprocessed = data[1].apply(preprocess)

y_train_preprocessed

0      3m company originally minnesota mining manufac...
1      smith corporation american manufacturer reside...
2      abbott laboratories american multinational med...
3      abbvie inc american pharmaceutical company hea...
4      accenture plc irishamerican34 professional ser...
                             ...                        
498    yum brands inc sometimes called simply yum for...
499    zebra technologies corporation american mobile...
500    zimmer biomet holdings inc publicly traded med...
501    zions bancorporation national bank headquarter...
502    zoetis inc zōehtis3 american drug company worl...
Name: 1, Length: 503, dtype: object

In [4]:
sentences = [sentence.split() for sentence in y_train_preprocessed]
w2v_model = Word2Vec(sentences, vector_size=num_dim, window=5, min_count=5, workers=4)

In [5]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(num_dim)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

y_train_vec = np.array([vectorize(sentence) for sentence in y_train_preprocessed])
y_train_vec
# 100 dims

array([[ 0.14531448, -0.31409773,  0.93277335, ...,  0.79821116,
        -1.04476368, -0.93657184],
       [ 0.14396156, -0.33005032,  0.95867735, ...,  0.79357165,
        -1.0555861 , -0.94542086],
       [ 0.15634735, -0.33780304,  0.98141581, ...,  0.86712593,
        -1.10869014, -0.99961019],
       ...,
       [ 0.15221711, -0.33111107,  0.96135604, ...,  0.82860547,
        -1.08630109, -0.97787547],
       [ 0.17055503, -0.32381296,  0.93152189, ...,  0.78968233,
        -1.02204382, -0.8764475 ],
       [ 0.16871123, -0.33991182,  0.98023301, ...,  0.81830347,
        -1.10821581, -0.98673064]])

In [6]:

# Calculate the Frobenius norm
frobenius_norm = np.linalg.norm(y_train_vec)

# Normalize the matrix
normalized_matrix = y_train_vec / frobenius_norm

#print
normalized_matrix

array([[ 0.00290956, -0.00628901,  0.01867643, ...,  0.01598216,
        -0.02091875, -0.01875248],
       [ 0.00288247, -0.00660842,  0.01919509, ...,  0.01588927,
        -0.02113544, -0.01892966],
       [ 0.00313046, -0.00676365,  0.01965037, ...,  0.017362  ,
        -0.02219872, -0.02001467],
       ...,
       [ 0.00304776, -0.00662966,  0.01924872, ...,  0.01659073,
        -0.02175043, -0.01957948],
       [ 0.00341493, -0.00648354,  0.01865137, ...,  0.01581139,
        -0.02046384, -0.01754864],
       [ 0.00337802, -0.00680587,  0.01962669, ...,  0.01638446,
        -0.02218922, -0.01975679]])

In [7]:
DF = pd.DataFrame(y_train_vec)
DF.insert(loc=0, column=None, value=data[0])
DF.iloc[1:,:]
DF

Unnamed: 0,NaN,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
0,MMM,0.145314,-0.314098,0.932773,0.134556,0.550578,0.169763,1.015928,0.798211,-1.044764,-0.936572
1,AOS,0.143962,-0.330050,0.958677,0.104702,0.533082,0.151679,1.012805,0.793572,-1.055586,-0.945421
2,ABT,0.156347,-0.337803,0.981416,0.128889,0.594078,0.152767,1.075543,0.867126,-1.108690,-0.999610
3,ABBV,0.178424,-0.330705,0.990502,0.097597,0.559187,0.154106,1.075746,0.805009,-1.112250,-0.992841
4,ACN,0.130635,-0.306604,0.901744,0.104943,0.546478,0.159083,1.021946,0.744592,-1.029023,-0.906673
...,...,...,...,...,...,...,...,...,...,...,...
498,YUM,0.113347,-0.235512,0.738066,0.100920,0.462426,0.101264,0.803032,0.628627,-0.828404,-0.733375
499,ZBRA,0.161444,-0.310024,0.950601,0.120797,0.562274,0.158039,1.033530,0.769918,-1.048851,-0.949256
500,ZBH,0.152217,-0.331111,0.961356,0.096282,0.578794,0.158430,1.058673,0.828605,-1.086301,-0.977875
501,ZION,0.170555,-0.323813,0.931522,0.083883,0.523752,0.161114,1.025756,0.789682,-1.022044,-0.876447


In [8]:
# save to csv
f_name = f'stock2vec_{num_dim}.csv'
DF.to_csv(f_name, header=None, index=None)

In [9]:
stock_vector = pd.read_csv('stock2vec.csv', header=None)
stock_vector

Unnamed: 0,0,1,2,3,4,5
0,MMM,0.245664,0.784547,1.871211,-0.849434,-0.139912
1,AOS,0.262782,0.738603,1.861308,-0.855644,-0.119608
2,ABT,0.278116,0.829196,1.947163,-0.912031,-0.145768
3,ABBV,0.264274,0.805243,1.955221,-0.911898,-0.149799
4,ACN,0.250276,0.819325,1.827672,-0.892013,-0.095926
...,...,...,...,...,...,...
498,YUM,0.220775,0.655211,1.488360,-0.664815,-0.085088
499,ZBRA,0.266783,0.806968,1.863385,-0.851786,-0.155958
500,ZBH,0.336645,0.882239,1.907940,-0.919268,-0.185976
501,ZION,0.382002,0.849537,1.906690,-0.989798,-0.090075


In [10]:
######## For Testing ########

tickerA = 'AAPL'
tickerB = 'MSFT'

a = stock_vector[stock_vector[0] == tickerA].to_numpy()[0][1:]
b = stock_vector[stock_vector[0] == tickerB].to_numpy()[0][1:]
print(tickerA, a)
print(tickerB, b)

dist = np.linalg.norm(a-b)
dist

AAPL [0.2484474629163742 0.8379353284835815 1.878481149673462
 -0.8634873032569885 -0.1390103548765182]
MSFT [0.2450104504823684 0.7343908548355103 1.7268694639205933
 -0.8050528764724731 -0.1115623712539672]


0.19464672710562833