In [2]:
%pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.3.2
Note: you may need to restart the kernel to use updated packages.


In [52]:

import pandas as pd
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import multiprocessing

In [53]:
df_fin_phrase = pd.read_csv('../../Data/Raw/fin_phrase_bank.csv')

In [54]:
df_fin_phrase.columns

Index(['sentence', 'label'], dtype='object')

In [55]:
def clean_data(text):
    
    text = re.sub(r'[\\/×\^\]\[÷]', '', text)
     # remove punctuation at the end of words
    text = re.sub(r'[.,!?]', ' ', text)
    # remove special characters except fo decimal points
    text = re.sub(r'[^a-zA-Z\s.]', ' ', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # remove leading and trailing spaces
    text = re.sub(r'^\s+|\s+?$', ' ', text.lower())
    return text
def change_lower(text):
    text = text.lower()
    return text

stopwords_list = stopwords.words("english")
def remover(text):
    text_tokens = text.split(" ")
    final_list = [word for word in text_tokens if not word in stopwords_list]
    text = ' '.join(final_list)
    return text

def get_w2vdf(df):
    w2v_df = pd.DataFrame(df["sentence"]).values.tolist()
    for i in range(len(w2v_df)):
        w2v_df[i] = w2v_df[i][0].split(" ")
    return w2v_df


In [72]:
def train_w2v(w2v_df):
    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=2,
                         window=2,
                         vector_size=500, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         sg=1,
                         workers=cores-1)
    
    w2v_model.build_vocab(w2v_df, progress_per=10000)
    w2v_model.train(w2v_df, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)
    return w2v_model

In [None]:
#getting size of the vector size for sentence
def getVectorSize():
    pass

In [56]:
df_fin_phrase[["sentence"]] = df_fin_phrase[["sentence"]].astype(str)
df_fin_phrase["sentence"] = df_fin_phrase["sentence"].apply(change_lower)
df_fin_phrase["sentence"] = df_fin_phrase["sentence"].apply(clean_data)
df_fin_phrase["sentence"] = df_fin_phrase["sentence"].apply(remover)

In [57]:
df_fin_phrase["sentence"]

0       according gran company plans move production r...
1       technopolis plans develop stages area less squ...
2       international electronic industry company elco...
3       new production plant company would increase ca...
4       according company updated strategy years baswa...
                              ...                        
4841    london marketwatch share prices ended lower lo...
4842    rinkuskiai beer sales fell per cent million li...
4843    operating profit fell eur mn eur mn including ...
4844    net sales paper segment decreased eur mn secon...
4845    sales finland decreased january sales outside ...
Name: sentence, Length: 4846, dtype: object

In [73]:
w2v_df = get_w2vdf(df_fin_phrase)
w2v_model = train_w2v(w2v_df)

In [74]:
# print keys of the model
w2v_model.wv.key_to_index

{'': 0,
 'eur': 1,
 'company': 2,
 'mn': 3,
 'said': 4,
 'finnish': 5,
 'sales': 6,
 'million': 7,
 'net': 8,
 'profit': 9,
 'year': 10,
 'finland': 11,
 'group': 12,
 'operating': 13,
 'mln': 14,
 'new': 15,
 'business': 16,
 'period': 17,
 'quarter': 18,
 'oyj': 19,
 'share': 20,
 'market': 21,
 'also': 22,
 'services': 23,
 'shares': 24,
 'first': 25,
 'euro': 26,
 'helsinki': 27,
 'loss': 28,
 'operations': 29,
 'today': 30,
 'compared': 31,
 'contract': 32,
 'nokia': 33,
 'mobile': 34,
 'total': 35,
 'per': 36,
 'financial': 37,
 'based': 38,
 'production': 39,
 'products': 40,
 'corporation': 41,
 'percent': 42,
 'bank': 43,
 'according': 44,
 'hel': 45,
 'companies': 46,
 'technology': 47,
 'corresponding': 48,
 'plant': 49,
 'service': 50,
 'v': 51,
 'solutions': 52,
 'construction': 53,
 'one': 54,
 'capital': 55,
 'increased': 56,
 'well': 57,
 'agreement': 58,
 'investment': 59,
 'customers': 60,
 'increase': 61,
 'rose': 62,
 'value': 63,
 'pct': 64,
 'order': 65,
 'oy': 66

In [71]:
# window of 6
w2v_model.wv.most_similar("publish")

[('bulletin', 0.5709467530250549),
 ('eet', 0.5367853045463562),
 ('analyses', 0.5361486673355103),
 ('summary', 0.4960907995700836),
 ('books', 0.4860750138759613),
 ('statements', 0.481698602437973),
 ('interim', 0.47395867109298706),
 ('viii', 0.4587167203426361),
 ('pakistan', 0.45790576934814453),
 ('announcement', 0.45124995708465576)]

In [75]:
# window of 2
w2v_model.wv.most_similar("publish")

[('revealed', 0.5955195426940918),
 ('anne', 0.5440590977668762),
 ('interim', 0.5399895906448364),
 ('biomass', 0.5242285132408142),
 ('bulletin', 0.5096632242202759),
 ('hydrogen', 0.5056923627853394),
 ('peroxide', 0.5012044310569763),
 ('analyses', 0.49967774748802185),
 ('books', 0.4982476830482483),
 ('unaudited', 0.4959008991718292)]

In [None]:
w2v_model.wv.most_similar("publish")