# **WORD2VEC**

In [3]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import logging

# Route gensim's progress messages (vocabulary scan, training lifecycle events)
# to the notebook output by configuring the root logger at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Stream a text file as tokenized lines
def read_corpus(file_path):
    """Lazily yield one token list per line of a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.

    Yields:
        The result of ``simple_preprocess`` applied to each line, in file order.
        Every line produces exactly one item.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating the handle gives one line at a time; map keeps it lazy.
        yield from map(simple_preprocess, handle)

# Text file holding the raw corpus, one passage per line
file_path = 'finance_dataset.txt'

# Materialise the generator into a list so the corpus can be indexed
# and passed to more than one model later in the notebook.
corpus = [tokens for tokens in read_corpus(file_path)]



In [2]:
# Sanity check: show the token list produced for the second line of the file.
corpus[1]

['whether',
 'you',
 're',
 'first',
 'time',
 'investor',
 'or',
 'have',
 'been',
 'investing',
 'for',
 'many',
 'years',
 'there',
 'are',
 'some',
 'basic',
 'questions',
 'you',
 'should',
 'always',
 'ask',
 'before',
 'you',
 'commit',
 'your',
 'hard',
 'earned',
 'money',
 'to',
 'an',
 'investment']

## CBOW architecture

In [3]:
# Train a Word2Vec model on the tokenized corpus.
# sg=0 is the library default and selects the CBOW objective; it is spelled out
# here so the contrast with the skip-gram cell (sg=1) is visible.
# min_count=2 drops words that occur only once; workers=8 sets the number of
# training threads.
cmod_model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=2,
    workers=8,
    sg=0,
)

# Optional: persist the trained model and reload it later
# cmod_model.save("word2vec.model")
# cmod_model = Word2Vec.load("word2vec.model")

# Optional: continue training an existing pre-trained model on this corpus
# cmod_model = Word2Vec.load('pretrained_model_path')
# cmod_model.build_vocab(corpus, update=True)
# cmod_model.train(corpus, total_examples=cmod_model.corpus_count, epochs=cmod_model.epochs)



2024-07-04 20:35:12,335 : INFO : collecting all words and their counts
2024-07-04 20:35:12,336 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-07-04 20:35:12,344 : INFO : collected 2800 word types from a corpus of 41775 raw words and 2686 sentences
2024-07-04 20:35:12,345 : INFO : Creating a fresh vocabulary
2024-07-04 20:35:12,348 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 1854 unique words (66.21% of original 2800, drops 946)', 'datetime': '2024-07-04T20:35:12.348218', 'gensim': '4.3.2', 'python': '3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-113-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2024-07-04 20:35:12,348 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 40829 word corpus (97.74% of original 41775, drops 946)', 'datetime': '2024-07-04T20:35:12.348576', 'gensim': '4.3.2', 'python': '3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]', 'platform': 'L

In [4]:
# Test the model: list the nearest neighbours of 'money' in the embedding space.
# Left as a bare last expression (no print) so the notebook renders it as the
# cell's Out[] value, matching the other inspection cells.
# NOTE(review): the recorded scores are all ~0.999 and include function words
# ('their', 'by', 'than'); this looks like an under-trained model on a small
# corpus - consider more epochs or more data before trusting these neighbours.
cmod_model.wv.most_similar('money')

[('market', 0.9996460676193237), ('one', 0.9995118975639343), ('bonds', 0.9994921684265137), ('stock', 0.999480128288269), ('prices', 0.9994395971298218), ('higher', 0.9994344115257263), ('their', 0.9994331002235413), ('by', 0.9994164705276489), ('securities', 0.9994156956672668), ('than', 0.9994068741798401)]


In [5]:
# Nearest neighbours of 'fraud' according to the model trained above.
cmod_model.wv.most_similar('fraud')

[('more', 0.99833744764328),
 ('so', 0.9982842206954956),
 ('these', 0.9982642531394958),
 ('their', 0.9982610940933228),
 ('would', 0.99825519323349),
 ('if', 0.9982499480247498),
 ('generally', 0.9982090592384338),
 ('within', 0.9981884956359863),
 ('will', 0.9981809854507446),
 ('class', 0.9981784820556641)]

## Skip-gram architecture

In [6]:
# Train a second model on the same corpus with the skip-gram objective (sg=1).
# Note: min_count=1 keeps every word, whereas cmod_model was trained with
# min_count=2, so the two vocabularies differ.
skip_gram_model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)

2024-07-04 20:35:37,390 : INFO : collecting all words and their counts
2024-07-04 20:35:37,391 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-07-04 20:35:37,396 : INFO : collected 2800 word types from a corpus of 41775 raw words and 2686 sentences
2024-07-04 20:35:37,398 : INFO : Creating a fresh vocabulary
2024-07-04 20:35:37,405 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 2800 unique words (100.00% of original 2800, drops 0)', 'datetime': '2024-07-04T20:35:37.405253', 'gensim': '4.3.2', 'python': '3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-113-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
2024-07-04 20:35:37,405 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 41775 word corpus (100.00% of original 41775, drops 0)', 'datetime': '2024-07-04T20:35:37.405944', 'gensim': '4.3.2', 'python': '3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]', 'platform': 'Lin

In [7]:
# Nearest neighbours of 'money' in the skip-gram embedding.
skip_gram_model.wv.most_similar('money')

[('market', 0.9646204113960266),
 ('stocks', 0.9560030698776245),
 ('lower', 0.9381133913993835),
 ('low', 0.9293083548545837),
 ('stock', 0.9273928999900818),
 ('prices', 0.9264268279075623),
 ('many', 0.9257183074951172),
 ('offer', 0.92472904920578),
 ('hold', 0.9210673570632935),
 ('government', 0.9208395481109619)]

In [8]:
# Nearest neighbours of 'fraud' in the skip-gram embedding.
skip_gram_model.wv.most_similar('fraud')

[('forms', 0.9980872273445129),
 ('ks', 0.9980408549308777),
 ('providing', 0.9980121850967407),
 ('format', 0.9979974627494812),
 ('regulators', 0.997897744178772),
 ('gathering', 0.9978830814361572),
 ('communications', 0.9978770613670349),
 ('calendar', 0.9978488683700562),
 ('nfa', 0.9978326559066772),
 ('doesn', 0.9978095889091492)]

In [9]:
# Nearest neighbours of 'mutual' in the skip-gram embedding.
skip_gram_model.wv.most_similar('mutual')

[('costs', 0.9296042919158936),
 ('exchange', 0.9281546473503113),
 ('other', 0.9243658185005188),
 ('these', 0.9242442846298218),
 ('associated', 0.9214975237846375),
 ('are', 0.9205516576766968),
 ('marketing', 0.9147732257843018),
 ('etfs', 0.9144077897071838),
 ('transaction', 0.9140551686286926),
 ('cover', 0.9133368134498596)]

In [10]:
# Raw embedding vector learned for "bonds" (subscripting wv looks up the word's vector).
vect1 = skip_gram_model.wv["bonds"]
vect1

array([ 0.03186354,  0.32417858,  0.14346322, -0.15774702,  0.09189676,
       -0.47482914, -0.17094539,  0.6206782 , -0.20372252, -0.25861514,
        0.02837114, -0.15952699,  0.0524365 ,  0.11811804,  0.08754283,
       -0.1337938 , -0.25736433, -0.08980535, -0.21476537, -0.30813828,
        0.06541857,  0.06429306,  0.00417759, -0.27984196, -0.06468371,
        0.22596422, -0.06331495, -0.19226734, -0.09995241, -0.04522065,
        0.19653024, -0.05441574, -0.04164902, -0.2122644 , -0.0101136 ,
        0.32226843, -0.18886194, -0.05141347, -0.05913561, -0.4526042 ,
        0.11383466, -0.2763586 , -0.11798654, -0.11933243,  0.31945884,
       -0.05703748, -0.22343343, -0.05948214,  0.27238002,  0.15054794,
        0.13620704, -0.09449068, -0.21174619, -0.05979099,  0.0951125 ,
        0.23590812,  0.01026949, -0.20144197,  0.19672489, -0.11833731,
       -0.07434704,  0.04828395,  0.1223577 ,  0.02831285, -0.26652297,
        0.3052809 ,  0.22848807,  0.20004109, -0.4478334 ,  0.45

In [11]:
# Raw embedding vector learned for "money" (subscripting wv looks up the word's vector).
vect2 = skip_gram_model.wv["money"]
vect2

array([-4.78620045e-02,  3.58871520e-01, -4.38098535e-02, -1.64416388e-01,
        2.66807992e-03, -4.34183747e-01, -2.19288631e-06,  6.10115588e-01,
       -1.92931235e-01, -2.80438960e-01,  2.03854982e-02, -1.10006459e-01,
       -5.94976023e-02,  1.18451245e-01,  1.88981280e-01, -9.04520974e-03,
        2.91454978e-02, -1.66115195e-01, -2.24505931e-01, -3.83823991e-01,
       -4.94315177e-02,  6.48309384e-03,  8.59050378e-02, -3.12959641e-01,
       -4.24083360e-02,  1.85719520e-01, -1.01716600e-01, -2.09054202e-01,
       -6.13547340e-02,  2.05489760e-03,  1.94964126e-01, -1.05095461e-01,
        3.70531790e-02, -3.11945409e-01, -1.14812352e-01,  3.24904233e-01,
       -1.94945857e-01, -2.07500886e-02, -5.87579096e-03, -3.74134809e-01,
        1.50425091e-01, -3.37718546e-01, -6.83552995e-02, -1.68188155e-01,
        3.40573192e-01, -4.92074974e-02, -4.81893234e-02, -8.20087641e-02,
        3.39681432e-02,  1.42702147e-01,  2.15705261e-01, -7.68500492e-02,
       -6.58555180e-02, -

In [12]:
# Cosine similarity between the "bonds" vector (vect1) and the "money" vector (vect2).
# The second argument is a collection of vectors, so the result is a one-element array.
skip_gram_model.wv.cosine_similarities(vect1, [vect2])

array([0.87678164], dtype=float32)