In [1]:
import numpy as np
import pandas as pd
import gensim
import os

In [4]:
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

story = []

# Get the set of stopwords for efficiency
stop_words = set(stopwords.words('english'))

# Loop through each file in the 'data' directory
for filename in os.listdir('data'):
    # Open the file safely using 'with' to avoid file not being closed
    with open(os.path.join('data', filename), 'r', encoding='utf-8', errors='ignore') as f:
        corpus = f.read()

        # Tokenize the corpus into sentences
        raw_sent = sent_tokenize(corpus)

        # Process each sentence
        for sentence in raw_sent:
            # Tokenize the sentence into words
            word_tokens = word_tokenize(sentence)

            # Remove stopwords and add valid words to 'story'
            filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

            # You can directly append the filtered words list (no need for simple_preprocess here)
            if filtered_words:  # Only append if the sentence is not empty
                # Using simple_preprocess (optional, if you want lowercasing and further cleaning)
                processed_sentence = simple_preprocess(" ".join(filtered_words))
                if processed_sentence:  # Check if the sentence is not empty
                    story.append(processed_sentence)

# Now 'story' is a list of lists of words, without stopwords, ready for Word2Vec
print(story)  # Optionally print to verify the structure




In [6]:
story

[['game',
  'thrones',
  'book',
  'one',
  'song',
  'ice',
  'fire',
  'george',
  'martin',
  'prologue',
  'start',
  'back',
  'gared',
  'urged',
  'woods',
  'began',
  'grow',
  'dark',
  'around'],
 ['wildlings', 'dead'],
 ['dead', 'frighten'],
 ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile'],
 ['gared', 'rise', 'bait'],
 ['old', 'man', 'past', 'fifty', 'seen', 'lordlings', 'come', 'go'],
 ['dead', 'dead', 'said'],
 ['business', 'dead'],
 ['dead'],
 ['royce', 'asked', 'softly'],
 ['proof'],
 ['saw', 'gared', 'said'],
 ['says', 'dead', 'proof', 'enough'],
 ['known', 'would', 'drag', 'quarrel', 'sooner', 'later'],
 ['wished', 'later', 'rather', 'sooner'],
 ['mother', 'told', 'dead', 'men', 'sing', 'songs', 'put'],
 ['wet', 'nurse', 'said', 'thing', 'royce', 'replied'],
 ['never', 'believe', 'anything', 'hear', 'woman', 'tit'],
 ['things', 'learned', 'even', 'dead'],
 ['voice', 'echoed', 'loud', 'twilit', 'forest'],
 ['page', 'long', 'ride', 'us', 'gared', 'pointed'],
 ['eig

In [7]:
import gensim
from gensim.models import Word2Vec

# Correct initialization of Word2Vec model
m = Word2Vec(
    window=10,        # Window size for context words (10 words on each side)
    min_count=2,      # Ignore words that appear less than 2 times in the corpus
    vector_size=100,  # Dimensionality of the word vectors (e.g., 100-dimensional vectors)
    workers=4         # Number of CPU cores to use for training (optional)
)


# Train the Word2Vec model on the corpus
m.build_vocab(story)  # Build the vocabulary
m.train(story, total_examples=m.corpus_count, epochs=m.epochs)  # Train the model


(4440437, 4619615)

In [8]:
m.build_vocab(story)

In [9]:
m.wv.most_similar('throne')   ## when you give any realatble word

[('skin', 0.9485853314399719),
 ('thanked', 0.9407196640968323),
 ('sucking', 0.9402318596839905),
 ('boat', 0.9402036666870117),
 ('flavored', 0.9380468130111694),
 ('amber', 0.9373883008956909),
 ('kitchen', 0.9359745383262634),
 ('merciless', 0.9356104135513306),
 ('accuse', 0.9356040358543396),
 ('stiff', 0.935005784034729)]

In [10]:
### find odd one out

m.wv.doesnt_match(['john','rikon','robb','aarya'])

'john'

In [11]:
m.wv['jon']   ## vector reprenstaion

array([-0.5567684 ,  0.16579589,  0.07520319, -0.13735324, -0.3156549 ,
       -0.9480747 ,  1.0715759 ,  0.9558852 ,  0.19217788,  0.24516393,
        0.5155905 , -0.23075126, -0.38378268,  0.6103956 ,  0.06877378,
       -0.01645072,  0.10498358, -0.5398939 ,  0.44852123, -0.0364571 ,
        0.32609677,  0.1654416 , -0.38084203, -0.1642316 ,  0.2097974 ,
       -0.3005626 ,  0.35797268, -0.14565231, -0.51627   , -0.09736499,
        0.81656784,  0.37007758,  0.09954173,  0.31808528,  0.0044703 ,
       -0.01458387,  0.65102386, -0.4242819 ,  0.07234151, -0.73872423,
        0.35410944,  0.1014679 , -0.6228326 , -0.3319826 ,  0.61381906,
        0.08766419,  0.58582073,  0.603176  ,  0.40297356,  0.47079557,
       -0.07601492, -0.24300502, -0.66222703, -0.23645847, -0.7502047 ,
        0.06339245,  0.44816473, -0.37198645, -0.08391126,  0.34422442,
        0.30386457,  0.6064954 ,  0.00185368, -0.08070336, -0.78026384,
       -0.19857702,  0.08984335, -0.34524652, -0.2850274 ,  0.71

In [12]:
m.wv['jon'].shape

(100,)

In [13]:
## suppose we want norme vectors

m.wv.get_normed_vectors().shape

(17804, 100)

In [14]:
m.wv.get_normed_vectors()

array([[ 2.5651580e-02,  1.3526461e-01, -3.3575822e-02, ...,
        -9.5995374e-02, -1.3310578e-01,  1.8721402e-02],
       [-2.2506630e-02,  1.8406911e-01, -2.7557904e-02, ...,
        -1.1192807e-01,  6.8302259e-02, -7.5124153e-03],
       [ 8.1323199e-02,  6.2865980e-02, -9.3273006e-02, ...,
        -4.5515075e-02,  2.1884483e-01, -1.3213400e-02],
       ...,
       [-1.5862133e-02,  1.5285291e-01,  6.3399144e-05, ...,
        -9.8559603e-02,  2.3409456e-02, -1.1634984e-01],
       [-7.1557119e-02, -2.1706657e-02,  7.8435056e-02, ...,
        -4.1437611e-02,  1.4185910e-01, -9.8717861e-02],
       [-5.3469732e-02,  9.0603344e-02,  2.6755126e-02, ...,
        -1.2731022e-01,  7.5220764e-02, -1.3431697e-02]], dtype=float32)

In [15]:
y = m.wv.index_to_key  ###

In [16]:
y

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'men',
 'back',
 'king',
 'well',
 'like',
 'jon',
 'old',
 'hand',
 'even',
 'never',
 'tyrion',
 'see',
 'know',
 'made',
 'father',
 'eyes',
 'black',
 'told',
 'thought',
 'lady',
 'time',
 'long',
 'might',
 'us',
 'come',
 'still',
 'face',
 'head',
 'red',
 'way',
 'page',
 'must',
 'boy',
 'good',
 'two',
 'little',
 'brother',
 'took',
 'came',
 'though',
 'say',
 'night',
 'three',
 'away',
 'queen',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'white',
 'day',
 'first',
 'jaime',
 'look',
 'want',
 'much',
 'enough',
 'tell',
 'sword',
 'great',
 'looked',
 'bran',
 'll',
 'girl',
 'left',
 'knew',
 'asked',
 'gave',
 'called',
 'wall',
 'every',
 'heard',
 'maester',
 'yet',
 'went',
 'let',
 'sansa',
 'turned',
 'need',
 'behind',
 'dany',
 'around',
 'another',
 'beneath',
 'across',
 'snow',
 'keep',
 'gods',
 'found',
 'knight',
 'woman',
 'gold',
 'last',
 'grace',
 'castle

In [17]:
from sklearn.decomposition import PCA

In [18]:
pca = PCA(n_components=3)

In [19]:
X=pca.fit_transform(m.wv.get_normed_vectors())

In [20]:
X.shape

(17804, 3)

In [99]:
# %pip install plotly

Collecting plotly
  Downloading plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ---------------------------------------- 0.2/19.1 MB 3.3 MB/s eta 0:00:06
   - -------------------------------------- 0.5/19.1 MB 5.1 MB/s eta 0:00:04
   - -------------------------------------- 0.6/19.1 MB 3.9 MB/s eta 0:00:05
   - -------------------------------------- 0.7/19.1 MB 3.6 MB/s eta 0:00:06
   - -------------------------------------- 0.9/19.1 MB 3.4 MB/s eta 0:00:06
   -- ------------------------------------- 1.0/19.1 MB 3.2 MB/s eta 0:00:06
   -- ------------------------------------- 1.1/19.1 MB 3.2 MB/s eta 0:00:06
   -- ------------------------------------- 1.2/19.1 MB 3.2 MB/s eta 0:00:06
   --


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# %pip install nbformat>=4.2.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])  ## to see the 100 words into visualize the
# fig.show()

In [22]:
fig.show()