# Libraries

In [1]:
!pip install gensim 




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\Alime\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import random
import string

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
import spacy.cli
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA




### Load the Preprocessed Dataset

In [2]:
# Read the preprocessed dataset from disk.
df = pd.read_csv('dataset.csv')

# Collect the non-missing entries of the 'text' column as a plain Python list.
comments = df.loc[df['text'].notna(), 'text'].to_list()

In [3]:
# Preview the first ten comments. Slicing keeps this safe when the dataset
# holds fewer than ten entries, and the f-string copes with non-string values.
for i, comment in enumerate(comments[:10]):
    print(f"{i}- {comment}")

0- is this for real
1- agree
2- that means you’ve only ever invested in a bull market it’s a great accomplishment if you’ve made a portfolio that performs to your expectations i want to give you words of caution from someone who has seen similar stories such as yours a former colleague had a mill account and blew it down to k in a year that was only slightly bearish
3- improved now i use leverage
4- terrible idea you have nothing to gain from this but everything to lose just because hes your uncle doesnt mean you have to do it repeat after m no
5- to not risk getting robbed right suredo not touch this with a million mile pole stay away do not compromise your accounts let him launder his own money
6- has nothing to do with my comment on gold my point was i do not trust the behavior of gold efts as they act like a hybrid not as a precious metal there’s a lot of people who don’t seem to see the market reactions of gold efts many of them absolutely have their own supply and demand curve ar

### Tokenization

In [4]:
# Tokenization: gensim's simple_preprocess lowercases each comment and splits
# it into word tokens; with its default settings, tokens shorter than 2 or
# longer than 15 characters are discarded.
tokenized_texts = [simple_preprocess(comment) for comment in comments]



In [5]:
print(tokenized_texts[1589])
print(tokenized_texts[12])
print(tokenized_texts[29])

['need', 'all', 'the', 'luck', 'can', 'get', 'at', 'this', 'point']
['my', 'concern', 'is', 'it', 'trump', 'if', 'there', 'anything', 'he', 'known', 'for', 'it', 'manipulating', 'the', 'markets', 'with', 'news', 'already', 'lost', 'back', 'in', 'his', 'first', 'presidency', 'that', 'had', 'in', 'what', 'thought', 'was', 'relatively', 'safe', 'etf', 'xiv', 'he', 'got', 'it', 'delisted', 'and', 'bankrupted', 'to', 'off', 'one', 'tweet', 'about', 'north', 'korea', 'all', 'it', 'takes', 'is', 'tweet', 'not', 'even', 'action', 'or', 'signed', 'deal', 'he', 'says', 'some', 'shit', 'like', 'our', 'big', 'beautiful', 'military', 'is', 'coming', 'to', 'save', 'europe', 'talking', 'out', 'of', 'his', 'ass', 'and', 'it', 'crashes', 'lol', 'idk', 'if', 've', 'got', 'the', 'risk', 'tolerance', 'for', 'those', 'games']
['user', 'report', 'total', 'submissions', 'first', 'seen', 'in', 'wsb', 'month', 'agototal', 'comments', 'previous', 'best', 'dd', 'account', 'age', 'month', 'join', 'wsb', 'discord'

In [61]:
# Train a Word2Vec model on the tokenized comments.
# Hyperparameters are gathered in one place so they are easy to review and tune.
w2v_params = dict(
    vector_size=50,  # dimensionality of each word vector
    window=5,        # maximum distance between a target word and a context word
    min_count=1,     # keep every token, including those that occur only once
    workers=8,       # number of training threads
    sg=1,            # 1 selects skip-gram (0 would select CBOW)
    negative=5,      # number of negative samples drawn per positive example
    epochs=10,       # number of passes over the corpus
)
model = Word2Vec(sentences=tokenized_texts, **w2v_params)

In [63]:
# Vocabulary size: the number of distinct tokens that received a vector.
len(model.wv.index_to_key)

143760

In [64]:
# Inspect the raw embedding learned for a single word.
term = 'car'
print(model.wv.get_vector(term))

[ 0.43365663  0.4709199   0.02852446  0.03542501 -0.1902581  -0.09683113
  0.59167254  0.5031434  -0.39708167 -0.42880923  0.07364506 -0.40253478
  0.60316795 -0.10081554  0.04983263  0.3273224   0.3869971  -0.27011186
 -0.7747347   0.55998135 -0.20124014  0.29493454  0.13443454  0.15121493
  0.33801368  0.34947535 -0.02961941 -0.14751141 -0.32238227 -0.2836017
 -0.16971336 -0.06070832 -0.4122097  -0.99319315 -0.8656033   0.23580389
  0.6589589  -0.06076656 -0.04486569 -0.5152422   1.103717   -0.54633486
 -0.5244833   0.0189059   0.23897165 -0.01285927 -1.033242   -1.1203344
  0.09064746 -0.05477126]


In [65]:
# The ten words whose vectors are closest to that of "btc".
model.wv.most_similar(positive=["btc"], topn=10)

[('bitcoin', 0.926262378692627),
 ('bitcoins', 0.8772911429405212),
 ('mstr', 0.8655718564987183),
 ('eth', 0.8588991761207581),
 ('microstrategy', 0.8301677107810974),
 ('tether', 0.8174738883972168),
 ('dxy', 0.8006730079650879),
 ('xrp', 0.7745649814605713),
 ('mstrs', 0.7740401029586792),
 ('crypto', 0.7719651460647583)]

## 3D INTERACTIVE PLOT

In [104]:




# Step 1: Sample up to 20 words from the vocabulary.
# A dedicated, seeded generator makes the sample (and therefore the plot)
# identical on every run without touching the global `random` state.
words = list(model.wv.index_to_key)
sample_rng = random.Random(42)
sampled_words = sample_rng.sample(words, min(20, len(words)))

# Step 2: Get vectors and reduce them to three dimensions with PCA
vectors = np.array([model.wv[word] for word in sampled_words])
pca = PCA(n_components=3)
reduced_vectors = pca.fit_transform(vectors)

# Step 3: Plot using Plotly
fig = px.scatter_3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    text=sampled_words,  # Word labels
    labels={'x': 'PCA-1', 'y': 'PCA-2', 'z': 'PCA-3'},
    title="3D PCA Plot of Word Embeddings",
    width=1000,
    height=800
)

fig.update_traces(marker=dict(size=5, line=dict(width=4)), textfont_size=10)
fig.show()



## INTERACTIVE 3D PLOT OF WORDS FROM ONE CLUSTER

In [105]:


# Words to plot.
# NOTE(review): 'people' is not a crypto term -- presumably included as a
# contrast point against the crypto cluster; confirm.
crypto_words = [
    "bitcoin", "ethereum", "crypto", "blockchain", "altcoin", "wallet", "token",
    "nft", "btc", "eth", "smartcontract", 'people'
]

# Keep only the words the model has a vector for
available_words = [w for w in crypto_words if w in model.wv.key_to_index]
vectors = np.array([model.wv[w] for w in available_words])

# Project the embeddings to three dimensions with PCA
pca = PCA(n_components=3)
reduced_vectors = pca.fit_transform(vectors)

# Collect the coordinates in a DataFrame for Plotly. A dedicated name is used
# so the raw dataset held in `df` is not overwritten by this cell.
crypto_pca_df = pd.DataFrame(reduced_vectors, columns=["x", "y", "z"])
crypto_pca_df["word"] = available_words
print(crypto_pca_df)

# 3D scatter plot with Plotly; all three axes share the same fixed range
fig = px.scatter_3d(
    crypto_pca_df,
    x="x", y="y", z="z",
    range_x=(-5, 5),
    range_y=(-5, 5),
    range_z=(-5, 5),
    text="word",
    title="3D PCA of Crypto-Related Word Embeddings",
    color_discrete_sequence=["blue"],
    width=1000,
    height=800
)
fig.update_traces(marker=dict(size=5, line=dict(width=4)), textfont_size=10)
fig.show()


           x         y         z        word
0  -0.668550 -0.223874 -0.262965     bitcoin
1  -0.419764  0.083407 -0.284386    ethereum
2  -0.220211 -0.150626 -0.250087      crypto
3  -0.235305  0.880406 -1.784512  blockchain
4   0.348352 -0.418733  0.729718     altcoin
5   0.490557  1.636047  1.092922      wallet
6   1.086566  0.647754  0.185831       token
7   0.176890 -0.244682 -0.183912         nft
8  -1.033133 -0.576076  0.384494         btc
9  -1.425807 -0.376242  0.613702         eth
10  1.900404 -1.257380 -0.240805      people


In [106]:


# Step 1: Get the top 10 most similar words to "trump", "oil" and "gas"
trump_similar = model.wv.most_similar('trump', topn=10)
oil_similar = model.wv.most_similar('oil', topn=10)
gas_similar = model.wv.most_similar('gas', topn=10)

# Step 2: Extract the words (dropping the similarity scores)
trump_words = [word for word, _ in trump_similar]
oil_words = [word for word, _ in oil_similar]
gas_words = [word for word, _ in gas_similar]

# Include the three seed words themselves in the plot, each ahead of its neighbours
all_words = ['trump'] + trump_words + ['oil'] + oil_words + ['gas'] + gas_words
vectors = np.array([model.wv[word] for word in all_words])

# Step 3: Reduce dimensions using PCA
pca = PCA(n_components=3)
reduced_vectors = pca.fit_transform(vectors)

# Step 4: Assign one colour label per word (seed word + its neighbours).
# Lengths are derived from the lists so the labels always line up with all_words.
colors = (
    ['red'] * (len(trump_words) + 1)
    + ['blue'] * (len(oil_words) + 1)
    + ['green'] * (len(gas_words) + 1)
)

# Step 5: Plot with Plotly.
# Plotly Express treats the strings passed to `color` as category labels and
# would otherwise draw them with its default palette; the explicit map makes
# each group appear in the colour the title promises.
fig = px.scatter_3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    text=all_words,
    color=colors,
    color_discrete_map={'red': 'red', 'blue': 'blue', 'green': 'green'},
    labels={'x': 'PCA-1', 'y': 'PCA-2', 'z': 'PCA-3'},
    title="Top 10 Similar Words to 'trump' (red) and 'oil' (blue) and 'gas' (green)",
    width=1000,
    height=800
)

fig.update_traces(marker=dict(size=5, line=dict(width=1)), textposition='top center')
fig.show()
