In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import re

# Fetch the NLTK data packages this script relies on (same packages, same order)
for _resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Module-level helpers shared by preprocess_text:
# the lemmatizer instance and the English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to scrape text from the given URL
def get_webpage_text(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The text of every ``<p>`` element on the page, joined with single
        spaces. An empty string if the page contains no ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server responds with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page
    # as if it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

# Function to preprocess the text
def preprocess_text(text):
    """Clean raw text and turn it into lemmatized, stopword-free sentences.

    URLs and HTML tags are removed from the whole text, which is then split
    into sentences. Each sentence is stripped of everything except ASCII
    letters and whitespace, lower-cased, tokenized into words, filtered
    against ``stop_words`` and lemmatized with ``lemmatizer``.

    Args:
        text: Raw text to process.

    Returns:
        A list of sentences, where each sentence is a non-empty list of
        word strings. Sentences left with no words after cleaning are
        dropped.
    """
    # Remove URLs ('http\S+' already covers 'https...')
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    processed_text = []
    # Split into sentences BEFORE stripping punctuation and lower-casing:
    # once the punctuation is gone the sentence tokenizer has no boundaries
    # left to find and the whole document collapses into a single sentence.
    for sentence in sent_tokenize(text):
        # Remove special characters and numbers, then convert to lowercase
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence).lower()
        # Remove stopwords and apply lemmatization
        words = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(sentence)
            if word not in stop_words
        ]
        if words:
            processed_text.append(words)

    return processed_text

# Main function to scrape, preprocess, and apply Word2Vec
def main(url='https://awealthofcommonsense.com/2024/11/talk-your-book-22/'):
    """Scrape a page, preprocess its text, train Word2Vec and print embeddings.

    Args:
        url: Page to scrape. Defaults to the article originally hard-coded
            in this script.

    Returns:
        None. For every distinct word in the preprocessed text, prints
        either its embedding vector or a notice that the word is not in
        the model's vocabulary.
    """
    text = get_webpage_text(url)
    processed_text = preprocess_text(text)

    # Nothing to train on (e.g. the page had no usable paragraph text).
    if not any(processed_text):
        print("No text could be extracted; nothing to train on.")
        return

    # Train Word2Vec model
    # NOTE(review): with min_count=5, gensim is expected to raise if no word
    # occurs at least five times — confirm for very short pages.
    model = Word2Vec(sentences=processed_text, vector_size=100, window=5, min_count=5, workers=4)

    # Report each distinct word once, in first-seen order; looping over every
    # occurrence would print the same embedding many times over.
    unique_words = dict.fromkeys(
        word for sentence in processed_text for word in sentence
    )
    for word in unique_words:
        if word in model.wv:
            print(f"Embedding for '{word}':\n{model.wv[word]}\n")
        else:
            print(f"Word '{word}' not in vocabulary.")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Embedding for 'endorsement':
[ 0.00340627  0.00217024 -0.00326713 -0.00758559  0.0022877   0.00370525
 -0.00612865  0.00689644  0.00254764  0.00305889 -0.00544951 -0.00555737
 -0.00081731  0.00738076  0.00727336 -0.00370333 -0.00437976 -0.00435288
 -0.0081797  -0.00841112 -0.00377279 -0.0088766  -0.0084918   0.00238533
  0.00478885 -0.00030785  0.00021033 -0.00621029  0.0021334  -0.00259597
 -0.00076077 -0.00090063  0.0045201  -0.00586366 -0.00749633 -0.0038851
  0.00315197 -0.00889125  0.00925137 -0.00817418 -0.00291812 -0.01089629
 -0.00282126 -0.00271917 -0.00138161 -0.00700629  0.00325149 -0.00017562
  0.00089851  0.00938415 -0.00540538 -0.00110893 -0.00713619 -0.00197664
  0.00714496  0.0084576  -0.00531879  0.00118099  0.00434154  0.00149702
  0.00833152 -0.00490193 -0.0069731   0.00699888  0.00372654  0.01060232
  0.00255832 -0.00774932 -0.00038165 -0.00197214 -0.01055746  0.0073728
 -0.00862012 -0.00668825 -0.0001