In [None]:
import subprocess
import sys

def install_and_import(package_name, import_name=None):
    """Make sure a package is importable, installing it with pip if it is missing.

    Parameters
    ----------
    package_name : str
        Name handed to ``pip install``.
    import_name : str, optional
        Module name to import when it differs from the pip name
        (for example ``scikit-learn`` is imported as ``sklearn``).
        Defaults to ``package_name``.

    Returns
    -------
    None
        Progress and failures are reported with ``print``; installation or
        import errors after the install attempt are caught, not raised.
    """
    import importlib

    module_name = import_name or package_name
    try:
        __import__(module_name)
        print(f"{package_name} is already installed.")
        return
    except ImportError:
        print(f"{package_name} is not installed. Installing it now...")

    try:
        # Install into the interpreter that runs this kernel, not whatever "pip" is on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"{package_name} has been successfully installed.")
        # The import machinery caches directory listings; without this the
        # module that pip has just written may not be found in this process.
        importlib.invalidate_caches()
        __import__(module_name)
    except Exception as e:
        print(f"An error occurred during installation of {package_name}: {e}")

# Check for nltk and gensim and install whichever is missing (the spacy call is disabled)
install_and_import("nltk")
# install_and_import("spacy")
install_and_import("gensim")

In [None]:
# Downloaded corpora (at the beginning there probably won't be many of them)
import os
import nltk
nltk.download('brown')
from nltk.corpus import brown as cb

In [None]:
# Number of items returned by words() for the Brown corpus
len(cb.words())

In [None]:
# First 20 words of the corpus, joined into one comma-separated line
print(", ".join(cb.words()[:20]))

In [None]:
# First 10 words, displayed with the cell's default repr
cb.words()[:10]

In [None]:
# Second tagged sentence (index 1) from the 'news' category
cb.tagged_sents(categories='news')[1]

In [None]:
nltk.download('inaugural')
from nltk.corpus import inaugural
from matplotlib.pyplot import figure

figure(figsize=(12, 6), dpi=80)

# Collect one (target, file-id prefix) pair for every word that starts with a
# target string; the first four characters of the file id are presumably the year.
targets = ['america', 'citizen']
pairs = []
for fileid in inaugural.fileids():
    prefix = fileid[:4]
    for w in inaugural.words(fileid):
        lowered = w.lower()
        for target in targets:
            if lowered.startswith(target):
                pairs.append((target, prefix))

cfd = nltk.ConditionalFreqDist(pairs)
cfd.plot()

# TF-IDF

In [None]:
import nltk
# Fetch the 'punkt' and 'punkt_tab' resources through the NLTK downloader
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Sample document 1 for the TF-IDF demo: a passage featuring a witcher and his silver sword
text1 = ('''
The witcher halted at a distance of ten paces.  His sword, slowly drawn from its black enameled sheath, glistened and glowed above his head.
“It’s silver,” he said.  “This blade is silver.”
The pale little face did not flinch; the anthracite eyes did not change expression.
“You’re so like a rusalka, “the witcher continued calmly, “that you could deceive anyone.  All the more as you’re a rare bird, black-haired one.  But horses are never mistaken.  They recognize creatures like you instinctively and perfectly.  What are you?  I think you’re a moola, or an alpor.  An ordinary vampire couldn’t come out in the sun.”
The corners of the pale lips quivered and turned up a little.
''')

In [None]:
# Sample document 2 for the TF-IDF demo: a biographical passage about Tesla
text2 = ('''
Born and raised in the Austrian Empire, Joe Tesla studied engineering and physics in the 1870s without receiving a 
degree, gaining practical experience in the early 1880s working in telephony and at Continental Edison in the 
new electric power industry. In 1884 he emigrated to the United States, where he became a naturalized citizen. 
He worked for a short time at the Edison Machine Works in New York City before he struck out on his own. 
With the help of partners to finance and market his ideas, Nicola Tesla set up laboratories and companies in 
New York to develop a range of electrical and mechanical devices. His alternating current (AC) induction 
motor and related polyphase AC patents, licensed by Westinghouse Electric in 1888, earned him a considerable 
amount of money and became the cornerstone of the polyphase system which that company eventually marketed.
''')

In [None]:
# Sample document 3 for the TF-IDF demo: a second witcher passage, sharing vocabulary with text1
text3 = ('''
The huge black eyes narrowed.
“Where is he, black-haired one?  You were singing, so you’ve drunk some blood.  You’ve taken the ultimate measure, which means you haven’t managed to enslave his mind.  Am I right?”
The black-tressed head nodded slightly, almost imperceptibility, and the corners of the mouth turned up even more.  The tiny little face took on an eerie expression.
“No doubt you consider yourself the lady of this castle now?”
A nod, this time clearer.
“Are you a moola?”
A slow shake of the head.  The hiss which reverberated through his bones could only have come from the pale, ghastly, smiling lips, although the witcher didn’t see them move.
“Alpor?”
Denial.
The witcher backed away and clasped the hilt of his sword tighter.  “That means you’re-”
The corners of the lips started to turn up higher and higher, the lips flew open…
“A bruxa!” The witcher shouted, throwing himself towards the fountain.
From behind the pale lips glistened white, spiky fangs.  The vampire jumped up, arched her back like a leopard and screamed.
''')

In [None]:
import math
import re
from collections import Counter

# Strip punctuation so that later whitespace splitting yields bare words
def clean_text(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, '', text)

# Function to calculate term frequency (TF) for one document
def tf(word, text):
    """Return the L2-normalised term frequency of ``word`` in ``text``.

    The text is cleaned with ``clean_text`` and split on whitespace. The raw
    count of ``word`` is divided by the Euclidean norm of all word counts in
    the document (a simpler alternative would divide by the total number of
    words instead).

    Parameters
    ----------
    word : str
        Term to look up; matched exactly against the split tokens.
    text : str
        The document.

    Returns
    -------
    float or int
        The normalised frequency, or ``0`` when the document has no words.
    """
    word_counts = Counter(clean_text(text).split())
    # math.sqrt instead of np.sqrt: numpy is only imported in a later cell,
    # so np is undefined when this cell runs on a fresh kernel.
    l2_norm = math.sqrt(sum(count ** 2 for count in word_counts.values()))
    return word_counts[word] / l2_norm if l2_norm else 0

# Document frequency: in how many texts does the word appear as a whole token?
def n_containing(word, texts):
    """Count the texts whose cleaned, whitespace-split tokens include ``word``."""
    hits = 0
    for document in texts:
        if word in clean_text(document).split():
            hits += 1
    return hits

# Smoothed inverse document frequency (IDF) of a word over the whole collection
def idf(word, texts):
    """Return ``log((N + 1) / (n + 1)) + 1``.

    ``N`` is the number of texts and ``n`` the number of texts containing
    ``word``. The added ones keep the value finite for unseen words; the
    unsmoothed alternative would be ``log(N / n)``.
    """
    doc_freq = n_containing(word, texts)
    smoothed_ratio = (len(texts) + 1) / (doc_freq + 1)
    return math.log(smoothed_ratio) + 1

# TF-IDF score of a word for one text within a collection
def tfidf(word, text, texts):
    """Multiply the term frequency of ``word`` in ``text`` by its IDF over ``texts``."""
    term_frequency = tf(word, text)
    inverse_doc_frequency = idf(word, texts)
    return term_frequency * inverse_doc_frequency

# Function to print the TF-IDF table
def print_tfidf_table(sample_words, texts):
    """Print TF, IDF and TF-IDF for every sample word in every text.

    Parameters
    ----------
    sample_words : iterable of str
        Words to report on; one group of rows is printed per word.
    texts : sequence of str
        The documents; one row per (word, text) pair, numbered from 1.

    The scores come from ``tf`` and ``idf``; TF-IDF is their product. Each
    word's group of rows is followed by a blank line.
    """
    print(f"{'Word':<15}{'Text':<10}{'TF':<10}{'IDF':<10}{'TF-IDF':<10}")
    print("-" * 55)

    for word in sample_words:
        # IDF depends only on the word and the collection, so compute it once
        # per word rather than once (or twice) per text.
        idf_score = idf(word, texts)
        for idx, text in enumerate(texts):
            tf_score = tf(word, text)
            # Same value tfidf() would return, without recomputing TF and IDF.
            tfidf_score = tf_score * idf_score
            print(f"{word:<15}Text {idx+1:<7}{tf_score:<10.4f}{idf_score:<10.4f}{tfidf_score:<10.4f}")

        # Blank line separating one word's rows (one per text) from the next
        print("")


# Lower-case the three sample documents so word matching is case-insensitive
corpus = [document.lower() for document in (text1, text2, text3)]
sample_words = ['sword', 'witcher', 'tesla', 'in', 'vampire', 'the']

# Show the hand-computed TF / IDF / TF-IDF values
print_tfidf_table(sample_words, corpus)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Raw term counts per document, plus the word -> column index mapping
count_vect = CountVectorizer()
text_counts = count_vect.fit_transform(corpus)
vocabulary = count_vect.vocabulary_

# Full TF-IDF with the transformer's default settings; the fitted
# transformer also exposes the per-term IDF weights
tfidf_transformer = TfidfTransformer()
text_tfidf = tfidf_transformer.fit_transform(text_counts)
idf_values = tfidf_transformer.idf_

# Same transformer with IDF switched off, which leaves only the TF part
tf_transformer = TfidfTransformer(use_idf=False)
text_tf = tf_transformer.fit_transform(text_counts)
def print_tfidf_table_sklearn(sample_words, count_vect, text_tf, idf_values, text_tfidf):
    """Print TF, IDF and TF-IDF for every sample word in every text from precomputed matrices.

    Parameters
    ----------
    sample_words : iterable of str
        Words to report on.
    count_vect : object
        Fitted vectorizer exposing ``vocabulary_`` (word -> column index).
    text_tf : matrix
        One row per text, one column per vocabulary entry; must support
        column slicing and ``.toarray()``.
    idf_values : sequence
        IDF weight per column index.
    text_tfidf : matrix
        TF-IDF values, same layout as ``text_tf``.

    Words missing from the vocabulary are printed with zeros in every column.
    """
    print(f"{'Word':<15}{'Text':<10}{'TF':<10}{'IDF':<10}{'TF-IDF':<10}")
    print("-" * 55)

    # Take the mapping from the argument rather than from a notebook-level global
    vocabulary = count_vect.vocabulary_
    # Number of texts = number of matrix rows (also used for unknown words)
    n_texts = text_tf.shape[0]

    for word in sample_words:
        if word in vocabulary:
            word_index = vocabulary[word]
            # Column of the word across all texts, as flat arrays
            tf_values_for_word = text_tf[:, word_index].toarray().flatten()
            tfidf_values_for_word = text_tfidf[:, word_index].toarray().flatten()
            idf_value = idf_values[word_index]
        else:
            # Unknown word: report zeros using the same row layout
            tf_values_for_word = [0.0] * n_texts
            tfidf_values_for_word = [0.0] * n_texts
            idf_value = 0.0

        for idx in range(n_texts):
            print(f"{word:<15}Text {idx+1:<7}{tf_values_for_word[idx]:<10.4f}{idf_value:<10.4f}{tfidf_values_for_word[idx]:<10.4f}")
        print()  # Add an empty row after each word


# Same sample words as for the hand-computed table, so the two outputs can be compared
sample_words = ['sword', 'witcher', 'tesla', 'in', 'vampire', 'the']
print_tfidf_table_sklearn(sample_words, count_vect, text_tf, idf_values, text_tfidf)

# Embeddings

In [None]:
import nltk
# Fetch the Brown corpus, which the Word2Vec training cell reads via nltk.corpus.brown
nltk.download('brown')

In [None]:
from gensim.test.utils import common_texts
from nltk.corpus import brown    
sentences = brown.sents()
from gensim.models import Word2Vec

# Train a Word2Vec model on the Brown corpus: embedding length 100, window size 5,
# keep a word even if it occurs only once, and train for 10 epochs.
# Save the model at the end.
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, epochs=10)
model.save("word2vec.model")

In [None]:
# What does the embedding of an example word look like?
model.wv['computer']

In [None]:
# Shape of a single embedding vector
model.wv['computer'].shape

In [None]:
# What are the 10 nearest vectors to the word 'wine'?
model.wv.most_similar('wine', topn=10)

In [None]:
# Fetch the webtext corpus used to continue training the model
nltk.download('webtext')


In [None]:
# Fetch the 'punkt' resource; presumably needed to split webtext into sentences (TODO confirm)
nltk.download('punkt')

In [None]:
from nltk.corpus import webtext
sentences_web = webtext.sents()

# Load the saved model and continue training it on the webtext corpus for 4 epochs.
model2 = Word2Vec.load("word2vec.model")
# Register the new corpus first: this adds words that only occur in webtext
# to the vocabulary and updates model2.corpus_count to the new sentence count.
model2.build_vocab(sentences_web, update=True)
# total_examples must be the real number of sentences (it was hard-coded to 1),
# because it is used for progress and learning-rate scheduling.
model2.train(sentences_web, total_examples=model2.corpus_count, epochs=4)

In [None]:
# What are the 10 nearest vectors to the word 'wine' now?
model2.wv.most_similar('wine', topn=10)

In [None]:
# What happens when we ask about a word that is not in the vocabulary?
# The lookup raises KeyError; catch and show it so "Run All" continues past this cell.
try:
    print(model2.wv.most_similar('witcher', topn=5))
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
# Find the nearest vectors for the "embedding algebra": king - man + woman
model2.wv.most_similar(model2.wv['king'] - model2.wv['man'] + model2.wv['woman'], topn=10)

In [None]:
import gensim.downloader
# Let's see the names of all embedding models available through gensim
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
import gensim
# Load the pretrained 'word2vec-google-news-300' vectors.
# NOTE(review): gensim.downloader is only available here because an earlier cell ran
# `import gensim.downloader`; a plain `import gensim` may not load that submodule.
w2v_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Embedding algebra on the pretrained vectors: father - man + woman
w2v_vectors.most_similar(w2v_vectors['father'] - w2v_vectors['man'] + w2v_vectors['woman'], topn=10)

In [None]:
# 10 nearest neighbours of 'wine' in the pretrained vectors
w2v_vectors.most_similar('wine', topn=10)

In [None]:
# What are the 10 nearest vectors to the word 'dog'?
w2v_vectors.most_similar('dog', topn=10)

In [None]:
# Find the nearest vectors for the "embedding algebra": death - man + computer
w2v_vectors.most_similar(w2v_vectors['death'] - w2v_vectors['man'] + w2v_vectors['computer'], topn=10)

In [None]:
# Shape of a single pretrained embedding vector
w2v_vectors['death'].shape

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition  import PCA
import numpy as np

def to_2d(embeddings):
    """Fit a 2-component whitened PCA on ``embeddings`` and return their 2-D projection."""
    # PCA is used here purely to get plottable 2-D coordinates
    reducer = PCA(n_components=2, whiten=True)
    return reducer.fit(embeddings).transform(embeddings)


def annotated_scatter(points, names, color='blue'):
    """Scatter 2-D ``points`` on the current axes and label each one.

    ``points`` is indexed as ``points[:, 0]`` / ``points[:, 1]`` for x / y,
    ``names`` supplies one label per point, and ``color`` is passed to
    ``plt.scatter`` as ``c``. The axis limits are set to the data range
    widened by 0.5 on every side.
    """
    xs, ys = points[:, 0], points[:, 1]
    margin = .5

    plt.scatter(xs, ys, c=color)
    for label, px, py in zip(names, xs, ys):
        plt.annotate(label, xy=(px, py), xytext=(0, 0), textcoords='offset points')

    plt.xlim(xs.min() - margin, xs.max() + margin)
    plt.ylim(ys.min() - margin, ys.max() + margin)

    
def plot_embeddings(embeddings, names, color='blue', show=True):
    """Look up the vectors for ``names``, project them to 2-D and draw a labelled scatter.

    ``embeddings`` is anything indexable by the entries of ``names``.
    With ``show=False`` the figure is left open for further drawing.
    """
    vectors = np.array([embeddings[name] for name in names])
    annotated_scatter(to_2d(vectors), names, color)
    plt.grid()

    if not show:
        return
    plt.show()

In [None]:
# Keep only the first element (the word) of every result entry;
# the query is the word's own vector rather than the word itself.
near_dog = [key for key, *_ in w2v_vectors.most_similar(w2v_vectors['dog'])]
near_computer = [key for key, *_ in w2v_vectors.most_similar(w2v_vectors['computer'])]
len(near_computer)

In [None]:
# Plot the neighbours of 'dog' (red) and of 'computer' (green) in one 2-D projection
plot_embeddings(w2v_vectors, 
                near_dog + near_computer, 
                color=['red'] * len(near_dog) + ['green'] * len(near_computer))

In [None]:
from collections import namedtuple
import matplotlib.lines as mlines
from matplotlib import cm

# A pair of words plus the name of the relation that links them
LinearSubs = namedtuple('LinearSubs', ['word_pair', 'name'])

def plot_linear_substructures(linear_subs, embeddings):
    """Plot word pairs in a shared 2-D projection and join each pair with a dashed line.

    Parameters
    ----------
    linear_subs : sequence of LinearSubs
        Each entry holds a ``word_pair`` (two words) and a ``name`` that is
        written near the midpoint of the line joining the pair.
    embeddings : mapping-like
        Indexable by word, returning that word's vector.

    All words are projected together with ``to_2d`` and drawn on the current
    axes with ``annotated_scatter``.
    """
    pair_names = [word for ls in linear_subs for word in ls.word_pair]
    ls_names = [ls.name for ls in linear_subs]
    embeddings_matrix = np.array([embeddings[word] for word in pair_names])
    embeddings_2d = to_2d(embeddings_matrix)

    # The palette is finite, so cycle through it: slicing it would give fewer
    # colours than points once the input outgrows the palette.
    palette = cm.Set1.colors
    point_colors = [palette[k % len(palette)] for k in range(len(embeddings_2d))]
    annotated_scatter(embeddings_2d, pair_names, point_colors)

    # Rows 2*k and 2*k + 1 of the projection belong to the k-th word pair
    for pair_idx, relation_name in enumerate(ls_names):
        p1 = embeddings_2d[2 * pair_idx]
        p2 = embeddings_2d[2 * pair_idx + 1]
        # Midpoint of the pair, nudged by .04 so the label does not sit on the line
        center = [(p1[axis] + p2[axis]) / 2 + .04 for axis in range(2)]

        plt.plot(*zip(p1, p2), '--')
        plt.annotate(relation_name,
                     xy=center,
                     xytext=(0, 0), textcoords='offset points')

In [None]:
# Three side-by-side panels of word-pair relations in the pretrained word2vec vectors
plt.figure(figsize=(20, 5))

# Panel 1: pairs labelled 'sex'
plt.subplot(131)
plot_linear_substructures([LinearSubs(('man', 'woman'), 'sex'),
                           LinearSubs(('king', 'queen'), 'sex'),
                           LinearSubs(('mother', 'father'), 'sex')], w2v_vectors)

# Panel 2: pairs labelled 'family'
plt.subplot(132)
plot_linear_substructures([LinearSubs(('cat', 'feline'), 'family'),
                           LinearSubs(('dog', 'canine'), 'family'),
                           LinearSubs(('parrot', 'bird'), 'family')], w2v_vectors)

# Panel 3: pairs labelled 'product'
plt.subplot(133)
plot_linear_substructures([LinearSubs(('samsung', 'mobile'), 'product'),
                           LinearSubs(('sony', 'tv'), 'product'),
                           LinearSubs(('ikea', 'furniture'), 'product')], w2v_vectors)

In [None]:
import os
import zipfile
import urllib.request

# Define file paths
zip_file = 'glove.6B.zip'
extracted_folder = 'glove.6B'

if os.path.exists(extracted_folder):
    # The archive is only needed to create this folder, so there is no reason
    # to download it again once the extracted files are present.
    print(f"{extracted_folder} already exists. Skipping unzip.")
else:
    if not os.path.exists(zip_file):
        print(f"File not found! Downloading {zip_file}...")
        url = 'http://nlp.stanford.edu/data/glove.6B.zip'
        # Download under a temporary name and rename on success, so an
        # interrupted download is never mistaken for a complete archive.
        partial_zip = zip_file + '.part'
        urllib.request.urlretrieve(url, partial_zip)
        os.replace(partial_zip, zip_file)
    else:
        print(f"{zip_file} already exists. Skipping download.")

    print(f"Unzipping {zip_file}...")
    # Same idea for extraction: the final folder only appears once every file is unpacked.
    partial_folder = extracted_folder + '.part'
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(partial_folder)
    os.replace(partial_folder, extracted_folder)

In [None]:
#Glove 6B

# !curl -OL http://nlp.stanford.edu/data/glove.6B.zip -o glove.6B.zip
#wget
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# !unzip -o glove.6B.zip
# !unzip -o /content/glove.6B.zip

In [None]:
# Parse the GloVe text file: the first field of every line is the token,
# the remaining fields are the components of its vector.
glove_embeddings = {}
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open('glove.6B/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.split()  # split once per line instead of twice
        glove_embeddings[fields[0]] = np.array(fields[1:]).astype('float')

In [None]:
# Vector stored for the token 'computer'
glove_embeddings['computer']

In [None]:
def get_closest(x, embeddings, topn=3):
    """
    Get the words whose embeddings are closest to ``x`` by Euclidean distance
    Parameters
    ----------
    x: np.ndarray
      Vector containing an embedding
    embeddings: dict
      Maps each word to its embedding vector
    topn: int, optional
      Number of closest words to return
    Returns
    -------
    list
      The ``topn`` words closest to the given x, ordered from nearest to
      farthest; an empty list when ``embeddings`` is empty
    """
    # Materialise the keys once; building this list per returned index would
    # repeat a full pass over the vocabulary every time.
    words = list(embeddings)
    if not words:
        return []
    # Stack all embeddings in a single matrix. Note: the matrix dimension will be
    # V x D where V is the vocabulary size and D is the embedding dimension
    embedding_matrix = np.array(list(embeddings.values()))
    # Using broadcasting, compute the magnitude of the difference to each embedding
    distances = np.linalg.norm(x - embedding_matrix, axis=1)
    # Sort the distances and keep the smallest topn
    min_idx = np.argsort(distances)[:topn]
    return [words[i] for i in min_idx]

In [None]:
# The same three panels of word-pair relations, this time using the GloVe embeddings
plt.figure(figsize=(20, 5))

# Panel 1: pairs labelled 'sex'
plt.subplot(131)
plot_linear_substructures([LinearSubs(('man', 'woman'), 'sex'),
                           LinearSubs(('king', 'queen'), 'sex'),
                           LinearSubs(('mother', 'father'), 'sex')], glove_embeddings)

# Panel 2: pairs labelled 'family'
plt.subplot(132)
plot_linear_substructures([LinearSubs(('cat', 'feline'), 'family'),
                           LinearSubs(('dog', 'canine'), 'family'),
                           LinearSubs(('parrot', 'bird'), 'family')], glove_embeddings)

# Panel 3: pairs labelled 'product'
plt.subplot(133)
plot_linear_substructures([LinearSubs(('samsung', 'mobile'), 'product'),
                           LinearSubs(('sony', 'tv'), 'product'),
                           LinearSubs(('ikea', 'furniture'), 'product')], glove_embeddings)

In [None]:
# Embedding algebra with GloVe: rome - italy + france
get_closest(glove_embeddings['rome'] - glove_embeddings['italy'] + glove_embeddings['france'], glove_embeddings)

In [None]:
# Embedding algebra with GloVe: king - man + woman (singular 'man', not 'men')
get_closest(glove_embeddings['king'] - glove_embeddings['man'] + glove_embeddings['woman'], glove_embeddings)