In [None]:
import spacy

# Load the medium model with vectors
# The model package has to be installed before this cell runs
# (e.g. `python -m spacy download en_core_web_md`).
nlp = spacy.load('en_core_web_md')

In [None]:
# Example text
text = "Apple and orange are fruits. London and Paris are cities."
# Run the loaded pipeline over the example text; `doc` is what the token loop
# in the next cell iterates over.
doc = nlp(text)
# Bare last expression so the notebook displays the parsed text.
doc

In [None]:
# Inspect every token of `doc`: its text, whether it has a vector, the norm of
# that vector, and a short preview of the vector itself. Only the first few
# components are printed, because dumping each full embedding for every token
# floods the cell output without adding information.
VECTOR_PREVIEW_SIZE = 5
for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.vector[:VECTOR_PREVIEW_SIZE])

In [None]:
# Compare two texts by similarity. Despite the variable names, each value is
# the result of running the whole pipeline on a string (the same call that
# produced `doc` above), and "barak obama" consists of two words.
# NOTE(review): "barak" looks like a misspelling of "Barack" — confirm it is intentional.
token1 = nlp("barak obama")
token2 = nlp("king")
print("Similarity:", token1.similarity(token2))

In [None]:
fruits = ["apple", "orange", "banana", "grape", "strawberry"]

# Run the pipeline once per word instead of twice per printed pair; the nested
# loop below then only performs the similarity computations.
fruit_docs = {fruit: nlp(fruit) for fruit in fruits}

# Print every ordered pair of distinct fruits, so each pair shows up in both
# directions (a - b and b - a).
for fruit1 in fruits:
    for fruit2 in fruits:
        if fruit1 != fruit2:
            print(f"{fruit1} - {fruit2}: {fruit_docs[fruit1].similarity(fruit_docs[fruit2])}")

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Get vectors and labels
vectors = [nlp(fruit).vector for fruit in fruits]
# Project the word vectors onto their first three principal components.
pca = PCA(n_components=3)
vectors_transformed = pca.fit_transform(vectors)

# Plotting: components 1 and 2 give the position, component 3 gives the colour.
fig, ax = plt.subplots()
points = ax.scatter(vectors_transformed[:, 0], vectors_transformed[:, 1], c=vectors_transformed[:, 2])
# Without a colorbar the colour encoding of the third component cannot be read.
fig.colorbar(points, ax=ax, label="PC 3")
for i, word in enumerate(fruits):
    ax.annotate(word, (vectors_transformed[i, 0], vectors_transformed[i, 1]))
# Title and axis labels so the figure can be understood on its own.
ax.set(title="Fruit word vectors (PCA)", xlabel="PC 1", ylabel="PC 2")
plt.show()

# Now let's try with some cities
cities = ["London", "Paris", "Berlin", "Madrid", "Rome"]
vectors = [nlp(city).vector for city in cities]
pca = PCA(n_components=2)
vectors_transformed = pca.fit_transform(vectors)

# Pull out the two coordinate columns once and reuse them for both the markers
# and the text labels.
xs, ys = vectors_transformed[:, 0], vectors_transformed[:, 1]
plt.scatter(xs, ys)
for word, x, y in zip(cities, xs, ys):
    plt.annotate(word, (x, y))
plt.show()



In [None]:
# Word lists that the rest of this cell compares against each other.
# NOTE(review): "software" is not a profession on its own — possibly a truncated
# "software developer"; confirm before reading anything into its result.
professions = ["doctor", "engineer", "artist", "teacher", "chef", "farmer", "software", "scientist", "lawyer", "nurse", "police officer", "firefighter", "soldier"]
dog_breeds = ["beagle", "labrador", "bulldog", "poodle", "husky", "rottweiler", "boxer", "doberman", "dalmatian", "chihuahua", "golden retriever", "pug", "Pit Bull"]

def find_closest_breed(profession, breeds):
    """Pick the breed whose text is most similar to the given profession.

    Args:
        profession: Text passed to ``nlp`` to build the reference object.
        breeds: Iterable of breed names to score against the profession.

    Returns:
        A ``(breed, similarity)`` tuple for the highest-scoring breed. The
        first breed wins on ties, and ``(None, -1)`` is returned when no breed
        scores above -1 (for example when ``breeds`` is empty).
    """
    reference = nlp(profession)
    best = (None, -1)
    for candidate in breeds:
        score = reference.similarity(nlp(candidate))
        # Strict comparison keeps the earliest candidate when scores are equal.
        if score > best[1]:
            best = (candidate, score)
    return best

# Report, for every profession, the breed chosen by find_closest_breed together
# with the similarity score it returned.
for profession in professions:
    closest_breed, max_similarity = find_closest_breed(profession, dog_breeds)
    print(f"The closest dog breed to {profession} is {closest_breed} and the similarity is {max_similarity}.")




In [None]:
# now names and professions
names = ["Tjarks", "Mossio", "Marlon", "Matsumoto"]
# Each profession appears exactly once, in the order it was first listed.
# Repeated entries can never change which profession wins below (the first of
# equal scores is kept), they only add redundant work and repeated items in
# later cells that read `professions`.
professions = [
    "doctor", "engineer", "artist", "teacher", "chef", "farmer", "software developer",
    "scientist", "lawyer", "nurse", "police officer", "firefighter", "soldier", "pilot",
    "singer", "actor", "writer", "dancer", "architect", "designer", "photographer",
    "journalist", "athlete", "musician", "veterinarian", "psychologist", "accountant",
    "economist", "entrepreneur", "manager", "consultant", "plumber", "electrician",
    "mechanic", "gardener", "hairdresser", "tailor", "librarian", "receptionist",
    "secretary", "waiter", "bartender", "cashier", "salesperson", "policeman", "fireman",
]

# Parse every profession once up front instead of once per (name, profession) pair.
profession_docs = {profession: nlp(profession) for profession in professions}

# For each name, keep the profession with the strictly highest similarity.
for name in names:
    name_token = nlp(name)
    max_similarity = -1
    closest_profession = None
    for profession, profession_token in profession_docs.items():
        similarity = name_token.similarity(profession_token)
        if similarity > max_similarity:
            max_similarity = similarity
            closest_profession = profession
    print(f"The closest profession to {name} is {closest_profession}.")

In [None]:
# now names and countries
names = ["Laura", "Giacomo", "Riku", "Tjarks", "Mossio", "Matsumoto"]
# Each country appears exactly once, in the order it was first listed; repeated
# entries cannot change the winner below and only cost extra pipeline runs.
countries = [
    "Germany", "Italy", "Brazil", "Japan", "France", "Spain", "USA", "Russia", "China",
    "India", "UK", "Canada", "Australia", "Mexico", "South Africa", "Argentina", "Nigeria",
]

# Parse every country once up front instead of once per (name, country) pair.
country_docs = {country: nlp(country) for country in countries}

# For each name, keep the country with the strictly highest similarity.
for name in names:
    name_token = nlp(name)
    max_similarity = -1
    closest_country = None
    for country, country_token in country_docs.items():
        similarity = name_token.similarity(country_token)
        if similarity > max_similarity:
            max_similarity = similarity
            closest_country = country
    print(f"The closest country to {name} is {closest_country} and similarity is {max_similarity}.")

In [None]:
# now let's look at similarities between countries and foods: for each country
# the cell reports the single most similar food (nothing is sorted here)
countries = ["France", "Spain", "Germany", "Italy", "Japan", "India", "Mexico", "USA", "UK", "Australia", "Brazil", "Canada", "China", "Russia", "South Africa"]
food = ["sushi", "pizza", "curry", "pasta", "sausages"]

def find_closest_food(country, food):
    """Return the food item whose text is most similar to the given country.

    Args:
        country: Text passed to ``nlp`` to build the reference object.
        food: Iterable of food names to score against the country.

    Returns:
        The highest-scoring food name (the first one on ties), or ``None``
        when no item scores above -1, e.g. when ``food`` is empty. The score
        itself is not returned.
    """
    reference = nlp(country)
    winner, top_score = None, -1
    for candidate in food:
        score = reference.similarity(nlp(candidate))
        # Strict comparison keeps the earliest candidate when scores are equal.
        if score > top_score:
            winner, top_score = candidate, score
    return winner

# For each country, look up its closest food. find_closest_food returns only
# the food name, so the similarity is computed again here for the printout.
for country in countries:
    closest_food = find_closest_food(country, food)
    print(f"The closest food to {country} is {closest_food} and the similarity is {nlp(country).similarity(nlp(closest_food))}.")

In [None]:

# 2D PCA projection of the profession and dog-breed word vectors.
# Drop repeated words (keeping first-seen order) so that no point is fitted or
# labelled more than once; repeated rows would also weigh more in the PCA fit.
labels = list(dict.fromkeys(professions + dog_breeds))
vectors = [nlp(label).vector for label in labels]
pca = PCA(n_components=2)
vectors_transformed = pca.fit_transform(vectors)
plt.scatter(vectors_transformed[:, 0], vectors_transformed[:, 1])
for i, word in enumerate(labels):
    plt.annotate(word, (vectors_transformed[i, 0], vectors_transformed[i, 1]))
plt.title("Professions and dog breeds (PCA, 2 components)")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()

In [None]:
# now let's plot the professions and dog breeds in a 3D space with labels
# Drop repeated words (keeping first-seen order) so that no point is fitted or
# labelled more than once; repeated rows would also weigh more in the PCA fit.
labels = list(dict.fromkeys(professions + dog_breeds))
vectors = [nlp(label).vector for label in labels]
pca = PCA(n_components=3)
vectors_transformed = pca.fit_transform(vectors)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(vectors_transformed[:, 0], vectors_transformed[:, 1], vectors_transformed[:, 2])
for i, word in enumerate(labels):
    ax.text(vectors_transformed[i, 0], vectors_transformed[i, 1], vectors_transformed[i, 2], word)
# Name the axes so the figure can be read on its own.
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
plt.show()

In [None]:
# now let's plot the cities in a 3D space with labels
vectors = [nlp(city).vector for city in cities]
pca = PCA(n_components=3)
vectors_transformed = pca.fit_transform(vectors)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Three components were requested, so the transposed result unpacks into
# exactly one coordinate array per axis.
xs, ys, zs = vectors_transformed.T
ax.scatter(xs, ys, zs)
for word, x, y, z in zip(cities, xs, ys, zs):
    ax.text(x, y, z, word)

In [None]:
# now similarities between cities ordered by similarity
# Parse every city once up front; the nested loop below would otherwise run
# the pipeline twice for every pair.
city_docs = {city: nlp(city) for city in cities}

# Collect every ordered pair of different cities (each pair appears in both
# directions) together with its similarity.
similarities = []
for city1 in cities:
    for city2 in cities:
        if city1 != city2:
            similarity = city_docs[city1].similarity(city_docs[city2])
            similarities.append((city1, city2, similarity))

# Most similar pairs first.
similarities.sort(key=lambda x: x[2], reverse=True)
for city1, city2, similarity in similarities:
    print(f"{city1} - {city2}: {similarity}")

In [None]:
# now similarities between professions ordered by similarity
# Parse every profession once up front. Building a dict also collapses repeated
# entries in `professions`, which would otherwise print the same pair several
# times and multiply the number of pipeline runs.
profession_docs = {profession: nlp(profession) for profession in professions}

# Collect every ordered pair of different professions (each pair appears in
# both directions) together with its similarity.
similarities = []
for profession1, doc1 in profession_docs.items():
    for profession2, doc2 in profession_docs.items():
        if profession1 != profession2:
            similarity = doc1.similarity(doc2)
            similarities.append((profession1, profession2, similarity))

# Most similar pairs first.
similarities.sort(key=lambda x: x[2], reverse=True)
for profession1, profession2, similarity in similarities:
    print(f"{profession1} - {profession2}: {similarity}")

In [None]:
# now similarities between countries ordered by similarity
countries = [
    "France", "Germany", "Spain", "Italy", "United Kingdom", "United States", "Canada",
    "Australia", "China", "Japan", "Brazil", "India", "Russia", "South Africa",
    "Switzerland", "Sweden", "Norway", "Finland", "Denmark", "Netherlands", "Belgium",
    "Austria", "Portugal", "Greece", "Turkey", "Egypt",
]

# now let's see which countries are the most similar to a given country
country = "Egypt"
# The reference country never changes inside the loop, so parse it only once.
country_doc = nlp(country)
similarities = []
for other_country in countries:
    if other_country != country:
        similarity = country_doc.similarity(nlp(other_country))
        similarities.append((country, other_country, similarity))

# Most similar countries first.
similarities.sort(key=lambda x: x[2], reverse=True)
for country1, country2, similarity in similarities:
    print(f"{country1} - {country2}: {similarity}")

In [None]:
# now the similarity between italian cities
# Note: this rebinds `cities`, so any later cell that reads `cities` sees this list.
cities = ["Rome", "Milan", "Naples", "Turin", "Palermo", "Genoa", "Bologna", "Florence", "Bari", "Catania", "Venice", "Bra"]

# Parse every city once up front; the nested loop below would otherwise run
# the pipeline twice for every pair.
city_docs = {city: nlp(city) for city in cities}

# Collect every ordered pair of different cities (each pair appears in both
# directions) together with its similarity.
similarities = []
for city1 in cities:
    for city2 in cities:
        if city1 != city2:
            similarity = city_docs[city1].similarity(city_docs[city2])
            similarities.append((city1, city2, similarity))

# Most similar pairs first.
similarities.sort(key=lambda x: x[2], reverse=True)
for city1, city2, similarity in similarities:
    print(f"{city1} - {city2}: {similarity}")


In [None]:
# now colors and professions
colors = ["red", "blue", "green", "yellow", "orange", "purple", "pink", "brown", "black", "white", "gray"]

# Parse every colour once up front instead of once per (profession, colour) pair.
color_docs = {color: nlp(color) for color in colors}

# For each profession, keep the colour with the strictly highest similarity
# (the first colour wins on ties).
for profession in professions:
    profession_token = nlp(profession)
    max_similarity = -1
    closest_color = None
    for color, color_token in color_docs.items():
        similarity = profession_token.similarity(color_token)
        if similarity > max_similarity:
            max_similarity = similarity
            closest_color = color
    print(f"The closest color to {profession} is {closest_color}.")

In [None]:
# now colors and countries
# Show which `countries` list is currently bound; the name is reassigned in
# several cells of this notebook, so the result depends on what ran last.
print(len(countries))
print(countries)

# Parse every colour once up front instead of once per (country, colour) pair.
color_docs = {color: nlp(color) for color in colors}

# For each country, keep the colour with the strictly highest similarity
# (the first colour wins on ties).
for country in countries:
    country_token = nlp(country)
    max_similarity = -1
    closest_color = None
    for color, color_token in color_docs.items():
        similarity = country_token.similarity(color_token)
        if similarity > max_similarity:
            max_similarity = similarity
            closest_color = color
    print(f"The closest color to {country} is {closest_color}.")

In [94]:
# now colors and cities
# `cities` is reassigned in more than one cell of this notebook, so the cities
# reported here are whichever list was bound most recently.

# Parse every colour once up front instead of once per (city, colour) pair.
color_docs = {color: nlp(color) for color in colors}

# For each city, keep the colour with the strictly highest similarity
# (the first colour wins on ties).
for city in cities:
    city_token = nlp(city)
    max_similarity = -1
    closest_color = None
    for color, color_token in color_docs.items():
        similarity = city_token.similarity(color_token)
        if similarity > max_similarity:
            max_similarity = similarity
            closest_color = color
    print(f"The closest color to {city} is {closest_color}.")

The closest color to London is black.
The closest color to Paris is yellow.
The closest color to Berlin is black.
The closest color to Madrid is yellow.
The closest color to Rome is white.
