#### Question 4
---

In [None]:
from collections import defaultdict

import numpy as np


def get_entropy_y(labels):
    """Return the base-2 Shannon entropy of the values in ``labels``.

    Parameters
    ----------
    labels : sequence of hashable
        Observed class labels; ``len(labels)`` is used as the denominator
        when turning per-label tallies into probabilities.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the distinct labels. An empty sequence
        yields ``0``.
    """
    total = len(labels)

    # Tally how many times each distinct label occurs.
    tallies = defaultdict(int)
    for item in labels:
        tallies[item] = tallies[item] + 1

    # Accumulate p * log2(p) in first-seen label order.
    accumulated = 0
    for tally in tallies.values():
        prob = tally / total
        if prob > 0:
            accumulated += prob * np.log2(prob)

    return -accumulated


def get_entropy_x(subsets):
    """Return the size-weighted average entropy of several label groups.

    Parameters
    ----------
    subsets : sequence of sequences
        Each inner sequence holds the labels that fell into one group.

    Returns
    -------
    number
        Sum over groups of ``get_entropy_y(group) * len(group) / total``,
        where ``total`` is the combined length of all groups.
    """
    total = sum(map(len, subsets))

    weighted_entropy = 0
    for group in subsets:
        # Weight each group's entropy by its share of all observations.
        weighted_entropy += get_entropy_y(group) * len(group) / total

    return weighted_entropy


# Categorical feature columns; every list has one entry per observation.
X = {
    "Humidity": [
        "Dry", "Dry",
        "Medium", "Medium", "Medium", "Medium",
        "Wet", "Wet", "Wet",
    ],
    "Temperature": [
        "Cold", "Cold",
        "Average", "Average",
        "Hot", "Hot",
        "Cold", "Average", "Hot",
    ],
    "Wind": [
        "High", "Low", "High", "Low", "High", "Low", "Low", "Low", "High",
    ],
}
# Target label for each observation, aligned by position with the lists in X.
Y = ["Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "Yes"]

# Entropy of the labels before splitting on any feature.
entropy_y = get_entropy_y(Y)

# For every feature: entropy of Y minus the weighted entropy after grouping
# the labels by that feature's values.
info_gains = {}
for feature in X:
    values = X[feature]
    subsets = defaultdict(list)
    for label, value in zip(Y, values):
        subsets[value].append(label)
    entropy_x = get_entropy_x([*subsets.values()])
    info_gains[feature] = entropy_y - entropy_x

# Cell output: the per-feature differences added together.
sum(info_gains.values())

#### Question 5
---

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def lemmatise_corpus(corpu):
    """Lower-case, tokenise and lemmatise a piece of text.

    The text is lower-cased, split with ``word_tokenize``, each token is
    passed through the module-level ``lemmatiser`` (which must be defined
    before this function is called), and the results are joined back
    together with single spaces.

    Parameters
    ----------
    corpu : str
        Text to normalise.

    Returns
    -------
    str
        Space-separated lemmatised tokens.
    """
    lemmas = []
    for token in word_tokenize(corpu.lower()):
        lemmas.append(lemmatiser.lemmatize(token))

    return " ".join(lemmas)


# The four texts to compare.
corpora = [
    "Upon the road I met seven wives",
    "Every wife had seven sacks",
    "Every sack had seven cats",
    "Every cat had seven kits",
]

# lemmatise_corpus reads this module-level object, so create it first.
lemmatiser = WordNetLemmatizer()
lemmatised_corpora = list(map(lemmatise_corpus, corpora))

# The custom token pattern also keeps one-character tokens.
vectoriser = CountVectorizer(token_pattern=r"\b\w+\b")
bow = vectoriser.fit_transform(lemmatised_corpora)

# Pairwise cosine similarities between the count vectors.
X = cosine_similarity(bow)

# k=1 selects entries strictly above the diagonal, i.e. each distinct pair once.
indices = np.triu_indices(len(X), k=1)
np.max(X[indices])

#### Question 6
---

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def lemmatise_corpus(corpu):
    """Return ``corpu`` lower-cased, tokenised and lemmatised.

    Tokens come from ``word_tokenize`` applied to the lower-cased text; each
    one is lemmatised with the module-level ``lemmatiser`` (which must exist
    when this function is called) and the lemmas are joined with single
    spaces.

    Parameters
    ----------
    corpu : str
        Text to normalise.

    Returns
    -------
    str
        Space-separated lemmatised tokens.
    """
    tokens = word_tokenize(corpu.lower())
    lemmas = map(lemmatiser.lemmatize, tokens)

    return " ".join(lemmas)


# The four texts to compare.
corpora = [
    "Upon the road I met seven wives",
    "Every wife had seven sacks",
    "Every sack had seven cats",
    "Every cat had seven kits",
]

# lemmatise_corpus reads this module-level object, so create it first.
lemmatiser = WordNetLemmatizer()
lemmatised_corpora = list(map(lemmatise_corpus, corpora))

# Term counts as a labelled table: one row per text, one column per term.
vectoriser = CountVectorizer(token_pattern=r"\b\w+\b")
bow = pd.DataFrame(
    vectoriser.fit_transform(lemmatised_corpora).toarray(),
    columns=vectoriser.get_feature_names_out(),
)

# Term frequency: each count divided by the total count of its row.
row_totals = bow.sum(axis=1)
tf = bow.div(row_totals, axis=0)

# Inverse document frequency: log10(number of texts / texts containing the term).
document_frequency = (bow > 0).sum()
idf = np.log10(len(corpora) / document_frequency)

# Column-wise product of tf with the per-term idf weights.
tfidf = tf.mul(idf, axis="columns")

X = cosine_similarity(tfidf)
# Cell output: similarity between the texts at positions 1 and 2.
X[1][2]

#### Question 7
---

In [None]:
import numpy as np


# 3x4 matrix multiplied with X_1.
W_x = np.array(
    [
        [0.08, 0.64, 0.77, 0.38],
        [0.10, 0.57, 0.29, 0.48],
        [0.23, 0.19, 0.15, 0.29],
    ]
)
# 3x3 matrix multiplied with H_1.
W_h = np.array(
    [
        [0.21, 0.22, 0.23],
        [0.31, 0.44, 0.26],
        [0.41, 0.54, 0.36],
    ]
)
# Additive term, one value per output component.
B = np.array([0.41, 0.12, 0.63])
H_1 = np.array([0.3, 0.2, 0.1])
X_1 = np.array([1, 0, 0, 0])

# tanh(W_x . X_1 + W_h . H_1 + B); the cell shows the component at index 2.
pre_activation = W_x.dot(X_1) + W_h.dot(H_1) + B
np.tanh(pre_activation)[2]

#### Question 8
---

In [None]:
import numpy as np


def softmax(z):
    """Return the softmax of ``z``, normalised along the first axis.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    Parameters
    ----------
    z : array_like
        Scores. A 1-D input is normalised as a whole; for a 2-D input each
        column is normalised independently.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``z`` whose entries along axis 0 are
        non-negative and sum to 1. An empty input yields an empty array.
    """
    scores = np.asarray(z)

    # np.max raises on zero-size input, so pass the empty case straight through.
    if scores.size == 0:
        return np.exp(scores)

    # Largest shifted score is 0, so exp() never exceeds 1.
    shifted = scores - np.max(scores, axis=0)
    exponentials = np.exp(shifted)

    return exponentials / np.sum(exponentials, axis=0)


# 4x3 matrix multiplied with H.
W_y = np.array(
    [
        [0.24, 0.27, 0.35],
        [0.15, 0.18, 0.21],
        [0.25, 0.26, 0.64],
        [0.86, 0.13, 0.41],
    ]
)
H = np.array([0.45, 0.56, 0.84])

# softmax(W_y . H); the cell shows the component at index 2.
logits = W_y.dot(H)
softmax(logits)[2]

#### Question 15
---

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# The four lines of text to compare.
corpora = [
    "The sun has long been set.",
    "The stars are out by twos and threes.",
    "The little birds are piping yet.",
    "Among the bushes and trees.",
]

# TF-IDF vectors using the vectoriser's default settings.
vectoriser = TfidfVectorizer()
tfidf = vectoriser.fit_transform(corpora)

# Pairwise cosine similarities; the cell shows the pair at positions 1 and 3.
X = cosine_similarity(tfidf)
X[1][3]

#### Question 16
---

In [None]:
import gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Load word vectors from a binary word2vec-format file in the working directory.
model = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True,
)

# Look up the vector for each word of interest.
rocket, plane, ship, banana = (
    model[word] for word in ("rocket", "plane", "ship", "banana")
)

# cosine_similarity expects 2-D inputs, so each vector is wrapped in a list;
# the single resulting entry sits at position [0, 0].
A = cosine_similarity([rocket], [plane])[0, 0]
B = cosine_similarity([ship], [banana])[0, 0]

# Cell output: similarity(rocket, plane) minus similarity(ship, banana).
A - B