In [1]:
# Import Libraries
import requests
import collections
import random
import math

In [None]:
# Download the Tiny Shakespeare corpus that the n-gram language model is built from.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# requests applies no timeout by default; an explicit one keeps this cell from
# hanging indefinitely if the server stalls.
response = requests.get(url, timeout=30)
response.raise_for_status() # Raise an exception for invalid HTTP status codes
text_data = response.text
# Cell output: corpus length in characters and a short preview.
len(text_data), text_data[:100]

In [None]:
# sample
# Fixed seed so the same excerpt is printed on every run.
random.seed(42)

# Pick a random start offset that leaves at least 1000 characters after it,
# then show the 100 characters starting there.
pos = random.randint(0, len(text_data) - 1000)
print(text_data[pos:pos+100])

In [None]:
# preprocessing - do not change
def preprocess_text(text_data):
  """
  Normalize raw text into a lowercase, space-separated token stream.

  Steps, in order:
  1. Pad ',', '.', '?' and '!' with spaces so they split off as separate tokens;
     replace ';' and ':' with a space.
  2. Replace '-' with a space.
  3. Delete apostrophes and double quotes.
  4. Replace each double space with a single space (one pass only, so longer
     runs of spaces are shortened but not fully collapsed).
  5. Collapse each pair of consecutive newlines to one, then turn every newline
     into the boundary markers ' </s> <s> '.
  6. Wrap the whole text in '<s> ' ... ' </s>'.
  7. Lowercase everything.

  Parameters:
  - text_data : str, the raw text
  Returns:
  - str, the processed text
  """
  text_data = text_data.replace(',',' , ').replace(';', ' ').replace(':', ' ').replace('.',' . ').replace('?',' ? ').replace('!',' ! ')
  text_data = text_data.replace('-', ' ')
  text_data = text_data.replace('\'', '').replace('"', '')
  text_data = text_data.replace('  ', ' ')
  text_data = text_data.replace('\n\n','\n').replace('\n',' </s> <s> ')
  text_data = '<s> ' + text_data + ' </s>'
  text_data = text_data.lower()
  return text_data

# Preprocess the raw download; from here on text_data holds the processed text.
text_data = preprocess_text(response.text)
# Number of pieces obtained by splitting on single spaces.
print(f"Number of words: {len(text_data.split(' '))}")

In [None]:
# Hold out the last 10,000 characters as test data; everything before is training data.
# NOTE(review): text_data is a string, so this split is by character, not by token —
# the boundary can fall in the middle of a word.
train_data = text_data[:-10_000]
test_data = text_data[-10_000:]
# Cell output: lengths (in characters) of the two splits.
len(train_data), len(test_data)

In [None]:
# Initial vocabulary: every distinct space-separated token in the training text.
vocab = set(train_data.split(' '))
print(f"Number of unique words: {len(vocab)}")
# Sets are unordered, so which 10 words are shown is arbitrary.
print(f"Sample unique words: {list(vocab)[:10]}")

In [7]:
def identify_oov_words(corpus, n=3):
    """
    Collect the rare words of a corpus: those occurring fewer than `n` times.

    Parameters:
    - corpus: str, text whose tokens are separated by single spaces.
    - n: The frequency threshold; a word whose count is below it is treated as OOV.

    Returns:
    - A set of out-of-vocabulary words.
    """
    frequencies = collections.Counter(corpus.split(' '))

    rare_words = set()
    for token, occurrences in frequencies.items():
        if occurrences < n:
            rare_words.add(token)
    return rare_words



In [None]:
# Find the rare training words (default threshold: fewer than 3 occurrences).
oov_words = identify_oov_words(train_data)

# Drop the rare words from the vocabulary and add one placeholder token that stands in for them.
vocab = vocab - oov_words
vocab.add('<UNK>')
print(f"Number of OOV words: {len(oov_words)}")
print(f"Expected number of OOV words: {7181}")

# Sanity check against the expected count.
assert len(oov_words) == 7181

In [9]:
# Rewrite both splits so that every token outside the vocabulary becomes '<UNK>'.
train_data = ' '.join(['<UNK>' if word not in vocab else word for word in train_data.split(' ')])
test_data = ' '.join(['<UNK>' if word not in vocab else word for word in test_data.split(' ')])

In [10]:
# Occurrence-count tables for 1- to 5-grams; a key that was never stored reads as 0.
uni_counts = collections.defaultdict(lambda:0)
bi_counts = collections.defaultdict(lambda:0)
tri_counts = collections.defaultdict(lambda:0)
four_counts = collections.defaultdict(lambda:0)
five_counts = collections.defaultdict(lambda:0)

In [11]:
# Tokenize the <UNK>-substituted training text on single spaces.
words = train_data.split(' ')

# Visit every position i and count the n-grams that start there. Unigrams are keyed by
# the word itself, longer n-grams by a tuple of words. Each guard ensures the n-gram
# still fits inside the token list.
for i in range(len(words)):
    uni_counts[words[i]] += 1
    if i < len(words) - 1:
        bi_counts[(words[i], words[i + 1])] += 1
    if i < len(words) - 2:
        tri_counts[(words[i], words[i + 1], words[i + 2])] += 1
    if i < len(words) - 3:
        four_counts[(words[i], words[i + 1], words[i + 2], words[i + 3])] += 1
    if i < len(words) - 4:
        five_counts[(words[i], words[i + 1], words[i + 2], words[i + 3], words[i + 4])] += 1


In [12]:
# Probability tables for 1- to 5-grams; a key that was never stored reads as 0.
uni = collections.defaultdict(lambda:0)
bi = collections.defaultdict(lambda:0)
tri = collections.defaultdict(lambda:0)
four = collections.defaultdict(lambda:0)
five = collections.defaultdict(lambda:0)

In [13]:
# Total number of training tokens: the denominator for unigram probabilities.
total_words = sum(uni_counts.values())

# Unigram probability: relative frequency of the word.
for word, count in uni_counts.items():
    uni[word] = count / total_words
# For n >= 2: P(last word | preceding words) = count(n-gram) / count(its first n-1 words).
for (w1, w2), count in bi_counts.items():
    bi[(w1, w2)] = count / uni_counts[w1]
for (w1, w2, w3), count in tri_counts.items():
    tri[(w1, w2, w3)] = count / bi_counts[(w1, w2)]

for (w1, w2, w3, w4), count in four_counts.items():
    four[(w1, w2, w3, w4)] = count / tri_counts[(w1, w2, w3)]

for (w1, w2, w3, w4, w5), count in five_counts.items():
    five[(w1, w2, w3, w4, w5)] = count / four_counts[(w1, w2, w3, w4)]


In [14]:
# Evaluation
# Spot checks of the estimated probabilities against expected values.
assert five[('<s>', 'against', 'the', 'roman', 'state')] == 1.0 # prob of last given prev 4
assert four[('remain', '</s>', '<s>', 'i')] == 0.25 # prob of last given prev 3
assert tri[('did', 'see', 'and')] == 0.5 # prob of last given prev 2
assert bi[('rash', 'like')] == 0.1 # prob of last given prev 1
# ('citizen') is a plain string, not a 1-tuple, which matches the string keys of `uni`.
assert round(uni[('citizen')],5) == 0.00031 # prob of last

In [15]:

def calculate_bigram_probability_with_smoothing(word1, word2):
    """
    Add-one (Laplace) smoothed estimate of P(word2 | word1).

    Reads the module-level `bi_counts`, `uni_counts` and `vocab` and returns
        (count(word1, word2) + 1) / (count(word1) + len(vocab))

    Parameters:
    - word1 : the conditioning (previous) word
    - word2 : the word being predicted
    Returns:
    - float
    """
    # The count tables are defaultdicts: indexing a missing key would insert it, so
    # scoring unseen words/bigrams would silently grow the training counts.
    # `.get` reads the count without mutating the tables.
    bigram_count = bi_counts.get((word1, word2), 0) + 1
    unigram_count = uni_counts.get(word1, 0) + len(vocab)
    return bigram_count / unigram_count

In [16]:
def compute_perplexity(data):
  """
    Perplexity of a text under the add-one smoothed bigram language model.

    The log-probabilities of all consecutive token pairs are summed, the sum is
    divided by the number of tokens, negated and exponentiated. The result is
    also printed.

    Parameters:
    - data : str, space-separated tokens (at least 5 of them)
    Returns:
    - float
  """
  assert len(data.split(' ')) >= 5

  tokens = data.split(' ')

  # Accumulate log P(current | previous) over every adjacent pair of tokens.
  total_log_prob = 0
  for previous, current in zip(tokens, tokens[1:]):
      pair_prob = calculate_bigram_probability_with_smoothing(previous, current)
      total_log_prob += math.log(pair_prob)

  # Average over the token count, then map back from log space.
  perplexity = math.exp(-total_log_prob / (len(tokens)))

  print(f"perplexity: {perplexity}")

  return perplexity


In [None]:
# The perplexity of the held-out text, rounded to the nearest integer, must match the expected value.
assert round(compute_perplexity(test_data)) == 129

In [50]:
# run this cell to import nltk
import nltk

In [None]:
# Fetch the NLTK resources imported below: the sample tweets corpus and the stopword lists.
nltk.download('twitter_samples')
nltk.download('stopwords')
# you only need to run it once


In [52]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [53]:
# select the set of positive and negative tweets
# (each call reads the tweets of one sentiment class from its own sample file)
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [54]:
# split the data into two pieces, one for training and one for testing (validation set)
# The first 4000 tweets of each class go to training; whatever remains goes to testing.
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

# Positives first, then negatives — the label arrays must be built in the same order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [55]:
# combine positive and negative labels
# Column vectors of shape (m, 1): ones for the positive tweets stacked on top of
# zeros for the negative tweets, matching the order of train_x / test_x.
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [56]:
def process_tweet(tweet):
  """
  Turn a raw tweet into a list of cleaned, stemmed tokens.

  Input:
      tweet: a string
  Output:
      a list of stemmed tokens; English stopwords and tokens found in
      string.punctuation are left out
  """
  porter = PorterStemmer()
  english_stopwords = set(stopwords.words('english'))
  tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

  # strip, in order: a leading retweet marker, links (through the end of the line),
  # and the '#' sign of hashtags
  cleaned = re.sub(r'^RT[\s]+', '', tweet)
  cleaned = re.sub(r'https?:\/\/.*[\r\n]*', '', cleaned)
  cleaned = re.sub(r'#', '', cleaned)

  # keep only informative tokens and reduce each one to its stem
  return [
      porter.stem(token)
      for token in tweet_tokenizer.tokenize(cleaned)
      if token not in english_stopwords and token not in string.punctuation
  ]

In [57]:
def build_freqs(tweets, ys):
    """
    Count how often each processed word occurs under each label.

    Input:
        tweets: a list of tweet strings
        ys: labels aligned with `tweets`; each entry must support `.item()`
    Output:
        freqs: a dictionary mapping (word, label) to the number of occurrences
    """
    counts = {}

    for text, y in zip(tweets, ys):
        tokens = process_tweet(text)

        # unwrap the label into a plain Python scalar so it can be part of a dict key
        sentiment = y.item()

        for token in tokens:
            key = (token, sentiment)
            counts[key] = counts.get(key, 0) + 1

    return counts

In [58]:

def sigmoid(z):
    '''
    Logistic function, applied element-wise.

    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z, evaluated after limiting z to the range [-100, 100]
    '''
    # bound the argument so np.exp is never evaluated at an extreme value
    bounded = np.clip(z, -100, 100)
    return 1 / (np.exp(-bounded) + 1)

In [59]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Batch gradient descent for logistic regression.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J_cost: list holding the cost of every iteration, each measured before
                that iteration's weight update (the last entry is the final cost)
        theta: your final weight vector
    '''
    n_samples = len(y)
    cost_history = []

    for _ in range(num_iters):
        # predicted probabilities, kept strictly inside (0, 1) so both logs stay finite
        h = np.clip(sigmoid(np.dot(x, theta)), 1e-10, (1 - 1e-10))

        # cross-entropy cost: one term for the positive labels, one for the negative labels
        positive_term = np.dot(y.T, np.log(h))
        negative_term = np.dot((1 - y).T, np.log(1 - h))
        J = (-1/n_samples) * (positive_term + negative_term)
        cost_history.append(J[0][0])

        # step against the gradient of the cost
        theta = theta - ((alpha/n_samples) * np.dot(x.T, (h - y)))

    return cost_history, theta



In [60]:
def extract_features(tweet, freqs):
    '''
    Build the feature vector of one tweet.

    Input:
        tweet: a string with the raw tweet (it is processed here)
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3): [bias, positive share, negative share]
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    tokens = process_tweet(tweet)

    # start from zeros and switch on the bias term
    features = np.zeros((1, 3))
    features[0, 0] = 1

    # add up how often the tweet's words were seen under each label
    pos_count = 0
    neg_count = 0
    for token in tokens:
        pos_count += freqs.get((token, 1.0), 0)
        neg_count += freqs.get((token, 0.0), 0)

    combined = pos_count + neg_count
    if not combined > 0:
        # none of the words is known: leave both shares at zero
        return features

    # express both counts as shares of their sum so that tweet length does not dominate
    features[0, 1] = pos_count / combined
    features[0, 2] = neg_count / combined
    return features

In [None]:
# Count (word, label) occurrences over the training tweets.
freqs = build_freqs(train_x, train_y)

m = len(train_x)

# Feature matrix with one 3-element row per training tweet, filled in by extract_features.
X_train = np.ones((m, 3))

for i in range(m):
    X_train[i, :] = extract_features(train_x[i], freqs)

Y_train = train_y

# Start from all-zero weights.
theta = np.zeros((3, 1))

# Hyperparameters: learning rate and number of gradient-descent iterations.
alpha = 0.1
num_iters = 1500

J_cost, theta_final = gradientDescent(X_train, Y_train, theta, alpha, num_iters)
# Keep the trained weights under the name `theta`, which the evaluation cell passes on.
theta = theta_final

# J_cost holds one cost per iteration; the last entry is the most recent one.
print(f"final cost J: {J_cost[-1]}")
print(f"final theta: \n{theta_final}")


In [62]:
def predict_tweet(tweet, freqs, theta):
    '''
    Score one tweet with the logistic regression model.

    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: the sigmoid of the tweet's features multiplied by theta
                (values above 0.5 are treated as positive by the evaluation code)
    '''
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

In [63]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Measure the classification accuracy of the model on a labelled set of tweets.

    Input:
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output:
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    total = len(test_x)
    hits = 0

    for idx, text in enumerate(test_x):
        # a score above 0.5 counts as a positive prediction, anything else as negative
        predicted_label = 1 if predict_tweet(text, freqs, theta) > 0.5 else 0

        if predicted_label == test_y[idx][0]:
            hits += 1

    return hits / total

In [None]:
# Evaluate the trained weights on the held-out tweets and report the accuracy to 4 decimals.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

In [None]:
!pip install gensim

In [36]:
import gensim.downloader
import numpy as np

In [None]:
# Load the pre-trained 'glove-wiki-gigaword-50' word vectors through gensim's downloader.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [None]:
# Inspect the embedding matrix: its container type, number of rows (words) and number of columns (dimensions).
print('data type:', type(glove_vectors.vectors))
print('# words:', glove_vectors.vectors.shape[0])
print('Embedding dimension:', glove_vectors.vectors.shape[1])

In [None]:
# gensim's built-in lookup of the 10 words most similar to 'cat'.
glove_vectors.most_similar('cat', topn=10)

In [None]:
# Word -> row index lookup.
glove_vectors.get_index('apple') # get the index of a word in the vocabulary

In [None]:
# Row index -> word lookup.
# NOTE(review): 3292 is presumably the index returned for 'apple' by get_index — confirm.
glove_vectors.index_to_key[3292] # index to word

In [None]:
# Word -> embedding vector lookup.
glove_vectors.get_vector('apple') # get the word vector of a word

In [43]:
def my_most_similar(glove_vectors, query_word, topn):
    """
    Rank the vocabulary by cosine similarity to a query word and return the best matches.

    Args:
        glove_vectors (Gensim KeyedVectors)
        query_word (str): The word for which to find the most similar words.
        topn (int): The number of most similar words to return.

    Returns:
        list of tuples:
            - Each tuple contains a word (str) and its corresponding cosine similarity score (float) to the query word.
            - The list is sorted in descending order of cosine similarity.
            - The query word itself is never part of the list.
    """
    # embedding matrix; row i belongs to the word glove_vectors.index_to_key[i]
    W = glove_vectors.vectors

    query_vec = glove_vectors.get_vector(query_word)
    query_norm = np.sum(query_vec ** 2) ** 0.5

    scored = []
    for row, candidate in enumerate(glove_vectors.index_to_key):
        candidate_vec = W[row]
        candidate_norm = np.sum(candidate_vec ** 2) ** 0.5

        # cosine similarity: (v1 . v2) / (||v1|| * ||v2||)
        score = np.dot(query_vec, candidate_vec) / (query_norm * candidate_norm)

        if candidate == query_word:
            # a word is trivially most similar to itself; leave it out of the ranking
            continue
        scored.append((candidate, score))

    # best matches first
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = scored[:topn]

    assert len(top_matches) == topn
    return top_matches

In [None]:
#Your code here

# Track the smallest and largest Euclidean norm over all word vectors.
# Start from +inf / -inf so the first vector always updates both extremes.
min_vector_norm = float('inf')
max_vector_norm = -float('inf')

for word in glove_vectors.index_to_key:
    word_vec = glove_vectors.get_vector(word)

    # Euclidean (L2) norm: square root of the sum of squared components.
    norm = np.sum(word_vec ** 2) ** 0.5

    if norm < min_vector_norm:
        min_vector_norm = norm
    if norm > max_vector_norm:
        max_vector_norm = norm

print(f'max_vector_norm: {max_vector_norm:.3f}, min_vector_norm: {min_vector_norm:.3f}')

In [45]:
def diff_results(oracle_list, my_list):
  """
  Check a similarity ranking against a reference ranking, position by position.

  Each entry is indexed as (word, score). Raises AssertionError at the first
  position where the words differ or the scores are not close (np.isclose).
  Only positions present in both lists are compared.
  """
  for expected, actual in zip(oracle_list, my_list):
    # same word at this rank?
    assert expected[0] == actual[0], "find the wrong word"
    # same similarity score, up to floating-point tolerance?
    assert np.isclose(expected[1], actual[1]), "wrong consine similarity"

# Compare my_most_similar against gensim's built-in most_similar for a few query words;
# diff_results raises AssertionError on the first disagreement.
for query in ['computer', 'frog', 'car']:
  oracle_list = glove_vectors.most_similar(query, topn=10)
  my_list = my_most_similar(glove_vectors, query, topn=10)
  diff_results(oracle_list, my_list)

In [46]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on all word vectors; Z holds the projected coordinates, one row per word.
pca = PCA(n_components=2)
Z = pca.fit_transform(glove_vectors.vectors)

In [None]:
import matplotlib.pyplot as plt
# Scatter-plot the 2-D PCA coordinates of a few related words and label each point.
plt.figure(figsize=(10,5))
for word in ['king', 'queen', 'lord', 'lady', 'prince', 'princess', 'men', 'women']:
  point = Z[glove_vectors.get_index(word)]
  plt.scatter(point[0], point[1], color='b')
  plt.annotate(word, (point[0], point[1]))
# Hiding the axis ticks does not depend on the word, so do it once after all points are drawn.
# The trailing ';' keeps the return value of the last call out of the cell output.
plt.xticks([])
plt.yticks([]);

In [48]:
def word_analogy(glove_vectors, word1, word2, word3):
    """
    Args:
        glove_vectors (Gensim KeyedVectors)
        word1 (str): The first word in the analogy.
        word2 (str): The second word in the analogy.
        word3 (str): The third word in the analogy for which to find the analogous word.

    Returns:
        pred_word (str): The word that best completes the analogy.
    """

    # Your code here

    vec1 = glove_vectors.get_vector(word1)
    vec2 = glove_vectors.get_vector(word2)
    vec3 = glove_vectors.get_vector(word3)
    direction = vec2 - vec1
    vec_tgt = vec3 + direction

    similarities = []

    for idx, word in enumerate(glove_vectors.index_to_key):
        if word == word3:
            continue

        word_vec = glove_vectors.get_vector(word)

        norm_vectgt = np.sum(vec_tgt ** 2) ** 0.5
        norm_wordvec = np.sum(word_vec ** 2) ** 0.5

        cos = np.dot(vec_tgt, word_vec) / ((norm_vectgt) * (norm_wordvec))

        similarities.append((word, cos))

    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    pred_word = similarities[0][0]

    print(f'{word1} is to {word2} as {word3} is to? {pred_word}')
    assert pred_word != word3
    return pred_word

In [None]:
# Try a few analogies; each call prints the completed analogy, and pred_word keeps only the last result.
pred_word = word_analogy(glove_vectors, 'prince', 'princess', 'lord')
pred_word = word_analogy(glove_vectors, 'aunt', 'uncle', 'queen')
pred_word = word_analogy(glove_vectors, 'london', 'england', 'paris')
pred_word = word_analogy(glove_vectors, 'cat', 'cats', 'car')