In [1]:
!pip install wikidata

import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm



In [2]:
"""from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
checkpoint_path = '/content/drive/MyDrive/NLP_HOMEWORK_1/word2vec/'"""

"from google.colab import drive\ndrive.mount('/content/drive/', force_remount=True)\ncheckpoint_path = '/content/drive/MyDrive/NLP_HOMEWORK_1/word2vec/'"

In [2]:
dataset_path = "./gold_dataset_clean.csv"
dataset = pd.read_csv(dataset_path, sep=",")
dataset.head()

Unnamed: 0,item,name,description,type,category,subcategory,label
0,Q306,Sebastián Piñera,Chilean entrepreneur and politician (1949–2024),entity,politics,politician,cultural exclusive
1,Q12735,John Amos Comenius,"Czech teacher, educator, philosopher and write...",entity,politics,politician,cultural representative
2,Q1752,Macrinus,Roman emperor from 217 to 218,entity,politics,politician,cultural representative
3,Q1639,Lamine Diack,Senegalese sports manager (1933–2021),entity,politics,politician,cultural representative
4,Q9588,Richard Nixon,President of the United States from 1969 to 1974,entity,politics,politician,cultural representative


In [3]:
dataset.shape

(6238, 7)

In [3]:
import requests
import re
from wikidata.client import Client


wiki_path = "/content/drive/MyDrive/NLP_HOMEWORK_1/data/wikitext.txt"

def retrieve_wikitext(entities_id, file_path):
  """
    Function to return Wikipedia Article of a given Wikidata Entity (in english).

    Arguments:
    entities_id -- list of entities id from Wikidata
  """
  # Wikidata client instantiation
  client = Client()



  # Useful sub-functions
  def clean_wikipedia_extract(text):
    """Sub-function that cleans wikipedia text"""

    # Remove unwanted paragraphs

    text = re.sub(r"^==.*?==\s*", "", text, flags=re.MULTILINE)

    end_markers = ["See also", "References", "External links", "Further reading"]
    for marker in end_markers:
        pattern = rf"==\s*{marker}\s*==.*"
        text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.DOTALL)

    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    return text.strip()

  def get_text(item):
    """
      Sub-function that handles the get request from Wikipedia

      Arguments:
      item -- wikidata.Entity
    """
    sitelinks = item.data.get("sitelinks", {})
    enwiki = sitelinks.get("enwiki")
    if enwiki:
        title = enwiki["title"]

        api_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "prop": "extracts",
            "explaintext": True,
            "titles": title,
            "format": "json",
            "redirects": 1
        }

        res = requests.get(api_url, params=params).json()
        pages = res.get("query", {}).get("pages", {})
        if not pages:
            return ""
        page = next(iter(pages.values()))
        text = page.get("extract", "")
        text = text.lower()
        text = clean_wikipedia_extract(text)
        return text
    else:
      print(f"No English Wikipedia page found for entity . (skipping)")
      return ""


  tot = len(entities_id)
  with open(file_path, "a") as f:
    for entity_id in tqdm(entities_id, total=tot):
      item = client.get(entity_id, load=True)
      text = get_text(item)
      text += "\n"
      f.write(text)

  


Retrieve text from all articles (78m 34s)

In [5]:

entities_id = dataset["item"].to_numpy()

if(not os.path.isfile("./wiki-text.txt")):
    retrieve_wikitext(entities_id=entities_id, file_path="./wiki-text.txt")


    

In [6]:
import collections

class Word2VecDataset(torch.utils.data.IterableDataset):

    def __init__(self, txt_path, vocab_size, unk_token, window_size, pre_word2id=None):
        """
        Args:
          txt_file (str): Path to the raw-text file.
          vocab_size (int): Maximum amount of words that we want to embed.
          unk_token (str): How will unknown words represented (e.g. 'UNK').
          window_size (int): Number of words to consider as context.
          pre_word2id (np.array): Word to ID dictionary of a pretrained model 
        """
        self.window_size = window_size
        self.pre_word2id = pre_word2id
        # [[w_{1,s1}, w_{2,s1}, ..., w_{|s1|,s1}], [w_{1,s2}, w_{2,s2}, ..., w_{|s2|,s2}], ..., [w_{1,sn}, ..., w_{|sn|,sn}]]
        self.data_words = self.read_data(txt_path)
        self.build_vocabulary(vocab_size, unk_token)

        

    def __iter__(self):

        """
          When iterating through this dataset we must include the window size
          So for each sentence in the dataset we iterate through each word

          For each word wi we output input: wi, target: wi-j
          where j is in range (-window_size, window_size)

          Example:
          sentence = ["the", "dog", "is", "lazy"]
          window_size = 2

          first iteration -> {input: "the" (as index), output: "dog"} {input: "the", output: "is"}

        """
        sentences = self.data_words
        for sentence in sentences:
            len_sentence = len(sentence)

            for input_idx in range(len_sentence):
                current_word = sentence[input_idx]

                # must be a word in the vocabulary
                if current_word in self.word2id and self.keep_word(current_word):
                    # index of input word
                    current_word_id = self.word2id[current_word]

                    # left and right window indices
                    min_idx = max(0, input_idx - self.window_size)
                    max_idx = min(len_sentence, input_idx + self.window_size)

                    window_idxs = [x for x in range(min_idx, max_idx) if x != input_idx]
                    for target_idx in window_idxs:
                        # must be a word in the vocabulary
                        if sentence[target_idx] in self.word2id:
                            # index of target word in vocab
                            target_word_id = self.word2id[sentence[target_idx]]
                            output_dict = {'inputs':current_word_id, 'targets':target_word_id}

                            yield output_dict

    def keep_word(self, word):
        '''Implements subsampling to avoid overly frequent words and returns true if we can keep the occurrence as training instance.'''
        z = self.frequency[word] / self.tot_occurrences  # f(w): relative frequency
        t = 1e-5  # standard value used in practice
        p_keep = np.sqrt(t / z)
        p_keep = min(1.0, p_keep)  # cap at 1
        return np.random.rand() < p_keep

    def read_data(self, txt_path):
        """Converts each line in the input file into a list of lists of tokenized words."""
        data = []
        total_words = 0
        # tot_lines = self.count_lines(txt_path)
        with open(txt_path) as f:
            for line in f:
                split = self.tokenize_line(line)
                if split:
                    # split is a list of words which is appended to data
                    data.append(split)
                    total_words += len(split)
        return data

    # "The pen is on the table" -> ["the, "pen", "is", "on", "the", "table"]
    def tokenize_line(self, line, pattern='\W'):
        """Tokenizes a single line."""

        return [word for word in re.split(pattern, line.lower()) if word]

    def build_vocabulary(self, vocab_size, unk_token):
        """Defines the vocabulary to be used. Builds a mapping (word, index) for
        each word in the vocabulary.

        Args:
          vocab_size (int): size of the vocabolary
          unk_token (str): token to associate with unknown words
        """
        counter_list = []
        # context is a list of tokens within a single sentence
        for context in self.data_words:
            counter_list.extend(context)
        counter = collections.Counter(counter_list)
        # just for debugging/example purposes: print(counter['house'], counter['plane'], counter['the'])
        counter_len = len(counter)
        print(f"Number of distinct words: {counter_len}")

        # consider only the (vocab size-1) most common words to build the vocab
        # dictionary will contain pairs 'word, index' where index is a unique ID for the word
        # which will be used as the position in the one-hot encoding for the word


        #Compatibility with pre trained models 
        if(self.pre_word2id != None):
            self.word2id = self.pre_word2id 
            word2index = self.pre_word2id
        else:
            word2index = {key: index for index, (key, _) in enumerate(counter.most_common(vocab_size-1))}
            # UNK doesn't occur in the dictionary
            assert unk_token not in word2index
            # all the other words are mapped to UNK
            word2index[unk_token] = vocab_size-1
            self.word2id = word2index

        # we create a new "counter" dictionary only for our vocab words, containing (word, frequency) pairs
        dict_counts = {x: counter[x] for x in word2index if x is not unk_token}
        self.frequency = dict_counts
        self.tot_occurrences = sum(dict_counts[x] for x in dict_counts)

        print(f'Total occurrences of words in the dataset (excl. UNK tokens): {self.tot_occurrences}')

        if(self.pre_word2id == None):
            less_freq_word = min(dict_counts, key=counter.get)
            print(f'Least frequent word in dictionary ({less_freq_word}) appears {dict_counts[less_freq_word]} times')

        # create the index to word dictionary

        self.id2word = {value: key for key, value in word2index.items()}

        # data is the text converted to indexes, as list of lists
        data = []
        # for each sentence
        for sentence in self.data_words:
            paragraph = []
            # for each word in the sentence
            for w in sentence:
                #retrieve the id of the word
                #if the word is an unknown word, don't add to data list
                #otherwise it may cause inbalance
                id_ = word2index[w] if w in word2index else word2index[unk_token]
                if id_ == word2index[unk_token]:
                    continue
                paragraph.append(id_)
            data.append(paragraph)
        # list of lists of indices, where each sentence is a list of indices, ignoring UNK
        self.data_idx = data
        # now I have a field self.data_words which contains the sentences as
        # a list of lists of tokens and a field self.data_idx which contains
        # the sentences as a list of lists of the corresponding indices having
        # removed unknown tokens



  def tokenize_line(self, line, pattern='\W'):


In [7]:
VOCAB_SIZE = 10000

train_set = Word2VecDataset("./wiki-text.txt", VOCAB_SIZE, "UNK", 5)

Number of distinct words: 268036
Total occurrences of words in the dataset (excl. UNK tokens): 9928675
Least frequent word in dictionary (fix) appears 93 times


In [8]:
trainset_loader = DataLoader(train_set, 32)


In [4]:
import torch.nn as nn

class SkipGram(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, id2word, word_counts, weights=None):
        super(SkipGram, self).__init__()
        
        self.vocabulary_size = vocabulary_size
        self.embedding_dim = embedding_dim

        # matrix W
        # because the first component is just a lookup table, we avoid using: self.embeddings = nn.Linear(self.vocabulary_size, self.embedding_dim)
        
        if(weights != None):
            # load pre-trained embedding
            self.embeddings = nn.Embedding.from_pretrained(weights, freeze=False)
        else:
            self.embeddings = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        # matrix W' and loss function
        self.output_weights = nn.Linear(self.embedding_dim, self.vocabulary_size)
        self.loss_function = nn.CrossEntropyLoss()

    def forward(self, input_idx):
        input_embeddings = self.embeddings(input_idx)
        output_logits = self.output_weights(input_embeddings)
        return output_logits

In [10]:
class Trainer():
    def __init__(self, model, optimizer, device):

        self.device = device

        self.model = model
        self.optimizer = optimizer
        # starts requires_grad for all layers
        self.model.train()  # we are using this model for training (some layers have different behaviours in train and eval mode)
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, output_folder, epochs=1, total_batches=0):

        train_loss = 0.0
        for epoch in range(epochs):
            epoch_loss = 0.0
            num_batches = 0

            # each element (sample) in train_dataset is a batch
            for step, sample in tqdm(enumerate(train_dataset), total=total_batches, desc="Batch", leave=False):

                inputs = sample['inputs'].to(self.device)
                # outputs in the batch
                targets = sample['targets'].to(self.device)

                output_distribution = self.model(inputs)
                loss = self.model.loss_function(output_distribution, targets)

                # calculates the gradient and accumulates
                loss.backward()  # we backpropagate the loss
                # updates the parameters
                # by applying the gradients to update all the parameters (weights and biases) of the model
                # as computed using backpropagation during the call to loss.backward()
                self.optimizer.step()
                # zeroes the gradients
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                num_batches += 1

            avg_epoch_loss = epoch_loss / num_batches

            print(f'Epoch: {epoch} avg loss = {avg_epoch_loss:.4f}')

            train_loss += avg_epoch_loss
            torch.save(self.model.state_dict(),
                       os.path.join(output_folder, f'state_{epoch}.pt'))  # save the model state

        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:



model = SkipGram(vocabulary_size=VOCAB_SIZE, 
                 embedding_dim=300, 
                 id2word=train_set.id2word, 
                 word_counts=train_set.frequency)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

c = 0
for b in trainset_loader:
  c += 1
print("Number of batches:", c)

trainer = Trainer(model, optimizer, device)

Number of batches: 526824


In [12]:
train = False
if(train):
    avg_loss = trainer.train(trainset_loader, "./data/word2vec/50k_vocab_size", epochs=10, total_batches=c)

In [18]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt 


for epoch in [0, 5, 10]:
  ## load model from checkpoint

  model.load_state_dict(torch.load(os.path.join("./data/word2vec/", f'state_{epoch}.pt')))

  # set the model in evaluation mode
  # (disables dropout, does not update parameters and gradient)
  model.eval()

  # retrieve the trained embeddings
  embeddings = model.embeddings.weight

  print(list(train_set.word2id.keys())[:100])

  # pick some words to visualise
  words = ['italy', 'rome', 'france', 'paris']

  # perform PCA to reduce our 300d embeddings to 2d points that can be plotted
  pca = PCA(n_components=2)
  pca_result = pca.fit_transform(embeddings.detach().cpu()) # .t() transpose the embeddings

  indexes = [train_set.word2id[x] for x in words]
  
  points = [pca_result[i] for i in indexes]
  for i,(x,y) in enumerate(points):
      plt.plot(x, y, 'ro')
      plt.text(x, y, words[i], fontsize=12) # add a point label, shifted wrt to the point
  plt.title('epoch {}'.format(epoch))
  plt.show()

RuntimeError: Error(s) in loading state_dict for SkipGram:
	size mismatch for embeddings.weight: copying a param with shape torch.Size([4000, 300]) from checkpoint, the shape in current model is torch.Size([50000, 300]).
	size mismatch for output_weights.weight: copying a param with shape torch.Size([4000, 300]) from checkpoint, the shape in current model is torch.Size([50000, 300]).
	size mismatch for output_weights.bias: copying a param with shape torch.Size([4000]) from checkpoint, the shape in current model is torch.Size([50000]).

In [17]:
def analogy_test(embeddings, positive, negative):
    """
    Perform an analogy test: positive[0] - negative[0] + positive[1] ≈ closest word
    """
    cosine_similarity = torch.nn.CosineSimilarity(dim=0)
    pos_indexes = [train_set.word2id[word] for word in positive]
    neg_indexes = [train_set.word2id[word] for word in negative]
    # Get embeddings for the positive and negative words

    positive_emb = 0
    negative_emb = 0
    for pos in pos_indexes:
        positive_emb += embeddings[pos]
    
    for neg in neg_indexes:
        negative_emb += embeddings[pos]
    
    # Calculate the resulting vector
    result_vector = positive_emb - negative_emb
    
    # Find the most similar word in the vocabulary
    max_similarity = -1
    best_match_idx = -1
    for idx in range(50000):
        if idx in pos_indexes:  # Skip the words in the analogy itself
            continue
        if idx in neg_indexes:
            continue

        sim = cosine_similarity(result_vector, embeddings[idx])
        if sim > max_similarity:
            max_similarity = sim
            best_match_idx = idx
    
    return train_set.id2word[best_match_idx], max_similarity


word_a = 'pizza'
word_b = 'italy'
word_c = 'germany'

word, sim = analogy_test(embeddings=embeddings, positive=[word_a, word_c], negative=[word_c])

word, sim

('puccini', tensor(0.2587, device='cuda:0', grad_fn=<SumBackward1>))

# Pre - Trained Word2Vec

In [5]:
import gensim.downloader
from gensim.models import KeyedVectors


create_again = False 
if(create_again):
    word2vec_google = gensim.downloader.load('word2vec-google-news-300')
    pretrained_embedding = word2vec_google.vectors

    pretrained_embedding = torch.tensor(pretrained_embedding)

    torch.save(pretrained_embedding, "word2vec_embedding.pt")

Load Google's Word2Vec stuff


In [6]:
import pickle


with open("./data/word2vec/w2v_index_to_key.pkl", "rb") as f:
    w2v_google_index2key= pickle.load(f)

with open("./data/word2vec/w2v_key_to_index.pkl", "rb") as f:
    w2v_google_key2index = pickle.load(f)

w2v_embedding = torch.load("./data/word2vec/word2vec_embedding.pt")


display(w2v_google_index2key)
display(w2v_google_key2index)
display(w2v_embedding)

['</s>',
 'in',
 'for',
 'that',
 'is',
 'on',
 '##',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have',
 'he',
 'will',
 'has',
 '####',
 'his',
 'an',
 'this',
 'or',
 'their',
 'who',
 'they',
 'but',
 '$',
 'had',
 'year',
 'were',
 'we',
 'more',
 '###',
 'up',
 'been',
 'you',
 'its',
 'one',
 'about',
 'would',
 'which',
 'out',
 'can',
 'It',
 'all',
 'also',
 'two',
 'after',
 'first',
 'He',
 'do',
 'time',
 'than',
 'when',
 'We',
 'over',
 'last',
 'new',
 'other',
 'her',
 'people',
 'into',
 'In',
 'our',
 'there',
 'A',
 'she',
 'could',
 'just',
 'years',
 'some',
 'U.S.',
 'three',
 'million',
 'them',
 'what',
 'But',
 'so',
 'no',
 'like',
 'if',
 'only',
 'percent',
 'get',
 'did',
 'him',
 'game',
 'back',
 'because',
 'now',
 '#.#',
 'before',
 'company',
 'any',
 'team',
 'against',
 'off',
 'This',
 'most',
 'made',
 'through',
 'make',
 'second',
 'state',
 'well',
 'day',
 'season',
 'says',
 'w

{'</s>': 0,
 'in': 1,
 'for': 2,
 'that': 3,
 'is': 4,
 'on': 5,
 '##': 6,
 'The': 7,
 'with': 8,
 'said': 9,
 'was': 10,
 'the': 11,
 'at': 12,
 'not': 13,
 'as': 14,
 'it': 15,
 'be': 16,
 'from': 17,
 'by': 18,
 'are': 19,
 'I': 20,
 'have': 21,
 'he': 22,
 'will': 23,
 'has': 24,
 '####': 25,
 'his': 26,
 'an': 27,
 'this': 28,
 'or': 29,
 'their': 30,
 'who': 31,
 'they': 32,
 'but': 33,
 '$': 34,
 'had': 35,
 'year': 36,
 'were': 37,
 'we': 38,
 'more': 39,
 '###': 40,
 'up': 41,
 'been': 42,
 'you': 43,
 'its': 44,
 'one': 45,
 'about': 46,
 'would': 47,
 'which': 48,
 'out': 49,
 'can': 50,
 'It': 51,
 'all': 52,
 'also': 53,
 'two': 54,
 'after': 55,
 'first': 56,
 'He': 57,
 'do': 58,
 'time': 59,
 'than': 60,
 'when': 61,
 'We': 62,
 'over': 63,
 'last': 64,
 'new': 65,
 'other': 66,
 'her': 67,
 'people': 68,
 'into': 69,
 'In': 70,
 'our': 71,
 'there': 72,
 'A': 73,
 'she': 74,
 'could': 75,
 'just': 76,
 'years': 77,
 'some': 78,
 'U.S.': 79,
 'three': 80,
 'million': 81

tensor([[ 1.1292e-03, -8.9645e-04,  3.1853e-04,  ..., -1.5640e-03,
         -1.2302e-04, -8.6308e-05],
        [ 7.0312e-02,  8.6914e-02,  8.7891e-02,  ..., -4.7607e-02,
          1.4465e-02, -6.2500e-02],
        [-1.1780e-02, -4.7363e-02,  4.4678e-02,  ...,  7.1289e-02,
         -3.4912e-02,  2.4170e-02],
        ...,
        [-1.9653e-02, -9.0820e-02, -1.9409e-02,  ..., -1.6357e-02,
         -1.3428e-02,  4.6631e-02],
        [ 3.2715e-02, -3.2227e-02,  3.6133e-02,  ..., -8.8501e-03,
          2.6978e-02,  1.9043e-02],
        [ 4.5166e-02, -4.5166e-02, -3.9368e-03,  ...,  7.9590e-02,
          7.2266e-02,  1.3000e-02]])

For every item in the dataset, I will take the average of the words in the Wikipedia Article 

In [7]:
from datasets import load_dataset
dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')

dataset['train'][332] 

{'item': 'http://www.wikidata.org/entity/Q252187',
 'name': 'áo dài',
 'description': 'Vietnamese national costume, tunic',
 'type': 'concept',
 'category': 'fashion',
 'subcategory': 'clothing',
 'label': 'cultural representative'}

In [7]:
import requests
import re
from wikidata.client import Client



  


In [8]:
import pickle

class WikiScraper:

    def __init__(self, dataset, wikidata_client, output_path, load=False, load_file_path=None):

        """
        Args: 
            dataset - Dataset that contains the informations we want 
            wikidata_client - wikidata.Client(), used to scrape the content in the WikiData pages
            output_path - Path to the file that will be saved by the scraper e.g. /home/wikidata/ -> will generate /home/wikidata/wiki-text.txt and /home/wikidata/wikidict.pkl
            load - skips the scraping and loads data from the file in load_file_path 
        """
        self.wikidat_client = wikidata_client
        self.OUTPUT_PATH = output_path
        self.dataset = dataset

        
        
        self.entities_id = self.retrieve_entities()
        self.entity2row = self.entity_to_row()

        if(load == True):
            os.makedirs(os.path.dirname(self.OUTPUT_PATH), exist_ok=True)
            if(not load_file_path):
                raise Exception("Load Path must be a valid path if load=True")
            with open(load_file_path, "rb") as f:

                try:
                    self.wikidict = pickle.load(f)
                except: 
                    raise Exception("File cannot be loaded")
        else: 
            self.wikidict = self.retrieve_wikidict()


    def extract_id(self, link):
        """
        Retrieves the entity id from the "item" column in the dataset, which is a Wikidata link.

            Args: 
            link - the link column
        """
        return link.split("/")[-1]

    def retrieve_entities(self):

        """Retrieves all the Wikidata IDs from the dataset."""
        res = []
        for row in self.dataset:
            # item is the column relative to the wikidata link
            item = row["item"]
            res.append(self.extract_id(item))
        return res

    def entity_to_row(self):
        """Creates the dictionary entity_id: row_number"""
        dic={}
        for index, row in enumerate(self.dataset):
            item = row["item"]
            entity = self.extract_id(item)

            dic[entity] = index 

        return dic

    def retrieve_wikidict(self):
        """
        Function that scrapes WikiData pages for the items in the dataset <br>
        It does two things: <br>
            1. returns a dictionary with the format
            entity_id: wikipedia_article. <br>
            2. it creates a file to OUTPUT_PATH containing all the text scraped from Wikipedia related articles

            Args:
            entities_id: list of entities id from Wikidata
        """
        
        entities_id = self.entities_id
        # Wikidata client instantiation
        client = Client()

        # Useful sub-functions
        def clean_wikipedia_extract(text):
            """Sub-function that cleans wikipedia text"""

            # Remove unwanted paragraphs

            text = re.sub(r"^==.*?==\s*", "", text, flags=re.MULTILINE)

            end_markers = ["See also", "References", "External links", "Further reading"]
            for marker in end_markers:
                pattern = rf"==\s*{marker}\s*==.*"
                text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.DOTALL)

            text = text.replace("\n", " ")
            text = text.replace("\t", " ")
            return text.strip()

        def get_text(item):
            """
            Sub-function that handles the get request from Wikipedia

            Arguments:
            item -- wikidata.Entity
            """
            sitelinks = item.data.get("sitelinks", {})
            enwiki = sitelinks.get("enwiki")
            if enwiki:
                title = enwiki["title"]

                api_url = "https://en.wikipedia.org/w/api.php"
                params = {
                    "action": "query",
                    "prop": "extracts",
                    "explaintext": True,
                    "titles": title,
                    "format": "json",
                    "redirects": 1
                }

                res = requests.get(api_url, params=params).json()
                pages = res.get("query", {}).get("pages", {})
                if not pages:
                    return ""
                page = next(iter(pages.values()))
                text = page.get("extract", "")
                text = text.lower()
                text = clean_wikipedia_extract(text)
                return text
            else:
                print(f"No English Wikipedia page found for entity . (skipping)")
            return ""


        tot = len(entities_id)
        dic = {}

        # Save related Wikidata text to output_path 
        # and create the dictionary entity: wikipedia article
        with open(self.OUTPUT_PATH+"wiki-text.txt", "a") as f:

            for entity_id in tqdm(entities_id, total=tot):
                item = client.get(entity_id, load = True)
                text = get_text(item)
                dic[entity_id] = text
                text+="\n"
                f.write(text)

        with open(self.OUTPUT_PATH+"wikidict.pkl", "wb") as f:
            pickle.dump(dic, f)

        return dic

    

In [52]:
for i in dataset["train"].select(range(2)):
    print(i)

{'item': 'http://www.wikidata.org/entity/Q32786', 'name': '916', 'description': '2012 film by M. Mohanan', 'type': 'entity', 'category': 'films', 'subcategory': 'film', 'label': 'cultural exclusive'}
{'item': 'http://www.wikidata.org/entity/Q371', 'name': '!!!', 'description': 'American dance-punk band from California', 'type': 'entity', 'category': 'music', 'subcategory': 'musical group', 'label': 'cultural representative'}


In [12]:
wikiscraper = WikiScraper(dataset["train"], Client(), "./WikiScraper/", load=True, load_file_path="./WikiScraper/wikidict.pkl")


Train a Classifier for each item in the dataset. We take the Wikipedia article for the item and embed each word using Google's Word2Vec. For each article we take the Average and this will be passed to a fully connected layer that will output one of the three categories.

In [None]:
dataset["train"][1]

{'item': 'http://www.wikidata.org/entity/Q371',
 'name': '!!!',
 'description': 'American dance-punk band from California',
 'type': 'entity',
 'category': 'music',
 'subcategory': 'musical group',
 'label': 'cultural representative'}

In [31]:
from torch.nn.utils.rnn import pad_sequence

class WikiDataset(torch.utils.data.IterableDataset):

    def __init__(self, dataset, wikidict, entity2row, w2v_key2index):

        self.dataset = dataset 
        self.wikidict = wikidict
        self.entity2row = entity2row
        self.w2v_key2index = w2v_key2index 
    
    def __iter__(self):

        for row in self.dataset:
            item = row["item"]
            entity_id = item.split("/")[-1]
            
            article_words = self.tokenize_words(self.wikidict[entity_id])


            encoded_words = self.encode_words(article_words)

            encoded_words = torch.tensor(encoded_words)

            label = row["label"]

            if(label == "cultural representative"):
                label = 0 
            elif(label == "cultural agnostic"):
                label = 1
            elif(label == "cultural exclusive"):
                label = 2 
            else: 
                continue
            
            one_hot = torch.tensor(label)

            yield {"input": encoded_words, "target": one_hot}

    
    
    # used for the DataLoader
    # the sequences will all have the same length (length of the biggest sequence)
    # the shorter ones will be padded with values 0 
    def collate_fn(self, batch):
        
        token_lists = [item["input"] for item in batch]
        labels = [item["target"] for item in batch]


        padded_tokens = pad_sequence(token_lists, batch_first=True, padding_value=0)  # or your PAD token
        labels = torch.stack(labels)

        return {"input": padded_tokens, "target": labels }


    def encode_words(self, words):

        res = []
        for word in words: 
            if(word and word in self.w2v_key2index):

                res.append(self.w2v_key2index[word])
        
        return res 

    def tokenize_words(self, text, pattern="\W+"):
        return [word for word in re.split(pattern, text.lower()) if word]

  def tokenize_words(self, text, pattern="\W+"):


In [32]:
wikidataset = WikiDataset(dataset["train"], wikiscraper.wikidict, wikiscraper.entity2row, w2v_google_key2index)

wikidataloader = DataLoader(wikidataset, batch_size = 16, collate_fn=wikidataset.collate_fn)

total_batches = 0

for i in wikidataloader:
    total_batches+=1

total_batches


391

Model definition


In [43]:
class CulturalClassifier(nn.Module):

    def __init__(self, weights, embedding_size):

        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.output = nn.Linear(embedding_size, 3)

    def forward(self, input):

        try:
            indices = input.long()
            doc_mean = torch.mean(self.embedding(indices), dim=1)
        except:
            print(input)
            raise Exception("invalid input")
        output = self.output(doc_mean)

        return output

In [65]:
class Trainer():

    def __init__(self, model, dataloader, optimizer, loss, device="cpu"):

        self.model = model 
        self.dataloader = dataloader 
        self.optimizer = optimizer 
        self.loss = loss
        self.device = device
        self.total_batches = self._calculate_batches()


    def _calculate_batches(self):
        
        count = 0
        for i in self.dataloader:
            count+=1
        return count

    def train(self, epochs, save_checkpoints = False, output_folder = None, checkpoint_interval = 1):
        
        
        if(save_checkpoints and output_folder == None ):
           raise Exception("Path for output checkpoints needed")

        train_loss = 0.0
        for epoch in range(epochs):
            
            epoch_loss = 0.0
            num_batches = 0
            for batch in tqdm(self.dataloader, total = total_batches):
                
                input, target = batch["input"], batch["target"]
                input = input.to(self.device)
                target = target.to(self.device)

                output = self.model(input)


                loss = self.loss(output, target)

                self.optimizer.zero_grad()

                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()
                num_batches += 1
            
            avg_loss = epoch_loss / num_batches
            print(f'Epoch: {epoch} avg loss = {avg_loss:.4f}')

            if(save_checkpoints):
                if(epoch % checkpoint_interval==0):
                    self.model.state_dict()
                    torch.save(self.model.state_dict(), os.path.join(output_folder, f'state_{epoch}.pt'))
            train_loss += avg_loss 

        avg_loss = train_loss / epochs 


        
        return avg_loss 

    

In [None]:
torch.cuda.empty_cache()
import gc
#del cultural_classifier

gc.collect()

1189

In [67]:
cultural_classifier = CulturalClassifier(w2v_embedding, 300)

device = "cuda" if torch.cuda.is_available() else "cpu"
cultural_classifier.to(device)

optimizer = torch.optim.SGD(cultural_classifier.parameters(), lr=0.1)
loss = torch.nn.CrossEntropyLoss()

trainer = Trainer(
    model = cultural_classifier, 
    dataloader = wikidataloader,
    optimizer = optimizer,
    loss = loss,
    device =  device)

In [69]:
avg_loss = trainer.train(30, output_folder="./data/cultural_classifier/")

100%|██████████| 391/391 [00:04<00:00, 90.17it/s]


Epoch: 0 avg loss = 1.0651


100%|██████████| 391/391 [00:04<00:00, 91.54it/s]


Epoch: 1 avg loss = 1.0454


100%|██████████| 391/391 [00:04<00:00, 91.32it/s]


Epoch: 2 avg loss = 1.0357


100%|██████████| 391/391 [00:04<00:00, 91.50it/s]


Epoch: 3 avg loss = 1.0293


100%|██████████| 391/391 [00:04<00:00, 92.36it/s] 


Epoch: 4 avg loss = 1.0244


100%|██████████| 391/391 [00:04<00:00, 90.53it/s]


Epoch: 5 avg loss = 1.0202


100%|██████████| 391/391 [00:04<00:00, 91.42it/s]


Epoch: 6 avg loss = 1.0163


100%|██████████| 391/391 [00:04<00:00, 90.97it/s]


Epoch: 7 avg loss = 1.0128


100%|██████████| 391/391 [00:04<00:00, 92.40it/s]


Epoch: 8 avg loss = 1.0094


100%|██████████| 391/391 [00:04<00:00, 90.25it/s] 


Epoch: 9 avg loss = 1.0061


100%|██████████| 391/391 [00:04<00:00, 91.96it/s]


Epoch: 10 avg loss = 1.0030


100%|██████████| 391/391 [00:04<00:00, 90.80it/s]


Epoch: 11 avg loss = 1.0000


100%|██████████| 391/391 [00:04<00:00, 90.36it/s]


Epoch: 12 avg loss = 0.9972


100%|██████████| 391/391 [00:04<00:00, 89.71it/s]


Epoch: 13 avg loss = 0.9944


100%|██████████| 391/391 [00:04<00:00, 90.65it/s]


Epoch: 14 avg loss = 0.9918


100%|██████████| 391/391 [00:04<00:00, 91.50it/s]


Epoch: 15 avg loss = 0.9892


100%|██████████| 391/391 [00:04<00:00, 89.93it/s]


Epoch: 16 avg loss = 0.9868


100%|██████████| 391/391 [00:04<00:00, 91.84it/s]


Epoch: 17 avg loss = 0.9844


100%|██████████| 391/391 [00:04<00:00, 91.22it/s]


Epoch: 18 avg loss = 0.9821


100%|██████████| 391/391 [00:04<00:00, 92.42it/s]


Epoch: 19 avg loss = 0.9799


100%|██████████| 391/391 [00:04<00:00, 92.05it/s] 


Epoch: 20 avg loss = 0.9777


100%|██████████| 391/391 [00:04<00:00, 91.35it/s]


Epoch: 21 avg loss = 0.9756


100%|██████████| 391/391 [00:04<00:00, 91.54it/s]


Epoch: 22 avg loss = 0.9736


100%|██████████| 391/391 [00:04<00:00, 90.78it/s]


Epoch: 23 avg loss = 0.9716


100%|██████████| 391/391 [00:04<00:00, 91.81it/s] 


Epoch: 24 avg loss = 0.9698


100%|██████████| 391/391 [00:04<00:00, 92.72it/s] 


Epoch: 25 avg loss = 0.9679


100%|██████████| 391/391 [00:04<00:00, 92.72it/s]


Epoch: 26 avg loss = 0.9661


100%|██████████| 391/391 [00:04<00:00, 91.95it/s] 


Epoch: 27 avg loss = 0.9644


100%|██████████| 391/391 [00:04<00:00, 92.32it/s] 


Epoch: 28 avg loss = 0.9627


100%|██████████| 391/391 [00:04<00:00, 91.35it/s]

Epoch: 29 avg loss = 0.9611





# Tests


In [16]:
from gensim.models import Word2Vec

# Create a new Word2Vec model with the same vocab
trainable_model = Word2Vec(vector_size=300, min_count=1)

# Build vocab from pretrained model
trainable_model.build_vocab([list(word2vec_google.key_to_index.keys())])

# Copy vectors
trainable_model.wv.vectors = word2vec_google.vectors.copy()

trainable_model.wv




<gensim.models.keyedvectors.KeyedVectors at 0x7139b61858b0>

array([[ 1.1291504e-03, -8.9645386e-04,  3.1852722e-04, ...,
        -1.5640259e-03, -1.2302399e-04, -8.6307526e-05],
       [ 7.0312500e-02,  8.6914062e-02,  8.7890625e-02, ...,
        -4.7607422e-02,  1.4465332e-02, -6.2500000e-02],
       [-1.1779785e-02, -4.7363281e-02,  4.4677734e-02, ...,
         7.1289062e-02, -3.4912109e-02,  2.4169922e-02],
       ...,
       [-1.9653320e-02, -9.0820312e-02, -1.9409180e-02, ...,
        -1.6357422e-02, -1.3427734e-02,  4.6630859e-02],
       [ 3.2714844e-02, -3.2226562e-02,  3.6132812e-02, ...,
        -8.8500977e-03,  2.6977539e-02,  1.9042969e-02],
       [ 4.5166016e-02, -4.5166016e-02, -3.9367676e-03, ...,
         7.9589844e-02,  7.2265625e-02,  1.3000488e-02]], dtype=float32)

In [23]:
VOCAB_SIZE_PRETRAINED, EMBEDDING_DIM_PRETRAINED = word2vec_google.vectors.shape
key_to_index = word2vec_google.key_to_index

train_set_pretrained = Word2VecDataset("./wiki-text.txt", VOCAB_SIZE_PRETRAINED, "UNK", 5, pre_word2id=key_to_index)
trainable_model.build_vocab(train_set_pretrained, update=True)  # Add new words
trainable_model.train(train_set_pretrained, total_examples=trainable_model.corpus_count, epochs=20)


Number of distinct words: 268036
Total occurrences of words in the dataset (excl. UNK tokens): 9180840


(36817483, 787968484)

In [24]:
trainable_model.save("fine_tuned_word2vec.model")



In [17]:
my_model = Word2Vec(sentences=trainset_loader, vector_size = 300, window = 5, min_count = 1, sg=1)

In [None]:
trainable_model.save("our-wiki.model")

: 