# Natural Language Processing en Pytorch

In [10]:
import torch

## Tutorial 2: Implementación de Skip-Gram Word2Vec

El objetivo es calcular la probabilidad de $P(\mathbf{O}|\mathbf{C})$ (probabilidad de las palabras de alrededor dada la palabra centra).

Podemos aproximar esta probabilidad con un modelo de Softmax:

$$
P(\mathbf{O}=o|\mathbf{C}=c) = \frac{\exp(\Theta x_c)}{\sum_{w\in Vocab}\exp(\theta_w^T x_c)}
$$

donde 

$$
\Theta = 
\begin{bmatrix}
-\;\theta_1^T \;- \\
-\; \theta_2^T \;- \\
\vdots \\
-\; \theta_{|V|}^T \; -
\end{bmatrix}
$$

y $x_c$ es un *one-hot* vector en la palabra $c$ del vocabulario $V$.
Si se define

$$
\Theta = U V
$$

con 

$$
\begin{align}
U = 
\begin{bmatrix}
- \; u_1^T \; - \\
- \; u_2^T \; - \\
\vdots \\
- \; u_{|V|}^T \; -
\end{bmatrix} & &
V = 
\begin{bmatrix}
| & | & & | \\
v_1 & v_2 & \cdots & v_{|V|} \\
| & | & & | \\
\end{bmatrix} & & 
u_i , v_i \in \mathbb{R}^{n} \; i= 1, \ldots, |V|
\end{align}
$$

puede verse que la expresión de la probabilidad condicional anterior queda

$$
\begin{align}
P(\mathbf{O}=o|\mathbf{C}=c) &= \frac{\exp(\Theta c)}{\sum_{w\in V}\exp(\theta_w^T c)} \\[.5em]
&= \frac{\exp(u_o^T v_c)}{\sum_{w\in V}\exp(u_w^T v_c)}
\end{align}
$$

donde $v_c$ es la columna de $V$ correspondiente a la plabra $c$ del vocabulario y $u_o^T$ es la fila $o$ de $U$ correspondiente a la palabra $o$ del vocabulario.

El siguiente paso es encontrar los parámetros de la distribución anterior, lo cual se hará minimizando la función de costo *Negative LogLikelihood* por gradiente estocástico. Para eso, se puede utilizar las relaciones

$$
\begin{align}
\frac{\partial J}{\partial U} = \frac{\partial J}{\partial \Theta} V^T & \hspace{3em} &
\frac{\partial J}{\partial V} = U^T \frac{\partial J}{\partial \Theta} 
\end{align}
$$

con $J(U,V) = NLL(P(o|c))$, o hacer el cálculo a mano a partir de la expresión completa:

$$
\begin{align}
J(U,V) &= - \log \left( \prod_{i=1}^N \prod_{j=1}^{|V|} P(o_i = j | x_i)^{\mathbb{1}_{\{y_i = j\} }} \right) \\
&= 
\end{align}
$$


In [36]:
class Stanford(torch.utils.data.Dataset):
    
    def __init__(self, path, samples='Train', context_size=2):
        
        super(Stanford, self).__init__()
        
        self.context_size = context_size
        
        sentences = []
        with open(path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower() for w in splitted]]
        
        split = [[] for i in range(3)]
        with open(path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]
        
        if samples == 'Train':
            self.samples = self.getSamples(split[0], sentences)
        elif samples == 'Validation':
            self.samples = self.getSamples(split[1], sentences)
        elif samples == 'Test':
            self.samples = self.getSamples(split[2], sentences)
        else:
            print('Error: especificar si las muestras son de train, validation o test')
            self.samples = None
        
        # Get the tokens:
        tokens = dict()
        tokenfreq = dict()
        revtokens = []
        idx = 0
        for sentence in sentences:
            for w in sentence:
                if not w in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1
        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1

        self.word_to_index = tokens
        self.revtokens = revtokens
        self.index_to_word = {idx: word for word, idx in zip(tokens.keys(),tokens.values())}
        self.vocabulary = list(tokens.keys())
        self.tokens_freqs = tokenfreq
        
        
    def getSamples(self, corpus, sentences):
        samples = []
        no_sentence = 
        for sentence_idx in corpus:
            sentence = sentences[sentence_idx]
            for i, word in enumerate(sentence):
                first_context_word_index = max(0,i-self.context_size)
                last_context_word_index = min(i+self.context_size+1, len(sentence))
                
                context = [no_sentence for j in range(i-self.context_size,first_context_word_index)] + \
                          sentence[first_context_word_index:i] + \
                          sentence[i+1:last_context_word_index] + \
                          [no_sentence for j in range(last_context_word_index,i+self.context_size+1)]
                
                samples.append((word, context))
    
train_dataset = Stanford(path='../04-Prueba-standarization/StanfordDataset/datasets/stanfordSentimentTreebank',
                         samples='Train')

train_dataset.samples[:10]

NameError: name 'no_sentence' is not defined

In [26]:
class MyStanfordSentiment(object):
    
    def __init__(self, path=None, tablesize = 1000000):
        
        if not path:
            path = "../04-Prueba-standarization/StanfordDataset/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize
        
        # Get the sentences:
        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower() for w in splitted]]
        self._sentences = sentences
        self._sentlengths = [len(s) for s in sentences]
        self._numSentences = len(sentences)
        
        
        # Get the tokens:
        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0
        for sentence in sentences:
            for w in sentence:
                wordcount += 1
                if not w in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1
        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        
        # Split Dataset:
        split = [[] for i in range(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        
        # Reject Probs:
        threshold = 1e-5 * self._wordcount
        nTokens = len(tokens)
        rejectProb = torch.zeros((nTokens,))
        for i in range(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(torch.tensor(0.), 1 - torch.sqrt(torch.tensor(threshold / freq)))

        self._rejectProb = rejectProb
        
dataset_obj = MyStanfordSentiment()
dataset_obj._sentences[:10]

[['the',
  'rock',
  'is',
  'destined',
  'to',
  'be',
  'the',
  '21st',
  'century',
  "'s",
  'new',
  '``',
  'conan',
  "''",
  'and',
  'that',
  'he',
  "'s",
  'going',
  'to',
  'make',
  'a',
  'splash',
  'even',
  'greater',
  'than',
  'arnold',
  'schwarzenegger',
  ',',
  'jean-claud',
  'van',
  'damme',
  'or',
  'steven',
  'segal',
  '.'],
 ['the',
  'gorgeously',
  'elaborate',
  'continuation',
  'of',
  '``',
  'the',
  'lord',
  'of',
  'the',
  'rings',
  "''",
  'trilogy',
  'is',
  'so',
  'huge',
  'that',
  'a',
  'column',
  'of',
  'words',
  'can',
  'not',
  'adequately',
  'describe',
  'co-writer\\/director',
  'peter',
  'jackson',
  "'s",
  'expanded',
  'vision',
  'of',
  'j.r.r.',
  'tolkien',
  "'s",
  'middle-earth',
  '.'],
 ['effective', 'but', 'too-tepid', 'biopic'],
 ['if',
  'you',
  'sometimes',
  'like',
  'to',
  'go',
  'to',
  'the',
  'movies',
  'to',
  'have',
  'fun',
  ',',
  'wasabi',
  'is',
  'a',
  'good',
  'place',
  'to',

In [None]:
class StanfordSentiment:
    def __init__(self, path=None, tablesize = 1000000):
        if not path:
            path = "../04-Prueba-standarization/StanfordDataset/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize

    def tokens(self):
        if hasattr(self, "_tokens") and self._tokens:
            return self._tokens

        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0

        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                if not w in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1

        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        return self._tokens

    def sentences(self):
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower() for w in splitted]]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def allSentences(self):
        if hasattr(self, "_allsentences") and self._allsentences:
            return self._allsentences

        sentences = self.sentences()
        rejectProb = self.rejectProb()
        tokens = self.tokens()
        allsentences = [[w for w in s
            if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
            for s in sentences * 30]

        allsentences = [s for s in allsentences if len(s) > 1]

        self._allsentences = allsentences

        return self._allsentences

    def getRandomContext(self, C=5):
        allsent = self.allSentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID+1 < len(sent):
            context += sent[wordID+1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def sent_labels(self):
        if hasattr(self, "_sent_labels") and self._sent_labels:
            return self._sent_labels

        dictionary = dict()
        phrases = 0
        with open(self.path + "/dictionary.txt", "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                dictionary[splitted[0].lower()] = int(splitted[1])
                phrases += 1

        labels = [0.0] * phrases
        with open(self.path + "/sentiment_labels.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                labels[int(splitted[0])] = float(splitted[1])

        sent_labels = [0.0] * self.numSentences()
        sentences = self.sentences()
        for i in range(self.numSentences()):
            sentence = sentences[i]
            full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
            sent_labels[i] = labels[dictionary[full_sent]]

        self._sent_labels = sent_labels
        return self._sent_labels

    def dataset_split(self):
        if hasattr(self, "_split") and self._split:
            return self._split

        split = [[] for i in range(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        return self._split

    def getRandomTrainSentence(self):
        split = self.dataset_split()
        sentId = split[0][random.randint(0, len(split[0]) - 1)]
        return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])

    def categorify(self, label):
        if label <= 0.2:
            return 0
        elif label <= 0.4:
            return 1
        elif label <= 0.6:
            return 2
        elif label <= 0.8:
            return 3
        else:
            return 4

    def getDevSentences(self):
        return self.getSplitSentences(2)

    def getTestSentences(self):
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        nTokens = len(self.tokens())
        samplingFreq = np.zeros((nTokens,))
        self.allSentences()
        i = 0
        for w in range(nTokens):
            w = self._revtokens[i]
            if w in self._tokenfreq:
                freq = 1.0 * self._tokenfreq[w]
                # Reweigh
                freq = freq ** 0.75
            else:
                freq = 0.0
            samplingFreq[i] = freq
            i += 1

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        self._sampleTable = [0] * self.tablesize

        j = 0
        for i in range(self.tablesize):
            while i > samplingFreq[j]:
                j += 1
            self._sampleTable[i] = j

        return self._sampleTable

    def rejectProb(self):
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        threshold = 1e-5 * self._wordcount

        nTokens = len(self.tokens())
        rejectProb = np.zeros((nTokens,))
        for i in range(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        return self.sampleTable()[random.randint(0, self.tablesize - 1)]