# Lexical Semantics

**Create a vector  with 10 dimensions , all of value 0**

In [1]:
import numpy as np
vector = np.zeros(10)
print(vector)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


**Creating a dictionary using defaultdict** 

This avoids having python throw a KeyError when you try to get an item with a key that is not currently in the dictionary. 

In [None]:
from collections import defaultdict

d = defaultdict(int)
print(d.items())

d["the"] += 1
print(d.items())

dict_items([])
defaultdict(<class 'int'>, {'the': 1})


**Create a dictionary mapping each token in a corpus to a distinct integer**

In [3]:
import collections
texts = ["A group of MPs and peers has called for a tightening of regulations controlling betting on sport.", "This is annoying."]

# Create an empty dictionary using collections.defaultdic method
# Set the default value that will be assigned to each token to the current size of the vocabulary
token2int = collections.defaultdict(lambda: len(token2int)) 

# Set the value of <eos> to 0
token2int['<eos>'] = 0

# Add each new word in "texts" to the dictionary
# and map it to the integer corresponding to its first position in the text (= the size of
# the vocabulary present in the dictionary at that time)
for text in texts:
    [token2int[token] for token in text.split()]
    
token2int

defaultdict(<function __main__.<lambda>()>,
            {'<eos>': 0,
             'A': 1,
             'group': 2,
             'of': 3,
             'MPs': 4,
             'and': 5,
             'peers': 6,
             'has': 7,
             'called': 8,
             'for': 9,
             'a': 10,
             'tightening': 11,
             'regulations': 12,
             'controlling': 13,
             'betting': 14,
             'on': 15,
             'sport.': 16,
             'This': 17,
             'is': 18,
             'annoying.': 19})

**Applying SVD decomposition to a matrix**

[svd method](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html)

In [4]:
import numpy as np
# matrix must be a matrix
# full_matrices = False ensures that reduced SVD is used (not full SVD)
U, s, V = np.linalg.svd(matrix, full_matrices = False)

NameError: name 'matrix' is not defined

**Creating a dataframe whose column and row headers are the same and the cell values are 0**

In [5]:
import pandas as pd
vocab = ["a","b"]
print(vocab)
vocab_len = len(vocab)
df = pd.DataFrame(data=np.zeros((vocab_len, vocab_len)), dtype=np.int16,index=vocab,columns=vocab)
df.head()

['a', 'b']


Unnamed: 0,a,b
a,0,0
b,0,0


**Create a dictionary mapping tokens to integer** 

In [6]:
tokens = ["a","b"]

dict(zip(tokens,range(len(tokens))))

{'a': 0, 'b': 1}

**Creating a document/token matrix using sklean Countvectorizer**

In [7]:
docs = ["Shakespeare wrote plays","Shakespeare wrote poems",
        "Hugo wrote novels","Verne wrote novels"
        "Rimbaud wrote poems",
        "John read science", 
        "Peter read books"]

from sklearn.feature_extraction.text import CountVectorizer

# Create a frequency vectorizer object
# Help; using  the option "stop_words = 'english'" ensures that 
# stop_words are removed
count_model = CountVectorizer(ngram_range=(1,1), stop_words = 'english') 

# Convert documents to document/token matrix by applying the vectoriser
# the corpus (docs)
X = count_model.fit_transform(docs)
print(X)
# Print out the document / token matrix
# use the todense() attribute to create the matrix view
print(X.todense()) 

  (0, 10)	1
  (0, 12)	1
  (0, 6)	1
  (1, 10)	1
  (1, 12)	1
  (1, 7)	1
  (2, 12)	1
  (2, 1)	1
  (2, 3)	1
  (3, 12)	2
  (3, 7)	1
  (3, 11)	1
  (3, 4)	1
  (4, 2)	1
  (4, 8)	1
  (4, 9)	1
  (5, 8)	1
  (5, 5)	1
  (5, 0)	1
[[0 0 0 0 0 0 1 0 0 0 1 0 1]
 [0 0 0 0 0 0 0 1 0 0 1 0 1]
 [0 1 0 1 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 1 0 0 0 1 2]
 [0 0 1 0 0 0 0 0 1 1 0 0 0]
 [1 0 0 0 0 1 0 0 1 0 0 0 0]]


In [8]:
print(count_model.vocabulary_)

{'shakespeare': 10, 'wrote': 12, 'plays': 6, 'poems': 7, 'hugo': 1, 'novels': 3, 'verne': 11, 'novelsrimbaud': 4, 'john': 2, 'read': 8, 'science': 9, 'peter': 5, 'books': 0}


In [9]:
print(count_model.vocabulary_['shakespeare'])

10
