# NLP - IR
### Luis Gabriel Moreno Sandoval
### morenoluis@javeriana.edu.co

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import SnowballStemmer
import nltk
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# tokenizer model that nltk.word_tokenize depends on.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.9 loads Punkt from the separate 'punkt_tab' package; without it
# nltk.word_tokenize raises LookupError on current releases.
nltk.download('punkt_tab')
stop_words_english = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def pre_process(text):
  """Normalise a raw document before tokenisation by lower-casing it.

  Args:
    text: The document as a string.

  Returns:
    A lower-cased copy of ``text``.
  """
  return text.lower()

In [None]:
def tokenize_stemmer(text):
  """Split ``text`` into word tokens and reduce each one to its stem.

  Tokens come from ``nltk.word_tokenize``. Any token that is not purely
  alphabetic (per ``str.isalpha``) is discarded, which drops punctuation and
  numbers; stop words are NOT removed here. The surviving tokens are stemmed
  with the module-level ``stemmer``.

  Args:
    text: The document as a string.

  Returns:
    A list of stemmed tokens, in document order.
  """
  stems = []
  for token in nltk.word_tokenize(text):
    if token.isalpha():
      stems.append(stemmer.stem(token))
  return stems

In [None]:
# Toy corpus: each entry is one document to index.
docs_raw = [
    'one fish, two fish',
    'Red fish, blue fish',
    'cat in The Hat',
    'Green eggs and ham',
]

# scikit-learn filters stop words AFTER the custom tokenizer has run, i.e. it
# compares *stemmed* tokens against the stop list. Push the stop list through
# the same lower-case/tokenise/stem pipeline so that stemmed stop words
# (e.g. "because" -> "becaus") are actually removed; the unstemmed list is
# what triggers the "stop_words may be inconsistent with your preprocessing"
# UserWarning during fit.
stop_words_stemmed = sorted({
    token
    for word in stop_words_english
    for token in tokenize_stemmer(pre_process(word))
})

tfidf = TfidfVectorizer(preprocessor=pre_process, tokenizer=tokenize_stemmer, stop_words=stop_words_stemmed)
tfs = tfidf.fit_transform(docs_raw)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms as an array.
tfidf.get_feature_names_out()

  % sorted(inconsistent)


['blue', 'cat', 'egg', 'fish', 'green', 'ham', 'hat', 'one', 'red', 'two']

In [None]:
# The TF-IDF matrix has one row per document and one column per vocabulary term.
docs_num, feature_num = tfs.shape
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns the same terms as an array that supports the same integer indexing.
feature_names = tfidf.get_feature_names_out()
print("n_docs: %d, n_features: %d" % tfs.shape)

n_docs: 4, n_features: 10


In [None]:
print("###### Calculo de Feature Names ######")
for feature_idx in range(feature_num):
    # Gather this term's TF-IDF weight in every document (one matrix column).
    weights_per_doc = []
    for doc_idx in range(docs_num):
        weights_per_doc.append(tfs[doc_idx, feature_idx])
    print(" # ", feature_idx, " - ", feature_names[feature_idx], " - ", weights_per_doc)

###### Calculo de Feature Names ######
 #  0  -  blue  -  [0.0, 0.47212002654617047, 0.0, 0.0]
 #  1  -  cat  -  [0.0, 0.0, 0.7071067811865476, 0.0]
 #  2  -  egg  -  [0.0, 0.0, 0.0, 0.5773502691896257]
 #  3  -  fish  -  [0.7444497035180324, 0.7444497035180324, 0.0, 0.0]
 #  4  -  green  -  [0.0, 0.0, 0.0, 0.5773502691896257]
 #  5  -  ham  -  [0.0, 0.0, 0.0, 0.5773502691896257]
 #  6  -  hat  -  [0.0, 0.0, 0.7071067811865476, 0.0]
 #  7  -  one  -  [0.47212002654617047, 0.0, 0.0, 0.0]
 #  8  -  red  -  [0.0, 0.47212002654617047, 0.0, 0.0]
 #  9  -  two  -  [0.47212002654617047, 0.0, 0.0, 0.0]


In [None]:

# Project three free-text queries onto the fitted vocabulary, then score each
# query (rows) against each indexed document (columns) with cosine similarity.
queries = ["one fish hat hat", "blue eggs hat", "fish, otra palabra blues"]
response = tfidf.transform(queries)
print('response:', response)
print("n_question: %d, n_features: %d" % response.shape)
cosine_similarity_response = cosine_similarity(response, tfs)
print("cosine_similarity ", cosine_similarity_response)


response:   (0, 7)	0.4217647821447532
  (0, 6)	0.8435295642895064
  (0, 3)	0.3325241986862672
  (1, 6)	0.5773502691896257
  (1, 2)	0.5773502691896257
  (1, 0)	0.5773502691896257
  (2, 3)	0.6191302964899972
  (2, 0)	0.7852882757103967
n_question: 3, n_features: 10
cosine_similarity  [[0.44667114 0.24754754 0.59646548 0.        ]
 [0.         0.27257862 0.40824829 0.33333333]
 [0.46091137 0.83166169 0.         0.        ]]


In [None]:
# Display the sparse-matrix summary (shape, dtype, stored elements) of the query vectors.
response

<3x10 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [None]:
!pip install joblib


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datetime import datetime
from joblib import dump

# Persist the fitted vectoriser under a file name stamped with today's date
# (YYYY-MM-DD); dump() returns the list of file names it wrote.
today_stamp = datetime.now().strftime('%Y-%m-%d')
data_filename_memmap = f"file_tfidf_{today_stamp}.vec"
dump(tfidf, data_filename_memmap)

['file_tfidf_2022-09-10.vec']

In [None]:
from joblib import load

# Reload the file written by the dump cell through its variable instead of a
# hard-coded, date-stamped literal: the literal only matches on the day the
# notebook was originally run, so a fresh run on any other date cannot find it.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced yourself.
tf_idf_2_re = load(data_filename_memmap, mmap_mode='r')

In [None]:
# Repeat the similarity search with the vectoriser restored from disk to check
# that the reloaded model vectorises queries against the same vocabulary.
queries = ["fish blue cat persona"]
response = tf_idf_2_re.transform(queries)
print('response:', response)
print("n_question: %d, n_features: %d" % response.shape)
cosine_similarity_response = cosine_similarity(response, tfs)
print("cosine_similarity ", cosine_similarity_response)

response:   (0, 3)	0.48693426407352264
  (0, 1)	0.6176143709756019
  (0, 0)	0.6176143709756019
n_question: 1, n_features: 10
cosine_similarity  [[0.36249807 0.65408618 0.43671931 0.        ]]


  % sorted(inconsistent)


In [None]:
import unicodedata
def proper_encoding(text):
    """Return ``text`` reduced to plain ASCII.

    The string is decomposed with Unicode NFD normalisation, so an accented
    letter becomes its base letter followed by a combining mark. Encoding to
    ASCII with ``errors='ignore'`` then drops the marks, along with any other
    character that has no ASCII representation.

    Args:
        text: The input string.

    Returns:
        The ASCII-only string.
    """
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")

In [None]:
# Quick check: accented Spanish characters should come back as their ASCII base letters.
text = "Holá Miño otro lingüística."
ascii_text = proper_encoding(text)
print(ascii_text)

Hola Mino otro linguistica.
