In [48]:
# Install the required packages
!pip install nltk
!pip install gensim



In [49]:
import nltk
import numpy as np
import pandas as pd
import re

In [50]:
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# (read via nltk.corpus.stopwords) and WordNet (used for lemmatization).
nltk.download("stopwords")
nltk.download("wordnet")
# NOTE(review): "punkt" is not referenced by the cells visible here —
# presumably intended for NLTK's tokenizers; confirm before removing.
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
def jaccard_similarity(set_a, set_b):
  """Return the Jaccard coefficient |A ∩ B| / |A ∪ B| of two collections.

  Args:
    set_a: Any iterable of hashable items (set, frozenset, list, tuple, ...).
      Duplicates are ignored.
    set_b: Same as ``set_a``.

  Returns:
    A float in [0.0, 1.0]. When both collections are empty the coefficient
    is defined as 1.0 (two empty sets are treated as identical).
  """
  # Coerce to real sets so that lists, tuples and frozensets work as well;
  # the unbound ``set.intersection(x, ...)`` form rejects anything but ``set``.
  items_a, items_b = set(set_a), set(set_b)

  # Size of the union
  num_union = len(items_a | items_b)
  if num_union == 0:
    # Both inputs are empty: avoid 0/0 and report them as identical.
    return 1.0

  # Size of the intersection
  num_intersection = len(items_a & items_b)
  return num_intersection / num_union

In [52]:
def tokenize_text(text):
  """Drop every period and comma from ``text`` and split it on whitespace.

  Other punctuation (quotes, parentheses, semicolons, ...) is left attached
  to the neighbouring token.
  """
  # Splitting on the punctuation class and re-joining with nothing in between
  # removes those characters without inserting any separator.
  without_marks = ''.join(re.split('[.,]', text))
  return without_marks.split()


In [53]:
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

def lemmatize_word(word):
    """Lower-case ``word`` and reduce it to its WordNet base form.

    Examples: ``Python`` -> ``python``, ``cooked`` -> ``cook``.

    Returns the lower-cased word unchanged when ``wn.morphy`` has no base
    form for it (i.e. it returns ``None``).
    """
    lowered = word.lower()
    base_form = wn.morphy(lowered)
    return lowered if base_form is None else base_form

In [54]:
#1 NLTK's English stop-word list
# Kept as a module-level name because preprocessing_text reads it directly.
en_stop = nltk.corpus.stopwords.words('english')
print(en_stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [55]:
def remove_stopwords(word, stopwordset):
  """Return ``word`` unless it is a stop word, in which case return ``None``.

  ``stopwordset`` may be any container supporting the ``in`` operator.
  """
  return None if word in stopwordset else word

In [56]:
def preprocessing_text(text):
  """Turn raw text into a list of lemmatized, stop-word-free tokens.

  Pipeline: tokenize -> lemmatize each token -> drop tokens that
  ``remove_stopwords`` rejects (using the module-level ``en_stop`` list).
  No separate text-cleaning step is applied before tokenization.
  """
  kept_tokens = []
  for raw_token in tokenize_text(text):
    candidate = remove_stopwords(lemmatize_word(raw_token), en_stop)
    if candidate is not None:
      kept_tokens.append(candidate)
  return kept_tokens


In [57]:
def bow_vectorizer(docs):
  """Build bag-of-words count vectors for a collection of tokenized documents.

  Args:
    docs: Sequence of documents, each a sequence of hashable tokens.

  Returns:
    ``(vectors, vocab)`` where ``vocab`` maps each token to a column index
    (assigned in order of first appearance) and ``vectors[i][vocab[t]]`` is
    the number of times token ``t`` occurs in ``docs[i]``.
  """
  # First pass: assign an index to every distinct token.
  vocab = {}
  for tokens in docs:
    for token in tokens:
      vocab.setdefault(token, len(vocab))

  # Second pass: count occurrences per document.
  width = len(vocab)
  vectors = []
  for tokens in docs:
    counts = [0] * width
    for token in tokens:
      counts[vocab[token]] += 1
    vectors.append(counts)
  return vectors, vocab

In [58]:
def tfidf_vectorizer(docs):
  """Build TF-IDF vectors for a collection of tokenized documents.

  For document ``d`` and term ``t``:
    tf(t, d) = count of t in d / number of tokens in d
    idf(t)   = ln(number of documents / number of documents containing t)

  Args:
    docs: Sequence of documents, each a sequence of hashable tokens. It is
      iterated more than once and ``len(docs)`` is used, so it must not be a
      one-shot generator.

  Returns:
    ``(vectors, word2id)`` where ``word2id`` maps each term to a column index
    (assigned in order of first appearance) and ``vectors`` is a list with one
    list of floats per document. A document without any tokens gets an
    all-zero vector (instead of NaNs from dividing by a zero token count).
  """
  # Vocabulary: term -> column index, in order of first appearance.
  word2id = {}
  for doc in docs:
    for w in doc:
      if w not in word2id:
        word2id[w] = len(word2id)
  num_terms = len(word2id)

  # Document frequency of each term. Every vocabulary term occurs in at least
  # one document, so no entry stays at zero and the division below is safe.
  doc_freq = np.zeros(num_terms)
  for doc in docs:
    for term in set(doc):
      doc_freq[word2id[term]] += 1
  # IDF depends only on the corpus, so compute it once for all documents.
  idf_values = np.log(len(docs) / doc_freq)

  vectors = []
  for doc in docs:
    term_counts = np.zeros(num_terms)
    for term in doc:
      term_counts[word2id[term]] += 1
    total = term_counts.sum()
    # Guard against empty documents: their term frequencies stay all zero.
    tf_values = term_counts / total if total > 0 else term_counts
    vectors.append((tf_values * idf_values).tolist())

  return vectors, word2id
  

In [75]:
# Sample corpus: ten short English passages (WHO/COVID-19, Japan, Hong Kong
# security law, the Japanese language, YouTube, Google Search, SoftBank,
# Huawei, Shinzō Abe, IBM Watson) that are vectorized and compared pairwise.
texts = ['While organizing the global response to the COVID-19 pandemic and overseeing "more than 35 emergency operations" for cholera, measles and other epidemics internationally, the WHO has been criticized for praising China\'s public health response to the crisis while seeking to maintain a "diplomatic balancing act" between China and the United States. Commentators including John Mackenzie of the WHO\'s emergency committee and Anne Schuchat of the US CDC have stated that China\'s official tally of cases and deaths may be an underestimation. David Heymann, professor of infectious disease epidemiology at the London School of Hygiene and Tropical Medicine, said in response that "China has been very transparent and open in sharing its data... and they opened up all of their files with the WHO."',
         'Japan or Nihon is an island country in East Asia located in the northwest Pacific Ocean. It borders the Sea of Japan to the west, and it extends from the Sea of Okhotsk in the north to the East China Sea and Taiwan in the south. Japan is part of the Pacific Ring of Fire and comprises an archipelago of 6,852 islands covering 377,975 square kilometers (145,937 sq mi); its five main islands, from north to south, are Hokkaido, Honshu, Shikoku, Kyushu, and Okinawa. Tokyo is the country\'s capital and largest city; other major cities include Osaka and Nagoya.',
         'In 2020, in a period of large-scale protests, the Standing Committee of the National People\'s Congress passed the controversial Hong Kong national security law. The law criminalises acts that were previously considered protected speech under Hong Kong law and establishes the Office for Safeguarding National Security of the CPG in the HKSAR, an investigative office under Central People\'s Government authority immune from HKSAR jurisdiction. The United Kingdom considers the law to be a serious violation of the Joint Declaration.',
         'Japanese is an East Asian language spoken by about 128 million people, primarily in Japan, where it is the national language. It is a member of the Japonic language family, and its relation to other languages, such as Korean, is debated. Japonic languages have been grouped with other language families such as Ainu, Austroasiatic, and the now-discredited Altaic, but none of these proposals has gained widespread acceptance.',
         'YouTube allows users to upload, view, rate, share, add to playlists, report, comment on videos, and subscribe to other users. It offers a wide variety of user-generated and corporate media videos. Available content includes video clips, TV show clips, music videos, short and documentary films, audio recordings, movie trailers, live streams, and other content such as video blogging, short original videos, and educational videos. Most content on YouTube is uploaded by individuals, but media corporations including CBS, the BBC, Vevo, and Hulu offer some of their material via YouTube as part of the YouTube partnership program. Unregistered users can only watch (but not upload) videos on the site, while registered users are also permitted to upload an unlimited number of videos and add comments to videos. Videos that are age-restricted are available only to registered users affirming themselves to be at least 18 years old.',
         'The order of search results returned by Google is based, in part, on a priority rank system called "PageRank". Google Search also provides many different options for customized search, using symbols to include, exclude, specify or require certain search behavior, and offers specialized interactive experiences, such as flight status and package tracking, weather forecasts, currency, unit and time conversions, word definitions, and more.',
         'SoftBank Group Corp. is a Japanese multinational conglomerate holding company headquartered in Tokyo. SoftBank owns stakes in many technology, energy, and financial companies. It also runs Vision Fund, the world\'s largest technology-focused venture capital fund, with over $100 billion in capital.',
         'In August 2018, the National Defense Authorization Act for Fiscal Year 2019 was signed into law, containing a provision that banned Huawei and ZTE equipment from being used by the U.S. federal government, citing security concerns. Huawei filed a lawsuit over the act in March 2019, alleging it to be unconstitutional because it specifically targeted Huawei without granting it a chance to provide a rebuttal or due process.',
         'Shinzō Abe is a Japanese politician who has served as Prime Minister of Japan and President of the Liberal Democratic Party since 2012. He previously served as Chief Cabinet Secretary from 2005 to 2006, and as Prime Minister from 2006 to 2007. He is the longest-serving Prime Minister in Japanese history.',
         'In February 2013, IBM announced that Watson software system\'s first commercial application would be for utilization management decisions in lung cancer treatment at Memorial Sloan Kettering Cancer Center, New York City, in conjunction with WellPoint. In 2013, Manoj Saxena, IBM Watson\'s business chief said that 90% of nurses in the field who use Watson now follow its guidance.']

# Echo every passage, each followed by a blank line, so the raw input is
# visible next to the similarity tables.
for t in texts:
  print(t, '\n')


While organizing the global response to the COVID-19 pandemic and overseeing "more than 35 emergency operations" for cholera, measles and other epidemics internationally, the WHO has been criticized for praising China's public health response to the crisis while seeking to maintain a "diplomatic balancing act" between China and the United States. Commentators including John Mackenzie of the WHO's emergency committee and Anne Schuchat of the US CDC have stated that China's official tally of cases and deaths may be an underestimation. David Heymann, professor of infectious disease epidemiology at the London School of Hygiene and Tropical Medicine, said in response that "China has been very transparent and open in sharing its data... and they opened up all of their files with the WHO." 

Japan or Nihon is an island country in East Asia located in the northwest Pacific Ocean. It borders the Sea of Japan to the west, and it extends from the Sea of Okhotsk in the north to the East China Sea 

In [60]:
def cosine_similarity(list_a, list_b):
  """Return the cosine of the angle between two equal-length numeric vectors.

  Args:
    list_a: Sequence (or array) of numbers.
    list_b: Sequence (or array) of numbers with the same length as ``list_a``.

  Returns:
    ``a·b / (|a| |b|)``. When a norm is zero the quotient is undefined, so
    the result follows the same convention as ``jaccard_similarity``:
    1.0 if both vectors are all-zero (identical "empty" documents) and
    0.0 if only one of them is.
  """
  vec_a = np.array(list_a)
  vec_b = np.array(list_b)
  norm_a = np.linalg.norm(vec_a)
  norm_b = np.linalg.norm(vec_b)
  denominator = norm_a * norm_b
  # NumPy scalar division by zero does not raise ZeroDivisionError; it yields
  # NaN/inf with a RuntimeWarning, so the zero case must be tested explicitly.
  if denominator == 0:
    return 1.0 if (norm_a == 0 and norm_b == 0) else 0.0
  return vec_a.dot(vec_b) / denominator

In [61]:
def euclidean_distance(list_a, list_b):
  """Return the Euclidean (L2) distance between two equal-length vectors."""
  return np.linalg.norm(np.array(list_a) - np.array(list_b))

In [62]:
# Preprocess every passage into a list of lemmatized, stop-word-free tokens.
ttt = [preprocessing_text(t) for t in texts]
# Vectorize the tokenized corpus both ways; the vocabulary mapping returned as
# the second value is discarded in each case.
bow_vec, _ = bow_vectorizer(ttt)
tfidf_vec, _ = tfidf_vectorizer(ttt)
# Dump the raw vectors (one row per passage, one column per vocabulary term).
print(bow_vec)
print(tfidf_vec)

[[1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [80]:
def print_sim(vec, sim_fun):
  """Print the pairwise ``sim_fun`` matrix of the vectors in ``vec``.

  Rows and columns are labelled with 1-based, zero-padded indices; every
  value is rendered with the ``06.3f`` format. A blank line follows the table.

  Args:
    vec: Sequence of vectors.
    sim_fun: Callable taking two vectors and returning a number.
  """
  size = len(vec)
  column_labels = [f'    {col_no:02d}' for col_no in range(1, size + 1)]
  print('  ', *column_labels)
  for row_no, row_vec in enumerate(vec, start=1):
    print(f'{row_no:02d} ', end='')
    for col_vec in vec:
      print(f'{sim_fun(row_vec, col_vec):06.3f} ', end='')
    print('')
  print('')


In [81]:
# Print one pairwise table per (vectorizer, measure) combination, in the
# order: BoW/cosine, BoW/Euclidean, TF-IDF/cosine, TF-IDF/Euclidean.
for heading, vectors, measure in (
    ('BoW, cosine', bow_vec, cosine_similarity),
    ('BoW, Euclidean', bow_vec, euclidean_distance),
    ('TF idf, cosine', tfidf_vec, cosine_similarity),
    ('TF idf, Euclidean', tfidf_vec, euclidean_distance),
):
  print(heading)
  print_sim(vectors, measure)

BoW, cosine
       01     02     03     04     05     06     07     08     09     10
01 01.000 00.023 00.024 00.026 00.013 00.014 00.000 00.029 00.031 00.015 
02 00.023 01.000 00.000 00.065 00.020 00.028 00.070 00.000 00.046 00.015 
03 00.024 00.000 01.000 00.040 00.000 00.000 00.000 00.152 00.016 00.000 
04 00.026 00.065 00.040 01.000 00.000 00.000 00.039 00.017 00.069 00.000 
05 00.013 00.020 00.000 00.000 01.000 00.047 00.010 00.000 00.000 00.000 
06 00.014 00.028 00.000 00.000 00.047 01.000 00.042 00.018 00.000 00.000 
07 00.000 00.070 00.000 00.039 00.010 00.042 01.000 00.000 00.046 00.000 
08 00.029 00.000 00.152 00.017 00.000 00.018 00.000 01.000 00.000 00.019 
09 00.031 00.046 00.016 00.069 00.000 00.000 00.046 00.000 01.000 00.020 
10 00.015 00.015 00.000 00.000 00.000 00.000 00.000 00.019 00.020 01.000 

BoW, Euclidean
       01     02     03     04     05     06     07     08     09     10
01 00.000 12.961 12.845 12.329 18.708 12.042 11.180 11.662 11.489 11.619 
02 12.961 00