# Semantic Search Testing Ground
### This script is used as a precursor notepad to load sentence embeddings from SBERT and use them to perform semantic search on the review dataset

In [None]:
pip install sentence-transformers



In [None]:
import os
import pickle

import pandas as pd
import scipy
import scipy.spatial.distance
from sentence_transformers import SentenceTransformer

# LOAD BERT SENTENCE MODEL

In [None]:
# Load a pretrained Sentence-BERT model.
# More models are available under Natural Language Inference (NLI): https://github.com/UKPLab/sentence-transformers/docs/pretrained-models/nli-models.md

# Instantiate the pretrained model by name
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Report the maximum input sequence length the model accepts
print(f"Max Sequence Length: {model.max_seq_length}")

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Max Sequence Length: 128


In [None]:
def read_csv(filepath):
    """Read a CSV file, trying several separators and encodings.

    For each separator in turn (',', ';', tab) the encodings are tried in
    order until one decodes and parses. The first parse that yields more than
    one column is returned. If every successful parse produces a single
    column, the first successful parse is returned instead, which is what a
    plain comma-separated read would give for a genuine one-column file.

    Parameters
    ----------
    filepath : str
        Path to the file. Only paths ending in ``.csv`` (case-insensitive)
        are read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when ``filepath`` does not have a
        ``.csv`` extension.

    Raises
    ------
    ValueError
        If no separator/encoding combination can decode and parse the file.
    OSError
        Missing or unreadable files propagate unchanged (for example
        ``FileNotFoundError``) instead of being reported as an encoding
        problem.
    """
    if os.path.splitext(filepath)[1].lower() != '.csv':
        return None

    seps = [',', ';', '\t']                    # ',' is the pandas default
    # None is the pandas default. ISO-8859-1 can decode any byte sequence, so
    # the encodings listed after it are only reached when parsing itself fails.
    encodings = [None, 'utf-8', 'ISO-8859-1', 'utf-16', 'ascii']

    fallback = None
    for sep in seps:
        for encoding in encodings:
            try:
                frame = pd.read_csv(filepath, encoding=encoding, sep=sep)
            except (UnicodeError, pd.errors.ParserError):
                # Wrong encoding or malformed rows for this combination:
                # move on to the next candidate.
                continue
            if frame.shape[1] > 1:
                return frame
            # A single column usually means the separator was wrong; keep the
            # first such parse in case no separator does any better.
            if fallback is None:
                fallback = frame
            break  # this separator decoded fine; try the next separator

    if fallback is not None:
        return fallback
    raise ValueError("{!r} has no encoding in {} or separator in {}"
                     .format(filepath, encodings, seps))

# DATA CLEANING AND LOADING

In [50]:
# DATA PROVENANCE
# CHECK SOURCE DATA AND CLEAN
# Data file and previous data provenance available at https://www.kaggle.com/thomaskonstantin/top-10000-anime-movies-ovas-and-tvshows

TEXT_DATA_DIR = '/otaku_search_engine/data'
DATA_FILE = "Anime_Top10000.csv"
raw_df = read_csv(os.path.join(TEXT_DATA_DIR, DATA_FILE))

# Keep only the first row for each distinct synopsis.
# Not applied on purpose:
#   - dropping rows with a null Synopsis: the original dataset has no null values
#   - dropping duplicate Anime_Name rows: removing duplicate synopses already covers it
raw_df_clean = raw_df.drop_duplicates(subset=['Synopsis'])
print(raw_df_clean.head(5))
print(len(raw_df_clean.index))

# Write the cleaned file next to the raw one. The following cell reads it back
# from TEXT_DATA_DIR, so writing to the current working directory would only
# work when the notebook happens to run from that folder.
cleaned_path = os.path.join(TEXT_DATA_DIR, os.path.splitext(DATA_FILE)[0] + '_cleaned.csv')
raw_df_clean.to_csv(cleaned_path)


                           Anime_Name  ...                                           Synopsis
0    Fullmetal Alchemist: Brotherhood  ...  "In order for something to be obtained, someth...
1  Shingeki no Kyojin Season 3 Part 2  ...  Seeking to restore humanity's diminishing hope...
2                         Steins;Gate  ...  The self-proclaimed mad scientist Rintarou Oka...
3                            Gintama°  ...  Gintoki, Shinpachi, and Kagura return as the f...
4              Hunter x Hunter (2011)  ...  Hunter x Hunter is set in a world where Hunter...

[5 rows x 5 columns]
9507


In [51]:
# Build the corpus: a list holding one synopsis string per row of the cleaned file.
TEXT_DATA_DIR = '/otaku_search_engine/data'
DATA_FILE = "Anime_Top10000_cleaned.csv"

input_df = read_csv(os.path.join(TEXT_DATA_DIR, DATA_FILE))
print(f"Processing... {len(input_df.index)} rows.")

# sentences looks like ['synopsis 1', 'synopsis 2', 'synopsis 3', ..., 'synopsis x']
sentences = input_df['Synopsis'].tolist()

# Show the first synopsis as a quick sanity check
print(sentences[0])

# Each synopsis is encoded as a 1-D vector; its length is printed below.
# NOTE(review): an earlier comment said "78 columns" - BERT-base embeddings are
# normally 768 wide, so confirm against the printed value.
sentence_embeddings = model.encode(sentences)

print('Single BERT embedding vector - length', len(sentence_embeddings[0]))

Processing... 9507 rows.
"In order for something to be obtained, something of equal value must be lost."

Alchemy is bound by this Law of Equivalent Exchange—something the young brothers Edward and Alphonse Elric only realize after attempting human transmutation: the one forbidden act of alchemy. They pay a terrible price for their transgression—Edward loses his left leg, Alphonse his physical body. It is only by the desperate sacrifice of Edward's right arm that he is able to affix Alphonse's soul to a suit of armor. Devastated and alone, it is the hope that they would both eventually return to their original bodies that gives Edward the inspiration to obtain metal limbs called "automail" and become a state alchemist, the Fullmetal Alchemist.

Three years of searching later, the brothers seek the Philosopher's Stone, a mythical relic that allows an alchemist to overcome the Law of Equivalent Exchange. Even with military allies Colonel Roy Mustang, Lieutenant Riza Hawkeye, and Lieute

In [52]:
# Persist the synopses together with their embeddings so a later session can
# reload them instead of re-encoding the whole corpus.
with open('/otaku_search_engine/embeddings/otaku_embeddings.pkl', "wb") as pkl_file:
    pickle.dump(
        {'sentences': sentences, 'embeddings': sentence_embeddings},
        pkl_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Reload the cached synopses and embeddings.
# pickle can execute arbitrary code while loading, so only open files that
# this notebook (or another trusted source) produced.
with open('/otaku_search_engine/embeddings/otaku_embeddings.pkl', "rb") as pkl_file:
    stored_data = pickle.load(pkl_file)

stored_sentences = stored_data['sentences']
stored_embeddings = stored_data['embeddings']


# PERFORM SEMANTIC SEARCH

In [64]:
#@title Semantic Search POST

# Other example queries:
#query = 'demons oni'
#query = 'dragon china fantasy'
#query = "muscle cars"
#query = 'ninja shogun'
query = 'magical monkey'

queries = [query]
query_embeddings = model.encode(queries)

# Find the closest animes of the corpus for each query sentence based on cosine similarity
num_top_matches = 5 #@param {type: "number"}

# Matches are mapped back to titles by row position, so the stored embeddings
# and the title table must describe the same rows in the same order.
assert len(stored_embeddings) == len(input_df), (
    "stored_embeddings and input_df have different lengths; "
    "re-run the embedding cell for the currently loaded data file"
)

print("Semantic Search Results")

for query_text, query_embedding in zip(queries, query_embeddings):
    # cdist returns a 1 x N matrix of cosine distances; take its single row
    distances = scipy.spatial.distance.cdist([query_embedding], stored_embeddings, "cosine")[0]

    # Row positions of the smallest distances; a stable sort keeps corpus order among ties
    closest = distances.argsort(kind="stable")[:num_top_matches]

    print("\n\n======================\n\n")
    print("Query:", query_text)
    print("\nTop " + str(num_top_matches) + " most similar sentences in corpus:")

    for idx in closest:
        # .iloc is positional, matching how the embeddings were generated from the rows
        title = input_df['Anime_Name'].iloc[idx].strip()
        print(title, "(Cosine Similarity Score: %.4f)" % (1 - distances[idx]))

Semantic Search Results




Query: magical monkey

Top 5 most similar sentences in corpus:
Legend of Lemnear: Kyokuguro no Tsubasa Valkisas (Cosine Similarity Score: 0.6179)
Chocolat no Mahou (Cosine Similarity Score: 0.6028)
Himote House (Cosine Similarity Score: 0.5953)
SOS TV Walpurgis Night Fever (Cosine Similarity Score: 0.5499)
Slime Boukenki: Umi da, Yeah! (Cosine Similarity Score: 0.5453)
