In [20]:
import csv
import json
import os
import pymongo
from dotenv import load_dotenv
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


load_dotenv()

def dotProduct(a,b):
    dotproduct = 0.0
    for k,v in enumerate(a):
        dotproduct+=a[k]*b[k]
    return dotproduct

def normalize(v):
    sum_of_squares = 0.0
    for i in v:
        sum_of_squares+=math.pow(i,2)

    return [x / math.sqrt(sum_of_squares) for x in v]

def vectorSearch(collection,inputVector,algo,normalized=True):
  pipeline = [
    {
      "$search": {
        "index": "default",
        "knnBeta": {
          "vector": normalize(inputVector) if normalized else inputVector,
          "path": "2d.{}.{}".format('normalized' if normalized else 'raw',algo),
          "filter": {
            "compound": {
              "must":[
                  {
                    "text":{
                        "query":"animal",
                        "path":"type"
                    }
                  }
              ],
              "mustNot":[
                {
                  "equals":{
                    "value":input["_id"],
                    "path":"_id"
                  }
                }
              ]
            }
          },
          "k": 1
        }
      }
    }
  ]

  return list(collection.aggregate(pipeline))

def maximal_marginal_relevance(sentence_vector, documents, embedding_matrix, lambda_constant=0.5, threshold_docs=10):
    """
    With thanks to: https://gist.github.com/aditya00kumar/011b6ad309de616e15c32b5efcd9f66d#file-mmr-py
    Return ranked documents using MMR. Cosine similarity is used as similarity measure.
    :param sentence_vector: Query vector
    :param phrases: list of candidate documents
    :param embedding_matrix: matrix having index as document ID and values as vector
    :param lambda_constant: 0.5 to balance diversity and accuracy. if lambda_constant is high, then higher accuracy. If lambda_constant is low then high diversity.
    :param threshold_docs: number of terms to include in result set
    :return: Ranked documents with score
    """
    # todo: Use cosine similarity matrix for lookup among phrases instead of making call everytime.
    s = []
    r = documents.index.to_list()
    while len(r) > 0:
        score = 0
        docid_to_add = ''
        for i in r:
            first_part = cosine_similarity([sentence_vector], [embedding_matrix.loc[i]])[0][0]
            second_part = 0
            for j in s:
                cos_sim = cosine_similarity([embedding_matrix.loc[i]], [embedding_matrix.loc[j['_id']]])[0][0]
                if cos_sim > second_part:
                    second_part = cos_sim
            equation_score = lambda_constant*(first_part)-(1-lambda_constant) * second_part
            if equation_score > score:
                score = equation_score
                docid_to_add = i
        if docid_to_add == '':
            docid_to_add = i
        r.remove(docid_to_add)
        doc_to_add = documents.loc[docid_to_add].to_dict()
        doc_to_add['score'] = score
        doc_to_add['_id']=i
        s.append(doc_to_add)
    return (s, s[:threshold_docs])[threshold_docs < len(s)]

In [None]:
MDB_URI = os.getenv('MDB_URI')
print(MDB_URI)
animals = []
with open('animals.csv',newline='') as csvFile:
        reader = csv.DictReader(csvFile,fieldnames=['species','body_weight','brain_weight'])
        for i,row in enumerate(reader):
            if i > 0:
                row['_id'] = i
                row['2d'] = {
                    'raw':{
                        'euclidean':[float(row['body_weight']),float(row['brain_weight'])],
                        'dotproduct':[float(row['body_weight']),float(row['brain_weight'])],
                        'cosine':[float(row['body_weight']),float(row['brain_weight'])]
                    },
                    'normalized':{
                        'euclidean':normalize([float(row['body_weight']),float(row['brain_weight'])]),
                        'dotproduct':normalize([float(row['body_weight']),float(row['brain_weight'])]),
                        'cosine':normalize([float(row['body_weight']),float(row['brain_weight'])])
                    }
                }
                row['dimensions'] = 2
                row['body_weight'] = float(row['body_weight'])
                row['brain_weight'] = float(row['brain_weight'])
                row['type'] = 'animal'
                animals.append(row)

client = pymongo.MongoClient(MDB_URI)
collection = client['test']['vectors']

collection.delete_many({"type":'animal'})
collection.insert_many(animals)

In [15]:
input = {"_id":2,"species":"Cow","body_weight":465.000,"brain_weight":423.000}

def makeGraphData (algo,items,normalized=True):
  graphData = []
  for v in items:
    graphData.append(
      {
        "species":v["species"],
        "x":v["2d"]["normalized" if normalized else "raw"][algo][0],
        "y":v["2d"]["normalized" if normalized else "raw"][algo][1]
      }
    )
  return graphData

for i,algo in enumerate(["euclidean","cosine","dotproduct"]):

  fig = make_subplots(rows=1,cols=2,subplot_titles=["Normalized Values","Raw Values"])

  inputVector = [input['body_weight'],input['brain_weight']]
  inputGraphDataNorm=normalize(inputVector)

  results = vectorSearch(collection,inputVector,algo,normalized=True)
  nearest = results[0]
  nearestGraphDataNorm = makeGraphData(algo,[nearest],normalized=True)

  animals = list(collection.find({'_id':{"$nin":[input['_id'],nearest['_id']]},"type":"animal"}))
  otherGraphDataNorm = makeGraphData(algo,animals,normalized=True)

  normTrace0=go.Scatter(hoverinfo="text",hovertext=[input['species']], x=[inputGraphDataNorm[0]],y=[inputGraphDataNorm[1]],mode="markers",name="Input - "+input["species"],legendgroup="1",legendgrouptitle={"text":"Normalized"})
  normTrace1=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in nearestGraphDataNorm], x=[d["x"] for d in nearestGraphDataNorm],y=[d["y"] for d in nearestGraphDataNorm],mode="markers",name="Nearest - "+nearest["species"],legendgroup="1",legendgrouptitle={"text":"Normalized"})
  normTrace2=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in otherGraphDataNorm], x=[d["x"] for d in otherGraphDataNorm],y=[d["y"] for d in otherGraphDataNorm],mode="markers",name="Others",legendgroup="1",legendgrouptitle={"text":"Normalized"})

  for trace in [normTrace0,normTrace1,normTrace2]:
    fig.add_trace(trace,row=1,col=1)

  inputGraphDataRaw=inputVector

  results = vectorSearch(collection,inputVector,algo,normalized=False)
  nearest = results[0]
  nearestGraphDataRaw = makeGraphData(algo,[nearest],normalized=False)
  
  animals = list(collection.find({'_id':{"$nin":[input['_id'],nearest['_id']]},"type":"animal"}))
  otherGraphDataRaw = makeGraphData(algo,animals,normalized=False)

  rawTrace0=go.Scatter(hoverinfo="text",hovertext=[input['species']], x=[inputGraphDataRaw[0]],y=[inputGraphDataRaw[1]],mode="markers",name="Input - "+input["species"],legendgroup="2",legendgrouptitle={"text":"Raw"})
  rawTrace1=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in nearestGraphDataRaw], x=[d["x"] for d in nearestGraphDataRaw],y=[d["y"] for d in nearestGraphDataRaw],mode="markers",name="Nearest - "+nearest["species"],legendgroup="2",legendgrouptitle={"text":"Raw"})
  rawTrace2=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in otherGraphDataRaw], x=[d["x"] for d in otherGraphDataRaw],y=[d["y"] for d in otherGraphDataRaw],mode="markers",name="Others",legendgroup="2",legendgrouptitle={"text":"Raw"})

  for trace in [rawTrace0,rawTrace1,rawTrace2]:
    fig.add_trace(trace,row=1,col=2)
  
  layout = {
    "xaxis2":{"title":"body_weight (kg)"},
    "yaxis2":{"title":"brain_weight (g)"}
  }
  # fig["layout"]["xaxis"]["title"]="body_weight (kg)"
  fig.update_layout(layout)
  fig.show()

# Taking the 'average' vector
Take the average vector of a group of animals and plot it amongst them.

In [16]:
def averageVector(vectors):
    num = len(vectors)
    length = len(vectors[0])
    avg = [0 for i in vectors[0]]
    for i,v in enumerate(avg):
        for vec in vectors:
            v+=vec[i]

        avg[i]=v/num

    return avg
            

In [17]:
animals = list(collection.find({"$and":[{"body_weight":{"$gt":100}},{"body_weight":{"$lt":500}}]}))
average = averageVector([a['2d']['normalized']['cosine'] for a in animals])

print(average)

[0.5828045234757206, 0.7902100952908057]


In [18]:
fig = make_subplots(rows=1,cols=1,subplot_titles=["Average Animal"])


otherGraphDataNorm = makeGraphData("cosine",animals,normalized=True)
print(otherGraphDataNorm)

avgGraphData = [{
        "species":"Average",
        "x":average[0],
        "y":average[1]
      }]

normTrace0=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in avgGraphData], x=[d["x"] for d in avgGraphData],y=[d["y"] for d in avgGraphData],mode="markers",name="Average")
normTrace1=go.Scatter(hoverinfo="text",hovertext=[d["species"] for d in otherGraphDataNorm], x=[d["x"] for d in otherGraphDataNorm],y=[d["y"] for d in otherGraphDataNorm],mode="markers",name="Animals")

for trace in [normTrace0,normTrace1]:
    fig.add_trace(trace,row=1,col=1)

fig.show()

[{'species': 'Cow', 'x': 0.7397240889561999, 'y': 0.6729103002762851}, {'species': 'Donkey', 'x': 0.407735203663776, 'y': 0.9131002155805567}, {'species': 'Gorilla', 'x': 0.4542215971428217, 'y': 0.8908887364250512}, {'species': 'Pig', 'x': 0.7295372041400852, 'y': 0.6839411288813299}]


# Using MMR to improve diversity of answers
As explained in [this blog post](https://medium.com/tech-that-works/maximal-marginal-relevance-to-rerank-results-in-unsupervised-keyphrase-extraction-22d95015c7c5)

In [21]:
def mmrQuery(collection,query_vector):
    aggResults = list(collection.aggregate([
        {
            "$search":{
                "index":"default",
                "knnBeta":{
                    "vector":query_vector,
                    "path":"2d.normalized.cosine",
                    "k":50,
                    "filter":{
                        "text":{
                            "query":"animal",
                            "path":"type"
                        }
                    }
                }
            }
        },
        {
            "$project":{
                "_id":1,
                "species":1,
                "embedding":"$2d.normalized.cosine",
                "score":{"$meta":"searchScore"}
            }
        }
    ]))
    candidates = pd.DataFrame(aggResults).set_index('_id')
    embedding_matrix = candidates[["embedding"]].squeeze()
    mmrResults = maximal_marginal_relevance(
        sentence_vector=query_vector,
        documents=candidates,
        embedding_matrix=embedding_matrix,
        lambda_constant=0.1,
        threshold_docs=10
    )
    return({"results":aggResults,"mmrResults":mmrResults})

r = mmrQuery(collection,inputVector)
for i in range(0,10):
    print('Standard result: {}\nMMR ranked: {}'.format(r['results'][i]['species'],r['mmrResults'][i]['species']))

Standard result: Cow
MMR ranked: Cow
Standard result: Pig
MMR ranked: Brachiosaurus
Standard result: African Elephant
MMR ranked: Diplodocus
Standard result: Horse
MMR ranked: Triceratops
Standard result: Giraffe
MMR ranked: Rhesus Monkey
Standard result: Jaguar
MMR ranked: Mole
Standard result: Kangaroo
MMR ranked: Human
Standard result: Asian Elephant
MMR ranked: Mouse
Standard result: Gorilla
MMR ranked: Potar Monkey
Standard result: Donkey
MMR ranked: Chimpanzee


# A simple statistical model for language
Calculate the position of a word in 2 dimensional vector space comprising its average position in a sentence and the average length of the words preceding and suceeding it.

In [None]:
import requests

text = requests.get('https://www.gutenberg.org/cache/epub/71087/pg71087.txt').text

sentences = text.split('.')

wordModel = {}
for s in sentences:
    words = s.split()
    for i,w in enumerate(words):
        w = w.strip("_-?',;:}{][.!@`|=+*‘)(")
        if i > 0:
            precedingLength = len(words[i-1])
        else:
            precedingLength = 0
        
        if i < len(words)-1:
            succeddingLength = len(words[i+1])
        else:
            succeddingLength = 0
        
        if w in wordModel:
            wordModel[w]['vector'][0] = (wordModel[w]['count']*wordModel[w]['vector'][0]+i)/(wordModel[w]['count']+1)
            wordModel[w]['vector'][1] = (wordModel[w]['count']*wordModel[w]['vector'][1]+(succeddingLength+precedingLength)/2)/(wordModel[w]['count']+1)
            wordModel[w]['count'] += 1
        else:
            wordModel[w] = {'count':1,'vector':[float(i)+len(w),float(len(w)+((succeddingLength+precedingLength)/2))]}

graphData = []
for v in wordModel:
    x = wordModel[v]['vector'][0]
    y = wordModel[v]['vector'][1]
    graphData.append(
        {
            "word":v,
            'type':'word',
            '2d':{
                    'raw':{
                        'euclidean':[x,y],
                        'dotproduct':[x,y],
                        'cosine':[x,y]
                    },
                    'normalized':{
                        'euclidean':normalize([x,y]),
                        'dotproduct':normalize([x,y]),
                        'cosine':normalize([x,y])
                    }
                },
            "x":x,
            "y":y
        }
    )

fig = make_subplots(rows=1,cols=1,subplot_titles=["Word Model"])
trace=go.Scatter(hoverinfo="text",hovertext=[d["word"] for d in graphData], x=[d["x"] for d in graphData],y=[d["y"] for d in graphData],mode="markers")
fig.add_trace(trace,row=1,col=1)
fig.show()

collection.delete_many({"type":'word'})
collection.insert_many(graphData)

In [None]:
# Select a random word and find it's nearest neighbours

word = collection.aggregate([{"$sample":{"size":1}}]).next()
vector = [word['x'],word['y']]

neighbours = collection.aggregate([
    {
        "$search":{
            "knnBeta":{
                "vector":vector,
                "path":"2d.raw.cosine",
                "filter":{
                    "text":{
                        "query":"word",
                        "path":"type"
                    }
                },
                "k":200
            }
        }
    },
    {
        "$limit":10
    }
])

print("Nearest neighbours to {}".format(word['word']))
while neighbours._has_next():
    print(neighbours.next()['word'])