# Cross-Lingual Similarity and Semantic Search Engine with Cohere Multilingual API


In [1]:
import cohere
import numpy as np
import altair as alt
from annoy import AnnoyIndex
import pandas as pd
import umap.umap_ as umap
import re

import os

# Read the Cohere API key from the environment so the secret is not stored in the notebook.
# NOTE(review): the key that was previously hard-coded here should be revoked.
API_KEY = os.environ.get("COHERE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the COHERE_API_KEY environment variable to your Cohere API key.")
co = cohere.Client(API_KEY)


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


### Load the datasets


In [55]:
# Read the Hausa/Igbo CSV into a DataFrame and preview its first rows
DATA_PATH = "data/hau_ibo.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,ha_text,ig_text,ha_ig_text
0,1,baka film din banza director allah ya dafa ni ...,ndu chineke nyerem,baka film din banza director allah ya dafa ni ...
1,2,allah ubangiji yaiwa rayuwarsa albarka,np onye eze morningdrivelagos moremus...,allah ubangiji yaiwa rayuwarsa albarka
2,3,abu nafarko gamawa da duniya lafiya kuma inaso...,dal ezigbo mmad,abu nafarko gamawa da duniya lafiya kuma inaso...
3,4,allah yasa muyi kyakkyawan qarshe,good afternoon hugeman jj juventus gave napo...,allah yasa muyi kyakkyawan qarshe
4,5,ba wasa a fuskokinsu may you succeed always guys,my manchi like forever daal b ch nyem,ba wasa a fuskokinsu may you succeed always guys


### Preprocess the datasets


In [58]:
def preprocess(text):
    """Normalize a raw text value.

    The value is coerced to ``str``, lowercased, stripped of ``@mentions``
    and ``http...`` links, and has every whitespace run collapsed to a
    single space.

    Args:
        text: The raw value to clean; non-string values are converted
            with ``str``.

    Returns:
        The cleaned, lowercase text without leading or trailing whitespace.
    """
    # Lowercase first so links written as "HTTP://..." are matched as well
    text = str(text).lower()
    # Drop @mentions and links (each runs up to the next whitespace)
    text = re.sub(r"@[^\s]+", "", text)
    text = re.sub(r"http[^\s]+", "", text)
    # Collapse the whitespace left behind and trim both ends
    return re.sub(r"\s+", " ", text).strip()

# Clean both language columns in place, then preview the first rows
for column in ("ha_text", "ig_text"):
    df[column] = df[column].apply(preprocess)
df.head()


Unnamed: 0,id,ha_text,ig_text,ha_ig_text
0,1,baka film din banza director allah ya dafa ni ...,ndu chineke nyerem,baka film din banza director allah ya dafa ni ...
1,2,allah ubangiji yaiwa rayuwarsa albarka,np onye eze morningdrivelagos moremusicradio w...,allah ubangiji yaiwa rayuwarsa albarka
2,3,abu nafarko gamawa da duniya lafiya kuma inaso...,dal ezigbo mmad,abu nafarko gamawa da duniya lafiya kuma inaso...
3,4,allah yasa muyi kyakkyawan qarshe,good afternoon hugeman jj juventus gave napoli...,allah yasa muyi kyakkyawan qarshe
4,5,ba wasa a fuskokinsu may you succeed always guys,my manchi like forever daal b ch nyem,ba wasa a fuskokinsu may you succeed always guys


In [60]:
# Use the Cohere API to create embeddings for the first 100 sentences of the Hausa/Igbo dataset
response = co.embed(texts=df["ha_ig_text"][:100].to_list(), model="embed-multilingual-v2.0")
# Read the vectors from the response's `embeddings` attribute, as the query cells do,
# instead of relying on the response object being iterable
embeddings = np.array(response.embeddings)

In [62]:
# Display the embeddings array
embeddings

array([[ 0.41731507,  0.04754352,  0.31342632, ..., -0.02138824,
         0.12995528, -0.10288824],
       [ 0.44219264,  0.02415373,  0.23588851, ..., -0.10922535,
         0.16411138,  0.27423725],
       [ 0.35250703,  0.06677664,  0.331565  , ..., -0.33712032,
         0.0125113 , -0.01478072],
       ...,
       [ 0.22549625,  0.08665128,  0.17888549, ..., -0.19686654,
         0.12488741,  0.04754797],
       [ 0.15011856,  0.04350951, -0.13691679, ..., -0.21984527,
         0.01208592, -0.09014519],
       [ 0.09198445, -0.11861288, -0.04043893, ..., -0.01114722,
         0.00923344, -0.02196117]])

In [61]:
# Build an Annoy index over the embeddings so the most similar sentences can be retrieved by vector
search_results = AnnoyIndex(embeddings.shape[1], 'angular')
for i, vector in enumerate(embeddings):
    # The item id is the row position of the embedding
    search_results.add_item(i, vector)
search_results.build(40)
search_results.save('text.ann')

True

### Semantic search in Hausa Language

In [67]:
query = "bai hau ba har yanxu tukun amma yananan zuwa insha allah"

# Embed the query text with the Cohere multilingual model
query_embed = co.embed(texts=[query], model="embed-multilingual-v2.0").embeddings

# Ask the index for the 10 nearest items together with their distances
similar_item_ids = search_results.get_nns_by_vector(
    query_embed[0], 10, include_distances=True
)
neighbor_ids, neighbor_distances = similar_item_ids

# Pair each neighbour's Hausa text with its distance to the query
results = pd.DataFrame(
    data={
        'texts': df.iloc[neighbor_ids]['ha_text'],
        'distance': neighbor_distances,
    }
)

print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'bai hau ba har yanxu tukun amma yananan zuwa insha allah'
Nearest neighbors:


Unnamed: 0,texts,distance
42,koma wanne irin nau i neh allah zai karemu,0.243861
8,wannan gsky ne wallahi munacikin lahaula allah...,0.26354
32,kuma dai ya allah mun mika lamuramu zuwa garek...,0.276408
15,allah yashiryar damu baki dai wlh wasu mutane ...,0.282136
46,allahu akbar allah ka kawo mana karshen wannan...,0.283701
17,allah ya sa mu dace amin,0.287117
20,gamji mazan kwarai namiji uban yan boko allah ...,0.293154
7,toh allah ya sakawa yan gaza,0.309704
28,allah yasa afara asa a allah yayi jagora,0.309827
9,ya subhanallahi allah ya jiqansu ya gafarta mu...,0.311894


### Semantic search in Igbo Language

In [70]:
query = "mangala na ato m gwogwo ezigbo azu"

# Embed the query text with the Cohere multilingual model
query_embed = co.embed(texts=[query], model="embed-multilingual-v2.0").embeddings

# Ask the index for the 10 nearest items together with their distances
similar_item_ids = search_results.get_nns_by_vector(
    query_embed[0], 10, include_distances=True
)
neighbor_ids, neighbor_distances = similar_item_ids

# Pair each neighbour's Igbo text with its distance to the query
results = pd.DataFrame(
    data={
        'texts': df.iloc[neighbor_ids]['ig_text'],
        'distance': neighbor_distances,
    }
)

print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'mangala na ato m gwogwo ezigbo azu'
Nearest neighbors:


Unnamed: 0,texts,distance
98,thank you jesus onye nwe anyi imela goodmornin...,0.257605
92,good morning ndi n anyi bia ka anyi rie cc,0.258559
74,igbo amaka dimkpa anagh a ka nk n egwu ma a gb...,0.263173
82,ezenkwobi ny spare parts na agbani,0.26602
59,haew god ezigbo nwanyi,0.267447
91,odogwu na agha dike na agha nara ekele odogwu ...,0.273286
87,time for our thanksgiving dance and procession...,0.282777
73,chukwu oma onyeoma na chukwu oma onyeoma na ch...,0.285421
54,jehovah idi egwu jehovah idi egwu praise faran...,0.286639
96,ogssian nwanne udo diri gi,0.287594


In [74]:

# Project the embeddings down to 2 dimensions with UMAP so they can be plotted
reducer = umap.UMAP(n_neighbors=20)
umap_embeds = reducer.fit_transform(embeddings)

# Gather the texts and their 2-D coordinates for an interactive Altair scatter plot
df_explore = pd.DataFrame(
    data={
        'text': df['ha_ig_text'][:100],
        'x': umap_embeds[:, 0],
        'y': umap_embeds[:, 1],
    }
)

# Neither axis is forced to include zero
x_axis = alt.X('x', scale=alt.Scale(zero=False))
y_axis = alt.Y('y', scale=alt.Scale(zero=False))

# Scatter plot with the sentence shown as a tooltip
chart = (
    alt.Chart(df_explore)
    .mark_circle(size=60)
    .encode(x=x_axis, y=y_axis, tooltip=['text'])
    .properties(width=700, height=400)
)
chart.interactive()