In [48]:

# Step 2: Import the necessary libraries and define the API key
import os
import re

import altair as alt
import cohere
import numpy as np
import pandas as pd
import umap.umap_ as umap
from annoy import AnnoyIndex

# Read the Cohere API key from the environment so the secret is never stored
# in the notebook. Set COHERE_API_KEY before starting the kernel; a missing
# variable raises KeyError here instead of failing later on the first request.
# NOTE(review): the key that used to be hardcoded on this line has been
# exposed and should be revoked.
API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(API_KEY)


In [49]:
# Step 3: Read the Hausa and Igbo tweet CSVs into DataFrames (Hausa first)
hau_df, ibo_df = (
    pd.read_csv(csv_path) for csv_path in ("data/ha.csv", "data/ibo.csv")
)


In [50]:
# Step 4: Preprocess the datasets
def preprocess(text):
    """Normalise a tweet before it is embedded.

    Strips @mentions and links, collapses runs of whitespace into a single
    space, lowercases the text, and trims leading/trailing whitespace.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    # Remove @mentions: an "@" and everything up to the next whitespace
    text = re.sub(r"@[^\s]+", "", text)
    # Remove links: "http" and everything up to the next whitespace
    text = re.sub(r"http[^\s]+", "", text)
    # Collapse any run of whitespace into a single space
    text = re.sub(r"\s+", " ", text)
    # Lowercase, then trim the space left behind when a mention or link
    # was removed from the start or end of the tweet
    return text.lower().strip()

# Clean the tweet column of both datasets (the column name starts with a space)
for tweets_df in (hau_df, ibo_df):
    tweets_df[" tweet"] = tweets_df[" tweet"].apply(preprocess)


In [51]:
# Step 5: Use the Cohere API to create embeddings for the first 1000 tweets in the Hausa dataset
# NOTE(review): "embed-english-v2.0" is an English model applied to Hausa text;
# a multilingual model may fit better. If it is changed, the query cell must
# use the same model so the vectors stay comparable.
hau_response = co.embed(texts=hau_df[" tweet"][:1000].tolist(), model="embed-english-v2.0")
# Read the vectors from the response's `.embeddings` attribute (as the query
# cell does) instead of relying on the response object being iterable.
hau_embeddings = np.array(hau_response.embeddings)

CohereConnectionError: HTTPSConnectionPool(host='api.cohere.ai', port=443): Read timed out.

In [37]:
# Step 6: Build an Annoy index over the Hausa embeddings for nearest-neighbour (semantic) search
# The index dimensionality is the length of one embedding vector; 'angular' is the distance metric.
search_results = AnnoyIndex(hau_embeddings.shape[1], 'angular')
for item_id, embedding in enumerate(hau_embeddings):
    # Item ids are row positions in hau_embeddings, so they can be mapped
    # back to the tweets the embeddings were computed from
    search_results.add_item(item_id, embedding)
search_results.build(40)  # number of trees in the index
search_results.save('text.ann')

True

In [38]:
query = "Allah yayiwa mahaifiya day yar uwata rasuwa"

# Embed the query with the same model that was used for the indexed tweets
query_embed = co.embed(texts=[query], model="embed-english-v2.0").embeddings

# Ask the index for the 10 closest items; with include_distances=True the
# result is a pair: (item ids, distances)
similar_item_ids = search_results.get_nns_by_vector(
    query_embed[0], 10, include_distances=True
)
neighbor_ids, neighbor_distances = similar_item_ids

# Pair each matching tweet with its distance to the query
neighbor_texts = hau_df[' tweet'].iloc[neighbor_ids]
results = pd.DataFrame(data={'texts': neighbor_texts,
                             'distance': neighbor_distances})


print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'Allah yayiwa mahaifiya day yar uwata rasuwa'
Nearest neighbors:


Unnamed: 0,texts,distance
89,don allah suyi wannan qarin mu wuce gurin tun...,0.688973
70,"uwar me zasu musu, bayan sun fada anqaryata s...",0.719481
58,allah ya tsinewa amnesty albarka. allah ya wu...,0.740768
83,na rasa mahaifiya da yar'uwata a wannan sheka...,0.7458
67,akwai allah aie shiyasa basu damuba koda nige...,0.770275
10,ai shiyasa naga shi kansa dan rainin wayon ya...,0.785281
43,insha allahu baza a bayar ba karawa me karfi ...,0.78721
64,kaji abin haushi toh uwar me mutun zaizo yayi...,0.796501
5,"to ku nemawa talakkawa mafita mana, magamar b...",0.798655
82,ina nillahi ah ah jiya fa akace an kwashe mut...,0.799423


In [89]:

# UMAP projects the embedding vectors down to 2 dimensions that we can plot.
# A fixed random_state keeps the layout reproducible between runs.
reducer = umap.UMAP(n_neighbors=20, random_state=42)
umap_embeds = reducer.fit_transform(hau_embeddings)
# Prepare the data for an interactive Altair visualization.
# Take exactly as many tweets as there are embedded rows: the embeddings were
# computed from the leading rows of hau_df, and a fixed-size slice raises a
# length mismatch when the x/y columns are assigned below.
df_explore = pd.DataFrame(data={'tweet': hau_df[' tweet'][:len(umap_embeds)]})
df_explore['x'] = umap_embeds[:, 0]
df_explore['y'] = umap_embeds[:, 1]
# Plot: one circle per tweet, axes not forced to include zero, tweet text on hover
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=alt.X('x', scale=alt.Scale(zero=False)),
    y=alt.Y('y', scale=alt.Scale(zero=False)),
    tooltip=['tweet']
).properties(
    width=700,
    height=400
)
chart.interactive()