### Data Load

In [1]:
from datasets import load_dataset

dataset = load_dataset("MBZUAI/Bactrian-X", 'sw')
dataset

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 2.26k/2.26k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 13.5k/13.5k [00:00<00:00, 6.61MB/s]
Downloading data: 100%|██████████| 17.0M/17.0M [00:08<00:00, 2.04MB/s]
Generating train split: 67017 examples [00:02, 25073.64 examples/s]


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'id', 'output'],
        num_rows: 67017
    })
})

In [4]:
instructions = dataset['train']['instruction']
print(len(instructions))

67017


In [6]:
import pandas as pd

df = pd.DataFrame()

df['text'] = instructions
df.head()

Unnamed: 0,text
0,Unda tangazo fupi la nafaka mpya ya kiamsha ki...
1,Tathmini ufanisi wa mfumo wa afya wa Marekani.
2,Pendekeza suluhisho la kupunguza taka za plast...
3,"Kwa kuzingatia safu ya nambari, ni jumla gani ..."
4,Tengeneza programu ambayo inaweza kupata palli...


In [7]:
for item in instructions[:5]: # print the first five items
    print(item)

Unda tangazo fupi la nafaka mpya ya kiamsha kinywa.
Tathmini ufanisi wa mfumo wa afya wa Marekani.
Pendekeza suluhisho la kupunguza taka za plastiki katika mazingira.
Kwa kuzingatia safu ya nambari, ni jumla gani ya juu inayofuata ya vitu visivyo karibu?
Tengeneza programu ambayo inaweza kupata pallindrome ndefu zaidi katika mfuatano fulani


### Text Cleaning

In [11]:
def clean(text):
    text = str(text)
    text = text.lower()
    text = text.replace("\s+", " ") # remove extra whitespaces
    text = text.replace("[^a-zA-Z]", " ")
    return text

In [12]:
df['clean_text'] = df['text'].apply(clean)

### Tokenization

#### Tokens

In [15]:
df['tokens'] = df['clean_text'].apply(lambda x: x.split())

In [16]:
df.head(10)

Unnamed: 0,text,clean_text,tokens
0,Unda tangazo fupi la nafaka mpya ya kiamsha ki...,unda tangazo fupi la nafaka mpya ya kiamsha ki...,"[unda, tangazo, fupi, la, nafaka, mpya, ya, ki..."
1,Tathmini ufanisi wa mfumo wa afya wa Marekani.,tathmini ufanisi wa mfumo wa afya wa marekani.,"[tathmini, ufanisi, wa, mfumo, wa, afya, wa, m..."
2,Pendekeza suluhisho la kupunguza taka za plast...,pendekeza suluhisho la kupunguza taka za plast...,"[pendekeza, suluhisho, la, kupunguza, taka, za..."
3,"Kwa kuzingatia safu ya nambari, ni jumla gani ...","kwa kuzingatia safu ya nambari, ni jumla gani ...","[kwa, kuzingatia, safu, ya, nambari,, ni, juml..."
4,Tengeneza programu ambayo inaweza kupata palli...,tengeneza programu ambayo inaweza kupata palli...,"[tengeneza, programu, ambayo, inaweza, kupata,..."
5,Ondoa maneno yote yanayorudiwa kutoka kwa sent...,ondoa maneno yote yanayorudiwa kutoka kwa sent...,"[ondoa, maneno, yote, yanayorudiwa, kutoka, kw..."
6,"Zungusha herufi ya 3 ya ""Kozi"" nafasi mbili kulia","zungusha herufi ya 3 ya ""kozi"" nafasi mbili kulia","[zungusha, herufi, ya, 3, ya, ""kozi"", nafasi, ..."
7,"Kwa kuzingatia orodha ya vitu, angalia ikiwa k...","kwa kuzingatia orodha ya vitu, angalia ikiwa k...","[kwa, kuzingatia, orodha, ya, vitu,, angalia, ..."
8,"Kwa kuzingatia mifuatano miwili ya nambari, un...","kwa kuzingatia mifuatano miwili ya nambari, un...","[kwa, kuzingatia, mifuatano, miwili, ya, namba..."
9,Kwa kuzingatia miduara miwili inayozingatia (1...,kwa kuzingatia miduara miwili inayozingatia (1...,"[kwa, kuzingatia, miduara, miwili, inayozingat..."


In [17]:
for item in df['tokens'].iloc[0]:
    print(item)

unda
tangazo
fupi
la
nafaka
mpya
ya
kiamsha
kinywa.


In [28]:
from tqdm.auto import tqdm

tokens = []

for row in tqdm(df['tokens']):
    for token in row:
        tokens.append(token)


100%|██████████| 67017/67017 [00:00<00:00, 540773.37it/s]


In [31]:
tokens = list(set(tokens))
print(len(tokens))

48320


#### Token Ids

In [33]:
import numpy as np

tokens, token_ids = np.unique(tokens, return_inverse=True)

In [34]:
tokens

array(['"', '"!"].', '""', ..., '☠️', '장보기', '💚'], dtype='<U129')

In [37]:
for i in range(10):
    print(tokens[i])
    print(token_ids[i])
    print('--' * 5)

"
274
----------
"!"].
32915
----------
""
5641
----------
"",
2170
----------
"#mixbuffet".
46242
----------
"$variable1"
39995
----------
"%"
46711
----------
"-"
11641
----------
"-".
45899
----------
"-ing"
35506
----------


#### Key Vals.

In [40]:
print(len(tokens), len(token_ids))

48320 48320


In [44]:
json_file = {}
json_file.update({tokens[1000]: token_ids[1000]})

json_file

{'"kijani"': 36247}

In [43]:
word_to_search = 'algoeithm'

# Create a boolean mask based on whether the word is present in each row of the 'text' column
mask = df['text'].str.contains(word_to_search, case=False)

# Apply the mask to filter the DataFrame
result_df = df[mask]

# Display the result 
print(result_df)

Empty DataFrame
Columns: [text, clean_text, tokens]
Index: []


In [41]:
for i in zip(tokens, token_ids):
    json_file.update({token_ids[i] : tokens[i]})

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices